Skip to content

Commit 02b6c33

Browse files
committed
fix container_oom_events_total always returns 0.
In a Kubernetes pod, if a container is OOM-killed, it will be deleted and a new container will be created. Therefore, the `container_oom_events_total` metric will always be 0. Refactor the collector of oom events, and retain the deleted container oom information for a period of events Signed-off-by: joey <[email protected]>
1 parent f95f133 commit 02b6c33

File tree

10 files changed

+212
-53
lines changed

10 files changed

+212
-53
lines changed

cmd/cadvisor.go

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import (
2525
"runtime"
2626
"strings"
2727
"syscall"
28+
"time"
2829

2930
cadvisorhttp "github.com/google/cadvisor/cmd/internal/http"
3031
"github.com/google/cadvisor/container"
@@ -75,6 +76,8 @@ var perfEvents = flag.String("perf_events_config", "", "Path to a JSON file cont
7576

7677
var resctrlInterval = flag.Duration("resctrl_interval", 0, "Resctrl mon groups updating interval. Zero value disables updating mon groups.")
7778

79+
var oomEventRetainDuration = flag.Duration("oom_event_retain_time", 5*time.Minute, "Duration for which to retain OOM events. Zero value disables retaining OOM events.")
80+
7881
var (
7982
// Metrics to be ignored.
8083
// Tcp metrics are ignored by default.
@@ -133,7 +136,17 @@ func main() {
133136

134137
collectorHTTPClient := createCollectorHTTPClient(*collectorCert, *collectorKey)
135138

136-
resourceManager, err := manager.New(memoryStorage, sysFs, manager.HousekeepingConfigFlags, includedMetrics, &collectorHTTPClient, strings.Split(*rawCgroupPrefixWhiteList, ","), strings.Split(*envMetadataWhiteList, ","), *perfEvents, *resctrlInterval)
139+
containerLabelFunc := metrics.DefaultContainerLabels
140+
if !*storeContainerLabels {
141+
whitelistedLabels := strings.Split(*whitelistedContainerLabels, ",")
142+
// Trim spacing in labels
143+
for i := range whitelistedLabels {
144+
whitelistedLabels[i] = strings.TrimSpace(whitelistedLabels[i])
145+
}
146+
containerLabelFunc = metrics.BaseContainerLabels(whitelistedLabels)
147+
}
148+
149+
resourceManager, err := manager.New(memoryStorage, sysFs, manager.HousekeepingConfigFlags, includedMetrics, &collectorHTTPClient, strings.Split(*rawCgroupPrefixWhiteList, ","), strings.Split(*envMetadataWhiteList, ","), *perfEvents, *resctrlInterval, containerLabelFunc, oomEventRetainDuration)
137150
if err != nil {
138151
klog.Fatalf("Failed to create a manager: %s", err)
139152
}
@@ -153,16 +166,6 @@ func main() {
153166
klog.Fatalf("Failed to register HTTP handlers: %v", err)
154167
}
155168

156-
containerLabelFunc := metrics.DefaultContainerLabels
157-
if !*storeContainerLabels {
158-
whitelistedLabels := strings.Split(*whitelistedContainerLabels, ",")
159-
// Trim spacing in labels
160-
for i := range whitelistedLabels {
161-
whitelistedLabels[i] = strings.TrimSpace(whitelistedLabels[i])
162-
}
163-
containerLabelFunc = metrics.BaseContainerLabels(whitelistedLabels)
164-
}
165-
166169
// Register Prometheus collector to gather information about containers, Go runtime, processes, and machine
167170
cadvisorhttp.RegisterPrometheusHandler(mux, resourceManager, *prometheusEndpoint, containerLabelFunc, includedMetrics)
168171

info/v1/container.go

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -967,8 +967,6 @@ type ContainerStats struct {
967967
Resctrl ResctrlStats `json:"resctrl,omitempty"`
968968

969969
CpuSet CPUSetStats `json:"cpuset,omitempty"`
970-
971-
OOMEvents uint64 `json:"oom_events,omitempty"`
972970
}
973971

974972
func timeEq(t1, t2 time.Time, tolerance time.Duration) bool {

manager/container.go

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@ import (
2727
"strconv"
2828
"strings"
2929
"sync"
30-
"sync/atomic"
3130
"time"
3231

3332
"github.com/google/cadvisor/cache/memory"
@@ -65,7 +64,6 @@ type containerInfo struct {
6564
}
6665

6766
type containerData struct {
68-
oomEvents uint64
6967
handler container.ContainerHandler
7068
info containerInfo
7169
memoryCache *memory.InMemoryCache
@@ -669,8 +667,6 @@ func (cd *containerData) updateStats() error {
669667
}
670668
}
671669

672-
stats.OOMEvents = atomic.LoadUint64(&cd.oomEvents)
673-
674670
var customStatsErr error
675671
cm := cd.collectorManager.(*collector.GenericCollectorManager)
676672
if len(cm.Collectors) > 0 {

manager/manager.go

Lines changed: 77 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ import (
3737
info "github.com/google/cadvisor/info/v1"
3838
v2 "github.com/google/cadvisor/info/v2"
3939
"github.com/google/cadvisor/machine"
40+
"github.com/google/cadvisor/metrics"
4041
"github.com/google/cadvisor/nvm"
4142
"github.com/google/cadvisor/perf"
4243
"github.com/google/cadvisor/resctrl"
@@ -144,6 +145,8 @@ type Manager interface {
144145
AllPodmanContainers(c *info.ContainerInfoRequest) (map[string]info.ContainerInfo, error)
145146

146147
PodmanContainer(containerName string, query *info.ContainerInfoRequest) (info.ContainerInfo, error)
148+
149+
GetOOMInfos() map[string]*oomparser.ContainerOomInfo
147150
}
148151

149152
// Housekeeping configuration for the manager
@@ -153,7 +156,9 @@ type HouskeepingConfig = struct {
153156
}
154157

155158
// New takes a memory storage and returns a new manager.
156-
func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, houskeepingConfig HouskeepingConfig, includedMetricsSet container.MetricSet, collectorHTTPClient *http.Client, rawContainerCgroupPathPrefixWhiteList, containerEnvMetadataWhiteList []string, perfEventsFile string, resctrlInterval time.Duration) (Manager, error) {
159+
func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, houskeepingConfig HouskeepingConfig, includedMetricsSet container.MetricSet,
160+
collectorHTTPClient *http.Client, rawContainerCgroupPathPrefixWhiteList, containerEnvMetadataWhiteList []string,
161+
perfEventsFile string, resctrlInterval time.Duration, f metrics.ContainerLabelsFunc, oomRetainDuration *time.Duration) (Manager, error) {
157162
if memoryCache == nil {
158163
return nil, fmt.Errorf("manager requires memory storage")
159164
}
@@ -208,6 +213,9 @@ func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, houskeepingConfig
208213
collectorHTTPClient: collectorHTTPClient,
209214
rawContainerCgroupPathPrefixWhiteList: rawContainerCgroupPathPrefixWhiteList,
210215
containerEnvMetadataWhiteList: containerEnvMetadataWhiteList,
216+
oomInfos: map[string]*oomparser.ContainerOomInfo{},
217+
containerLabelFunc: f,
218+
oomRetainDuration: oomRetainDuration,
211219
}
212220

213221
machineInfo, err := machine.Info(sysfs, fsInfo, inHostNamespace)
@@ -247,6 +255,7 @@ type namespacedContainerName struct {
247255
}
248256

249257
type manager struct {
258+
oomInfos map[string]*oomparser.ContainerOomInfo
250259
containers map[namespacedContainerName]*containerData
251260
containersLock sync.RWMutex
252261
memoryCache *memory.InMemoryCache
@@ -271,6 +280,8 @@ type manager struct {
271280
rawContainerCgroupPathPrefixWhiteList []string
272281
// List of container env prefix whitelist, the matched container envs would be collected into metrics as extra labels.
273282
containerEnvMetadataWhiteList []string
283+
containerLabelFunc metrics.ContainerLabelsFunc
284+
oomRetainDuration *time.Duration
274285
}
275286

276287
func (m *manager) PodmanContainer(containerName string, query *info.ContainerInfoRequest) (info.ContainerInfo, error) {
@@ -318,7 +329,7 @@ func (m *manager) Start() error {
318329
return err
319330
}
320331
klog.V(2).Infof("Starting recovery of all containers")
321-
err = m.detectSubcontainers("/")
332+
err = m.detectSubContainers("/")
322333
if err != nil {
323334
return err
324335
}
@@ -340,6 +351,7 @@ func (m *manager) Start() error {
340351
quitUpdateMachineInfo := make(chan error)
341352
m.quitChannels = append(m.quitChannels, quitUpdateMachineInfo)
342353
go m.updateMachineInfo(quitUpdateMachineInfo)
354+
go m.cleanUpOomInfos()
343355

344356
return nil
345357
}
@@ -363,6 +375,61 @@ func (m *manager) Stop() error {
363375
return nil
364376
}
365377

378+
func (m *manager) GetOOMInfos() map[string]*oomparser.ContainerOomInfo {
379+
m.containersLock.RLock()
380+
defer m.containersLock.RUnlock()
381+
oomInfos := make(map[string]*oomparser.ContainerOomInfo)
382+
for k, v := range m.oomInfos {
383+
if time.Since(v.TimeOfDeath) > *m.oomRetainDuration {
384+
continue
385+
}
386+
oomInfos[k] = v
387+
}
388+
return oomInfos
389+
}
390+
391+
func (m *manager) cleanUpOomInfos() {
392+
ticker := time.NewTicker(time.Minute)
393+
defer ticker.Stop()
394+
395+
for {
396+
select {
397+
case <-ticker.C:
398+
m.containersLock.Lock()
399+
for k, v := range m.oomInfos {
400+
if time.Since(v.TimeOfDeath) > *m.oomRetainDuration {
401+
delete(m.oomInfos, k)
402+
}
403+
}
404+
m.containersLock.Unlock()
405+
}
406+
}
407+
}
408+
409+
func (m *manager) addOrUpdateOomInfo(cont *containerData, timeOfDeath time.Time) error {
410+
m.containersLock.Lock()
411+
defer m.containersLock.Unlock()
412+
413+
contInfo, err := m.containerDataToContainerInfo(cont, &info.ContainerInfoRequest{
414+
NumStats: 60,
415+
})
416+
if err != nil {
417+
return err
418+
}
419+
if oomInfo, ok := m.oomInfos[contInfo.Id]; ok {
420+
atomic.AddUint64(&oomInfo.OomEvents, 1)
421+
return nil
422+
}
423+
containerLabels := m.containerLabelFunc(contInfo)
424+
newOomInfo := &oomparser.ContainerOomInfo{
425+
MetricLabels: containerLabels,
426+
TimeOfDeath: timeOfDeath,
427+
}
428+
atomic.AddUint64(&newOomInfo.OomEvents, 1)
429+
m.oomInfos[contInfo.Id] = newOomInfo
430+
return nil
431+
}
432+
366433
func (m *manager) destroyCollectors() {
367434
for _, container := range m.containers {
368435
container.perfCollector.Destroy()
@@ -406,7 +473,7 @@ func (m *manager) globalHousekeeping(quit chan error) {
406473
start := time.Now()
407474

408475
// Check for new containers.
409-
err := m.detectSubcontainers("/")
476+
err := m.detectSubContainers("/")
410477
if err != nil {
411478
klog.Errorf("Failed to detect containers: %s", err)
412479
}
@@ -1056,7 +1123,7 @@ func (m *manager) destroyContainerLocked(containerName string) error {
10561123

10571124
// Detect all containers that have been added or deleted from the specified container.
10581125
func (m *manager) getContainersDiff(containerName string) (added []info.ContainerReference, removed []info.ContainerReference, err error) {
1059-
// Get all subcontainers recursively.
1126+
// Get all subContainers recursively.
10601127
m.containersLock.RLock()
10611128
cont, ok := m.containers[namespacedContainerName{
10621129
Name: containerName,
@@ -1103,8 +1170,8 @@ func (m *manager) getContainersDiff(containerName string) (added []info.Containe
11031170
return
11041171
}
11051172

1106-
// Detect the existing subcontainers and reflect the setup here.
1107-
func (m *manager) detectSubcontainers(containerName string) error {
1173+
// Detect the existing subContainers and reflect the setup here.
1174+
func (m *manager) detectSubContainers(containerName string) error {
11081175
added, removed, err := m.getContainersDiff(containerName)
11091176
if err != nil {
11101177
return err
@@ -1147,7 +1214,7 @@ func (m *manager) watchForNewContainers(quit chan error) error {
11471214
}
11481215

11491216
// There is a race between starting the watch and new container creation so we do a detection before we read new containers.
1150-
err := m.detectSubcontainers("/")
1217+
err := m.detectSubContainers("/")
11511218
if err != nil {
11521219
return err
11531220
}
@@ -1247,7 +1314,9 @@ func (m *manager) watchForNewOoms() error {
12471314
continue
12481315
}
12491316
for _, cont := range conts {
1250-
atomic.AddUint64(&cont.oomEvents, 1)
1317+
if err := m.addOrUpdateOomInfo(cont, oomInstance.TimeOfDeath); err != nil {
1318+
klog.Errorf("failed to add OOM info for %q: %v", oomInstance.ContainerName, err)
1319+
}
12511320
}
12521321
}
12531322
}()

manager/manager_test.go

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,9 @@ import (
3030
info "github.com/google/cadvisor/info/v1"
3131
itest "github.com/google/cadvisor/info/v1/test"
3232
v2 "github.com/google/cadvisor/info/v2"
33+
"github.com/google/cadvisor/metrics"
34+
"github.com/google/cadvisor/utils/oomparser"
3335
"github.com/google/cadvisor/utils/sysfs/fakesysfs"
34-
3536
"github.com/stretchr/testify/assert"
3637
clock "k8s.io/utils/clock/testing"
3738

@@ -531,3 +532,50 @@ func TestDockerContainersInfo(t *testing.T) {
531532
t.Errorf("expected error %q but received %q", expectedError, err)
532533
}
533534
}
535+
536+
func Test_addOrUpdateOomInfo(t *testing.T) {
537+
m := manager{
538+
oomInfos: map[string]*oomparser.ContainerOomInfo{},
539+
memoryCache: memory.New(2*time.Minute, nil),
540+
containerLabelFunc: metrics.DefaultContainerLabels,
541+
}
542+
contReference := info.ContainerReference{
543+
Name: "testcontainer",
544+
Id: "testcontainer1",
545+
}
546+
contInfo := &info.ContainerInfo{
547+
ContainerReference: contReference,
548+
}
549+
spec := info.ContainerSpec{}
550+
mockHandler := containertest.NewMockContainerHandler("testcontainer")
551+
mockHandler.On("GetSpec").Return(
552+
spec,
553+
nil,
554+
).Times(2)
555+
mockHandler.On("ListContainers", container.ListSelf).Return(
556+
[]info.ContainerReference{contReference},
557+
nil,
558+
).Times(2)
559+
collectorManager, _ := collector.NewCollectorManager()
560+
cont := &containerData{
561+
info: containerInfo{
562+
ContainerReference: contReference,
563+
},
564+
handler: mockHandler,
565+
collectorManager: collectorManager,
566+
infoLastUpdatedTime: time.Now(),
567+
clock: clock.NewFakeClock(time.Now()),
568+
}
569+
m.memoryCache.AddStats(contInfo, &info.ContainerStats{
570+
Timestamp: time.Now(),
571+
})
572+
573+
err := m.addOrUpdateOomInfo(cont, time.Now())
574+
assert.NoError(t, err)
575+
assert.Equal(t, 1, len(m.oomInfos))
576+
assert.Equal(t, uint64(1), m.oomInfos["testcontainer1"].OomEvents)
577+
578+
// update oom info, increase oom event counts
579+
assert.NoError(t, m.addOrUpdateOomInfo(cont, time.Now()))
580+
assert.Equal(t, uint64(2), m.oomInfos["testcontainer1"].OomEvents)
581+
}

metrics/metrics.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ import (
1919

2020
info "github.com/google/cadvisor/info/v1"
2121
v2 "github.com/google/cadvisor/info/v2"
22+
"github.com/google/cadvisor/utils/oomparser"
2223
)
2324

2425
// metricValue describes a single metric value for a given set of label values
@@ -39,4 +40,6 @@ type infoProvider interface {
3940
GetVersionInfo() (*info.VersionInfo, error)
4041
// GetMachineInfo provides information about the machine.
4142
GetMachineInfo() (*info.MachineInfo, error)
43+
// GetOOMInfos provides information about OOMs.
44+
GetOOMInfos() map[string]*oomparser.ContainerOomInfo
4245
}

0 commit comments

Comments
 (0)