Fix container_oom_events_total always returning 0 #3278

Open · wants to merge 1 commit into master
25 changes: 14 additions & 11 deletions cmd/cadvisor.go
@@ -25,6 +25,7 @@ import (
"runtime"
"strings"
"syscall"
"time"

cadvisorhttp "github.com/google/cadvisor/cmd/internal/http"
"github.com/google/cadvisor/container"
@@ -75,6 +76,8 @@ var perfEvents = flag.String("perf_events_config", "", "Path to a JSON file cont

var resctrlInterval = flag.Duration("resctrl_interval", 0, "Resctrl mon groups updating interval. Zero value disables updating mon groups.")

var oomEventRetainDuration = flag.Duration("oom_event_retain_time", 5*time.Minute, "Duration for which to retain OOM events. Zero value disables retaining OOM events.")

var (
// Metrics to be ignored.
// Tcp metrics are ignored by default.
@@ -132,7 +135,17 @@ func main() {

collectorHTTPClient := createCollectorHTTPClient(*collectorCert, *collectorKey)

resourceManager, err := manager.New(memoryStorage, sysFs, manager.HousekeepingConfigFlags, includedMetrics, &collectorHTTPClient, strings.Split(*rawCgroupPrefixWhiteList, ","), strings.Split(*envMetadataWhiteList, ","), *perfEvents, *resctrlInterval)
containerLabelFunc := metrics.DefaultContainerLabels
if !*storeContainerLabels {
whitelistedLabels := strings.Split(*whitelistedContainerLabels, ",")
// Trim spacing in labels
for i := range whitelistedLabels {
whitelistedLabels[i] = strings.TrimSpace(whitelistedLabels[i])
}
containerLabelFunc = metrics.BaseContainerLabels(whitelistedLabels)
}

resourceManager, err := manager.New(memoryStorage, sysFs, manager.HousekeepingConfigFlags, includedMetrics, &collectorHTTPClient, strings.Split(*rawCgroupPrefixWhiteList, ","), strings.Split(*envMetadataWhiteList, ","), *perfEvents, *resctrlInterval, containerLabelFunc, oomEventRetainDuration)
if err != nil {
klog.Fatalf("Failed to create a manager: %s", err)
}
@@ -152,16 +165,6 @@ func main() {
klog.Fatalf("Failed to register HTTP handlers: %v", err)
}

containerLabelFunc := metrics.DefaultContainerLabels
if !*storeContainerLabels {
whitelistedLabels := strings.Split(*whitelistedContainerLabels, ",")
// Trim spacing in labels
for i := range whitelistedLabels {
whitelistedLabels[i] = strings.TrimSpace(whitelistedLabels[i])
}
containerLabelFunc = metrics.BaseContainerLabels(whitelistedLabels)
}

// Register Prometheus collector to gather information about containers, Go runtime, processes, and machine
cadvisorhttp.RegisterPrometheusHandler(mux, resourceManager, *prometheusEndpoint, containerLabelFunc, includedMetrics)

2 changes: 0 additions & 2 deletions info/v1/container.go
@@ -967,8 +967,6 @@ type ContainerStats struct {
Resctrl ResctrlStats `json:"resctrl,omitempty"`

CpuSet CPUSetStats `json:"cpuset,omitempty"`

OOMEvents uint64 `json:"oom_events,omitempty"`
}

func timeEq(t1, t2 time.Time, tolerance time.Duration) bool {
4 changes: 0 additions & 4 deletions manager/container.go
@@ -27,7 +27,6 @@ import (
"strconv"
"strings"
"sync"
"sync/atomic"
"time"

"github.com/google/cadvisor/cache/memory"
@@ -65,7 +64,6 @@ type containerInfo struct {
}

type containerData struct {
oomEvents uint64
handler container.ContainerHandler
info containerInfo
memoryCache *memory.InMemoryCache
@@ -669,8 +667,6 @@ func (cd *containerData) updateStats() error {
}
}

stats.OOMEvents = atomic.LoadUint64(&cd.oomEvents)

var customStatsErr error
cm := cd.collectorManager.(*collector.GenericCollectorManager)
if len(cm.Collectors) > 0 {
85 changes: 77 additions & 8 deletions manager/manager.go
@@ -37,6 +37,7 @@ import (
info "github.com/google/cadvisor/info/v1"
v2 "github.com/google/cadvisor/info/v2"
"github.com/google/cadvisor/machine"
"github.com/google/cadvisor/metrics"
"github.com/google/cadvisor/nvm"
"github.com/google/cadvisor/perf"
"github.com/google/cadvisor/resctrl"
@@ -144,6 +145,8 @@ type Manager interface {
AllPodmanContainers(c *info.ContainerInfoRequest) (map[string]info.ContainerInfo, error)

PodmanContainer(containerName string, query *info.ContainerInfoRequest) (info.ContainerInfo, error)

GetOOMInfos() map[string]*oomparser.ContainerOomInfo
}

// Housekeeping configuration for the manager
@@ -153,7 +156,9 @@ type HousekeepingConfig = struct {
}

// New takes a memory storage and returns a new manager.
func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, HousekeepingConfig HousekeepingConfig, includedMetricsSet container.MetricSet, collectorHTTPClient *http.Client, rawContainerCgroupPathPrefixWhiteList, containerEnvMetadataWhiteList []string, perfEventsFile string, resctrlInterval time.Duration) (Manager, error) {
func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, HousekeepingConfig HousekeepingConfig, includedMetricsSet container.MetricSet,
collectorHTTPClient *http.Client, rawContainerCgroupPathPrefixWhiteList, containerEnvMetadataWhiteList []string,
perfEventsFile string, resctrlInterval time.Duration, f metrics.ContainerLabelsFunc, oomRetainDuration *time.Duration) (Manager, error) {
if memoryCache == nil {
return nil, fmt.Errorf("manager requires memory storage")
}
@@ -208,6 +213,9 @@ func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, HousekeepingConfi
collectorHTTPClient: collectorHTTPClient,
rawContainerCgroupPathPrefixWhiteList: rawContainerCgroupPathPrefixWhiteList,
containerEnvMetadataWhiteList: containerEnvMetadataWhiteList,
oomInfos: map[string]*oomparser.ContainerOomInfo{},
containerLabelFunc: f,
oomRetainDuration: oomRetainDuration,
}

machineInfo, err := machine.Info(sysfs, fsInfo, inHostNamespace)
@@ -247,6 +255,7 @@ type namespacedContainerName struct {
}

type manager struct {
oomInfos map[string]*oomparser.ContainerOomInfo
containers map[namespacedContainerName]*containerData
containersLock sync.RWMutex
memoryCache *memory.InMemoryCache
@@ -271,6 +280,8 @@ type manager struct {
rawContainerCgroupPathPrefixWhiteList []string
// List of container env prefix whitelist, the matched container envs would be collected into metrics as extra labels.
containerEnvMetadataWhiteList []string
containerLabelFunc metrics.ContainerLabelsFunc
oomRetainDuration *time.Duration
}

func (m *manager) PodmanContainer(containerName string, query *info.ContainerInfoRequest) (info.ContainerInfo, error) {
@@ -318,7 +329,7 @@ func (m *manager) Start() error {
return err
}
klog.V(2).Infof("Starting recovery of all containers")
err = m.detectSubcontainers("/")
err = m.detectSubContainers("/")
if err != nil {
return err
}
@@ -340,6 +351,7 @@ func (m *manager) Start() error {
quitUpdateMachineInfo := make(chan error)
m.quitChannels = append(m.quitChannels, quitUpdateMachineInfo)
go m.updateMachineInfo(quitUpdateMachineInfo)
go m.cleanUpOomInfos()

return nil
}
@@ -363,6 +375,61 @@ func (m *manager) Stop() error {
return nil
}

func (m *manager) GetOOMInfos() map[string]*oomparser.ContainerOomInfo {
m.containersLock.RLock()
defer m.containersLock.RUnlock()
oomInfos := make(map[string]*oomparser.ContainerOomInfo)
for k, v := range m.oomInfos {
if time.Since(v.TimeOfDeath) > *m.oomRetainDuration {
continue
}
oomInfos[k] = v
}
return oomInfos
}

func (m *manager) cleanUpOomInfos() {
ticker := time.NewTicker(time.Minute)
defer ticker.Stop()

for {
select {
case <-ticker.C:
m.containersLock.Lock()
for k, v := range m.oomInfos {
if time.Since(v.TimeOfDeath) > *m.oomRetainDuration {
delete(m.oomInfos, k)
}
}
m.containersLock.Unlock()
}
}
}

func (m *manager) addOrUpdateOomInfo(cont *containerData, timeOfDeath time.Time) error {
m.containersLock.Lock()
defer m.containersLock.Unlock()

contInfo, err := m.containerDataToContainerInfo(cont, &info.ContainerInfoRequest{
NumStats: 60,
})
if err != nil {
return err
}
if oomInfo, ok := m.oomInfos[contInfo.Id]; ok {
atomic.AddUint64(&oomInfo.OomEvents, 1)
return nil
}
containerLabels := m.containerLabelFunc(contInfo)
newOomInfo := &oomparser.ContainerOomInfo{
MetricLabels: containerLabels,
TimeOfDeath: timeOfDeath,
}
atomic.AddUint64(&newOomInfo.OomEvents, 1)
m.oomInfos[contInfo.Id] = newOomInfo
return nil
}

func (m *manager) destroyCollectors() {
for _, container := range m.containers {
container.perfCollector.Destroy()
@@ -406,7 +473,7 @@ func (m *manager) globalHousekeeping(quit chan error) {
start := time.Now()

// Check for new containers.
err := m.detectSubcontainers("/")
err := m.detectSubContainers("/")
if err != nil {
klog.Errorf("Failed to detect containers: %s", err)
}
@@ -1056,7 +1123,7 @@ func (m *manager) destroyContainerLocked(containerName string) error {

// Detect all containers that have been added or deleted from the specified container.
func (m *manager) getContainersDiff(containerName string) (added []info.ContainerReference, removed []info.ContainerReference, err error) {
// Get all subcontainers recursively.
// Get all subContainers recursively.
m.containersLock.RLock()
cont, ok := m.containers[namespacedContainerName{
Name: containerName,
@@ -1103,8 +1170,8 @@ func (m *manager) getContainersDiff(containerName string) (added []info.Containe
return
}

// Detect the existing subcontainers and reflect the setup here.
func (m *manager) detectSubcontainers(containerName string) error {
// Detect the existing subContainers and reflect the setup here.
func (m *manager) detectSubContainers(containerName string) error {
added, removed, err := m.getContainersDiff(containerName)
if err != nil {
return err
@@ -1147,7 +1214,7 @@ func (m *manager) watchForNewContainers(quit chan error) error {
}

// There is a race between starting the watch and new container creation so we do a detection before we read new containers.
err := m.detectSubcontainers("/")
err := m.detectSubContainers("/")
if err != nil {
return err
}
@@ -1247,7 +1314,9 @@ func (m *manager) watchForNewOoms() error {
continue
}
for _, cont := range conts {
atomic.AddUint64(&cont.oomEvents, 1)
if err := m.addOrUpdateOomInfo(cont, oomInstance.TimeOfDeath); err != nil {
klog.Errorf("failed to add OOM info for %q: %v", oomInstance.ContainerName, err)
}
Comment on lines +1317 to +1319
@dgrisonnet commented on Apr 24, 2025:

This is dependent on the existence of the container:

cadvisor/manager/manager.go, lines 1307 to 1311 in 0b6dfeb:

conts, err := m.getRequestedContainers(oomInstance.ContainerName, request)
if err != nil {
klog.V(2).Infof("failed getting container info for %q: %v", oomInstance.ContainerName, err)
continue
}

If we want to address the current dependency issue of the metric on the container lifecycle, I think we shouldn't rely on the containers seen by the manager at all and rather just produce metrics based on the info that cadvisor parses via the oom parser.

Shouldn't it be possible to track the necessary information before containers are killed, so that it is available independently of the container lifecycle?

@dgrisonnet commented on Apr 24, 2025:

It is possible, but that would require caching all the container data.
For a separate reason, there was an attempt to create a cache in #2974, but I don't know whether any of the maintainers would be willing to pull the trigger on such a big change. At least it wasn't conceivable in the other PR due to the memory implications.
Also, Kubernetes has been trying to move away from cadvisor in favor of container-runtime metrics, which doesn't speak in favor of such an investment in cadvisor.

It is possible, but that would require caching all the container data.

True, but not all container data, just the bare minimum needed to produce useful metrics.
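
As a rough illustration of that idea (a hypothetical sketch, not code from this PR, and all names here are made up), such a bare-minimum cache could be little more than a map of metric labels keyed by container ID, populated while the container is alive and consulted after it is gone:

// Hypothetical sketch only - not part of this PR. A bare-minimum cache that
// retains just the metric labels per container ID, so OOM metrics can still be
// produced after the container itself has been destroyed.
package oomcache

import "sync"

type LabelCache struct {
	mu     sync.RWMutex
	labels map[string]map[string]string // container ID -> metric labels
}

func NewLabelCache() *LabelCache {
	return &LabelCache{labels: map[string]map[string]string{}}
}

// Remember records the labels while the container is still alive.
func (c *LabelCache) Remember(id string, labels map[string]string) {
	c.mu.Lock()
	defer c.mu.Unlock()
	c.labels[id] = labels
}

// Lookup returns the cached labels even if the container no longer exists.
func (c *LabelCache) Lookup(id string) (map[string]string, bool) {
	c.mu.RLock()
	defer c.mu.RUnlock()
	l, ok := c.labels[id]
	return l, ok
}

This PR takes a lighter route than a general cache: it keeps only a per-container oomparser.ContainerOomInfo (metric labels, event count, time of death) in manager.oomInfos, pruned after oom_event_retain_time.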

}
}
}()
50 changes: 49 additions & 1 deletion manager/manager_test.go
@@ -30,8 +30,9 @@ import (
info "github.com/google/cadvisor/info/v1"
itest "github.com/google/cadvisor/info/v1/test"
v2 "github.com/google/cadvisor/info/v2"
"github.com/google/cadvisor/metrics"
"github.com/google/cadvisor/utils/oomparser"
"github.com/google/cadvisor/utils/sysfs/fakesysfs"

"github.com/stretchr/testify/assert"
clock "k8s.io/utils/clock/testing"

@@ -531,3 +532,50 @@ func TestDockerContainersInfo(t *testing.T) {
t.Errorf("expected error %q but received %q", expectedError, err)
}
}

func Test_addOrUpdateOomInfo(t *testing.T) {
m := manager{
oomInfos: map[string]*oomparser.ContainerOomInfo{},
memoryCache: memory.New(2*time.Minute, nil),
containerLabelFunc: metrics.DefaultContainerLabels,
}
contReference := info.ContainerReference{
Name: "testcontainer",
Id: "testcontainer1",
}
contInfo := &info.ContainerInfo{
ContainerReference: contReference,
}
spec := info.ContainerSpec{}
mockHandler := containertest.NewMockContainerHandler("testcontainer")
mockHandler.On("GetSpec").Return(
spec,
nil,
).Times(2)
mockHandler.On("ListContainers", container.ListSelf).Return(
[]info.ContainerReference{contReference},
nil,
).Times(2)
collectorManager, _ := collector.NewCollectorManager()
cont := &containerData{
info: containerInfo{
ContainerReference: contReference,
},
handler: mockHandler,
collectorManager: collectorManager,
infoLastUpdatedTime: time.Now(),
clock: clock.NewFakeClock(time.Now()),
}
m.memoryCache.AddStats(contInfo, &info.ContainerStats{
Timestamp: time.Now(),
})

err := m.addOrUpdateOomInfo(cont, time.Now())
assert.NoError(t, err)
assert.Equal(t, 1, len(m.oomInfos))
assert.Equal(t, uint64(1), m.oomInfos["testcontainer1"].OomEvents)

// update oom info, increase oom event counts
assert.NoError(t, m.addOrUpdateOomInfo(cont, time.Now()))
assert.Equal(t, uint64(2), m.oomInfos["testcontainer1"].OomEvents)
}
3 changes: 3 additions & 0 deletions metrics/metrics.go
@@ -19,6 +19,7 @@ import (

info "github.com/google/cadvisor/info/v1"
v2 "github.com/google/cadvisor/info/v2"
"github.com/google/cadvisor/utils/oomparser"
)

// metricValue describes a single metric value for a given set of label values
@@ -39,4 +40,6 @@ type infoProvider interface {
GetVersionInfo() (*info.VersionInfo, error)
// GetMachineInfo provides information about the machine.
GetMachineInfo() (*info.MachineInfo, error)
// GetOOMInfos provides information about OOMs.
GetOOMInfos() map[string]*oomparser.ContainerOomInfo
}
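
The GetOOMInfos addition to infoProvider is what lets the Prometheus collector keep exporting container_oom_events_total after a container is destroyed. The collector change itself is not part of the hunks shown above, so the following is only a sketch of how such a collector might consume the method; the function name and wiring are assumptions, and it presumes the infoProvider interface above plus imports of sync/atomic and github.com/prometheus/client_golang/prometheus.

// Sketch only - not the PR's actual prometheus.go change. Emits
// container_oom_events_total from the retained OOM infos, so the metric no
// longer depends on the container still being known to the manager.
func collectOOMEvents(provider infoProvider, ch chan<- prometheus.Metric) {
	for _, oomInfo := range provider.GetOOMInfos() {
		labelNames := make([]string, 0, len(oomInfo.MetricLabels))
		labelValues := make([]string, 0, len(oomInfo.MetricLabels))
		for name, value := range oomInfo.MetricLabels {
			labelNames = append(labelNames, name)
			labelValues = append(labelValues, value)
		}
		desc := prometheus.NewDesc("container_oom_events_total",
			"Count of OOM events for the container.", labelNames, nil)
		ch <- prometheus.MustNewConstMetric(desc, prometheus.CounterValue,
			float64(atomic.LoadUint64(&oomInfo.OomEvents)), labelValues...)
	}
}

Because GetOOMInfos already filters out entries older than oom_event_retain_time and cleanUpOomInfos prunes them once a minute, the metric survives the container's deletion for at most that retention window.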