diff --git a/cmd/cadvisor.go b/cmd/cadvisor.go index fc34d967bd..a9c68bc596 100644 --- a/cmd/cadvisor.go +++ b/cmd/cadvisor.go @@ -25,6 +25,7 @@ import ( "runtime" "strings" "syscall" + "time" cadvisorhttp "github.com/google/cadvisor/cmd/internal/http" "github.com/google/cadvisor/container" @@ -75,6 +76,8 @@ var perfEvents = flag.String("perf_events_config", "", "Path to a JSON file cont var resctrlInterval = flag.Duration("resctrl_interval", 0, "Resctrl mon groups updating interval. Zero value disables updating mon groups.") +var oomEventRetainDuration = flag.Duration("oom_event_retain_time", 5*time.Minute, "Duration for which to retain OOM events. Zero value disables retaining OOM events.") + var ( // Metrics to be ignored. // Tcp metrics are ignored by default. @@ -132,7 +135,17 @@ func main() { collectorHTTPClient := createCollectorHTTPClient(*collectorCert, *collectorKey) - resourceManager, err := manager.New(memoryStorage, sysFs, manager.HousekeepingConfigFlags, includedMetrics, &collectorHTTPClient, strings.Split(*rawCgroupPrefixWhiteList, ","), strings.Split(*envMetadataWhiteList, ","), *perfEvents, *resctrlInterval) + containerLabelFunc := metrics.DefaultContainerLabels + if !*storeContainerLabels { + whitelistedLabels := strings.Split(*whitelistedContainerLabels, ",") + // Trim spacing in labels + for i := range whitelistedLabels { + whitelistedLabels[i] = strings.TrimSpace(whitelistedLabels[i]) + } + containerLabelFunc = metrics.BaseContainerLabels(whitelistedLabels) + } + + resourceManager, err := manager.New(memoryStorage, sysFs, manager.HousekeepingConfigFlags, includedMetrics, &collectorHTTPClient, strings.Split(*rawCgroupPrefixWhiteList, ","), strings.Split(*envMetadataWhiteList, ","), *perfEvents, *resctrlInterval, containerLabelFunc, oomEventRetainDuration) if err != nil { klog.Fatalf("Failed to create a manager: %s", err) } @@ -152,16 +165,6 @@ func main() { klog.Fatalf("Failed to register HTTP handlers: %v", err) } - containerLabelFunc := metrics.DefaultContainerLabels - if !*storeContainerLabels { - whitelistedLabels := strings.Split(*whitelistedContainerLabels, ",") - // Trim spacing in labels - for i := range whitelistedLabels { - whitelistedLabels[i] = strings.TrimSpace(whitelistedLabels[i]) - } - containerLabelFunc = metrics.BaseContainerLabels(whitelistedLabels) - } - // Register Prometheus collector to gather information about containers, Go runtime, processes, and machine cadvisorhttp.RegisterPrometheusHandler(mux, resourceManager, *prometheusEndpoint, containerLabelFunc, includedMetrics) diff --git a/info/v1/container.go b/info/v1/container.go index efcfd5628e..2f74b05f28 100644 --- a/info/v1/container.go +++ b/info/v1/container.go @@ -967,8 +967,6 @@ type ContainerStats struct { Resctrl ResctrlStats `json:"resctrl,omitempty"` CpuSet CPUSetStats `json:"cpuset,omitempty"` - - OOMEvents uint64 `json:"oom_events,omitempty"` } func timeEq(t1, t2 time.Time, tolerance time.Duration) bool { diff --git a/manager/container.go b/manager/container.go index cf868953c9..291725daf3 100644 --- a/manager/container.go +++ b/manager/container.go @@ -27,7 +27,6 @@ import ( "strconv" "strings" "sync" - "sync/atomic" "time" "github.com/google/cadvisor/cache/memory" @@ -65,7 +64,6 @@ type containerInfo struct { } type containerData struct { - oomEvents uint64 handler container.ContainerHandler info containerInfo memoryCache *memory.InMemoryCache @@ -669,8 +667,6 @@ func (cd *containerData) updateStats() error { } } - stats.OOMEvents = atomic.LoadUint64(&cd.oomEvents) - var customStatsErr error cm := cd.collectorManager.(*collector.GenericCollectorManager) if len(cm.Collectors) > 0 { diff --git a/manager/manager.go b/manager/manager.go index 4f88e603fe..c324c571ed 100644 --- a/manager/manager.go +++ b/manager/manager.go @@ -37,6 +37,7 @@ import ( info "github.com/google/cadvisor/info/v1" v2 "github.com/google/cadvisor/info/v2" "github.com/google/cadvisor/machine" + "github.com/google/cadvisor/metrics" "github.com/google/cadvisor/nvm" "github.com/google/cadvisor/perf" "github.com/google/cadvisor/resctrl" @@ -144,6 +145,8 @@ type Manager interface { AllPodmanContainers(c *info.ContainerInfoRequest) (map[string]info.ContainerInfo, error) PodmanContainer(containerName string, query *info.ContainerInfoRequest) (info.ContainerInfo, error) + + GetOOMInfos() map[string]*oomparser.ContainerOomInfo } // Housekeeping configuration for the manager @@ -153,7 +156,9 @@ type HousekeepingConfig = struct { } // New takes a memory storage and returns a new manager. -func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, HousekeepingConfig HousekeepingConfig, includedMetricsSet container.MetricSet, collectorHTTPClient *http.Client, rawContainerCgroupPathPrefixWhiteList, containerEnvMetadataWhiteList []string, perfEventsFile string, resctrlInterval time.Duration) (Manager, error) { +func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, HousekeepingConfig HousekeepingConfig, includedMetricsSet container.MetricSet, + collectorHTTPClient *http.Client, rawContainerCgroupPathPrefixWhiteList, containerEnvMetadataWhiteList []string, + perfEventsFile string, resctrlInterval time.Duration, f metrics.ContainerLabelsFunc, oomRetainDuration *time.Duration) (Manager, error) { if memoryCache == nil { return nil, fmt.Errorf("manager requires memory storage") } @@ -208,6 +213,9 @@ func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, HousekeepingConfi collectorHTTPClient: collectorHTTPClient, rawContainerCgroupPathPrefixWhiteList: rawContainerCgroupPathPrefixWhiteList, containerEnvMetadataWhiteList: containerEnvMetadataWhiteList, + oomInfos: map[string]*oomparser.ContainerOomInfo{}, + containerLabelFunc: f, + oomRetainDuration: oomRetainDuration, } machineInfo, err := machine.Info(sysfs, fsInfo, inHostNamespace) @@ -247,6 +255,7 @@ type namespacedContainerName struct { } type manager struct { + oomInfos map[string]*oomparser.ContainerOomInfo containers map[namespacedContainerName]*containerData containersLock sync.RWMutex memoryCache *memory.InMemoryCache @@ -271,6 +280,8 @@ type manager struct { rawContainerCgroupPathPrefixWhiteList []string // List of container env prefix whitelist, the matched container envs would be collected into metrics as extra labels. containerEnvMetadataWhiteList []string + containerLabelFunc metrics.ContainerLabelsFunc + oomRetainDuration *time.Duration } func (m *manager) PodmanContainer(containerName string, query *info.ContainerInfoRequest) (info.ContainerInfo, error) { @@ -318,7 +329,7 @@ func (m *manager) Start() error { return err } klog.V(2).Infof("Starting recovery of all containers") - err = m.detectSubcontainers("/") + err = m.detectSubContainers("/") if err != nil { return err } @@ -340,6 +351,7 @@ func (m *manager) Start() error { quitUpdateMachineInfo := make(chan error) m.quitChannels = append(m.quitChannels, quitUpdateMachineInfo) go m.updateMachineInfo(quitUpdateMachineInfo) + go m.cleanUpOomInfos() return nil } @@ -363,6 +375,61 @@ func (m *manager) Stop() error { return nil } +func (m *manager) GetOOMInfos() map[string]*oomparser.ContainerOomInfo { + m.containersLock.RLock() + defer m.containersLock.RUnlock() + oomInfos := make(map[string]*oomparser.ContainerOomInfo) + for k, v := range m.oomInfos { + if time.Since(v.TimeOfDeath) > *m.oomRetainDuration { + continue + } + oomInfos[k] = v + } + return oomInfos +} + +func (m *manager) cleanUpOomInfos() { + ticker := time.NewTicker(time.Minute) + defer ticker.Stop() + + for { + select { + case <-ticker.C: + m.containersLock.Lock() + for k, v := range m.oomInfos { + if time.Since(v.TimeOfDeath) > *m.oomRetainDuration { + delete(m.oomInfos, k) + } + } + m.containersLock.Unlock() + } + } +} + +func (m *manager) addOrUpdateOomInfo(cont *containerData, timeOfDeath time.Time) error { + m.containersLock.Lock() + defer m.containersLock.Unlock() + + contInfo, err := m.containerDataToContainerInfo(cont, &info.ContainerInfoRequest{ + NumStats: 60, + }) + if err != nil { + return err + } + if oomInfo, ok := m.oomInfos[contInfo.Id]; ok { + atomic.AddUint64(&oomInfo.OomEvents, 1) + return nil + } + containerLabels := m.containerLabelFunc(contInfo) + newOomInfo := &oomparser.ContainerOomInfo{ + MetricLabels: containerLabels, + TimeOfDeath: timeOfDeath, + } + atomic.AddUint64(&newOomInfo.OomEvents, 1) + m.oomInfos[contInfo.Id] = newOomInfo + return nil +} + func (m *manager) destroyCollectors() { for _, container := range m.containers { container.perfCollector.Destroy() @@ -406,7 +473,7 @@ func (m *manager) globalHousekeeping(quit chan error) { start := time.Now() // Check for new containers. - err := m.detectSubcontainers("/") + err := m.detectSubContainers("/") if err != nil { klog.Errorf("Failed to detect containers: %s", err) } @@ -1056,7 +1123,7 @@ func (m *manager) destroyContainerLocked(containerName string) error { // Detect all containers that have been added or deleted from the specified container. func (m *manager) getContainersDiff(containerName string) (added []info.ContainerReference, removed []info.ContainerReference, err error) { - // Get all subcontainers recursively. + // Get all subContainers recursively. m.containersLock.RLock() cont, ok := m.containers[namespacedContainerName{ Name: containerName, @@ -1103,8 +1170,8 @@ func (m *manager) getContainersDiff(containerName string) (added []info.Containe return } -// Detect the existing subcontainers and reflect the setup here. -func (m *manager) detectSubcontainers(containerName string) error { +// Detect the existing subContainers and reflect the setup here. +func (m *manager) detectSubContainers(containerName string) error { added, removed, err := m.getContainersDiff(containerName) if err != nil { return err @@ -1147,7 +1214,7 @@ func (m *manager) watchForNewContainers(quit chan error) error { } // There is a race between starting the watch and new container creation so we do a detection before we read new containers. - err := m.detectSubcontainers("/") + err := m.detectSubContainers("/") if err != nil { return err } @@ -1247,7 +1314,9 @@ func (m *manager) watchForNewOoms() error { continue } for _, cont := range conts { - atomic.AddUint64(&cont.oomEvents, 1) + if err := m.addOrUpdateOomInfo(cont, oomInstance.TimeOfDeath); err != nil { + klog.Errorf("failed to add OOM info for %q: %v", oomInstance.ContainerName, err) + } } } }() diff --git a/manager/manager_test.go b/manager/manager_test.go index 6e6312a69f..c15a940c9e 100644 --- a/manager/manager_test.go +++ b/manager/manager_test.go @@ -30,8 +30,9 @@ import ( info "github.com/google/cadvisor/info/v1" itest "github.com/google/cadvisor/info/v1/test" v2 "github.com/google/cadvisor/info/v2" + "github.com/google/cadvisor/metrics" + "github.com/google/cadvisor/utils/oomparser" "github.com/google/cadvisor/utils/sysfs/fakesysfs" - "github.com/stretchr/testify/assert" clock "k8s.io/utils/clock/testing" @@ -531,3 +532,50 @@ func TestDockerContainersInfo(t *testing.T) { t.Errorf("expected error %q but received %q", expectedError, err) } } + +func Test_addOrUpdateOomInfo(t *testing.T) { + m := manager{ + oomInfos: map[string]*oomparser.ContainerOomInfo{}, + memoryCache: memory.New(2*time.Minute, nil), + containerLabelFunc: metrics.DefaultContainerLabels, + } + contReference := info.ContainerReference{ + Name: "testcontainer", + Id: "testcontainer1", + } + contInfo := &info.ContainerInfo{ + ContainerReference: contReference, + } + spec := info.ContainerSpec{} + mockHandler := containertest.NewMockContainerHandler("testcontainer") + mockHandler.On("GetSpec").Return( + spec, + nil, + ).Times(2) + mockHandler.On("ListContainers", container.ListSelf).Return( + []info.ContainerReference{contReference}, + nil, + ).Times(2) + collectorManager, _ := collector.NewCollectorManager() + cont := &containerData{ + info: containerInfo{ + ContainerReference: contReference, + }, + handler: mockHandler, + collectorManager: collectorManager, + infoLastUpdatedTime: time.Now(), + clock: clock.NewFakeClock(time.Now()), + } + m.memoryCache.AddStats(contInfo, &info.ContainerStats{ + Timestamp: time.Now(), + }) + + err := m.addOrUpdateOomInfo(cont, time.Now()) + assert.NoError(t, err) + assert.Equal(t, 1, len(m.oomInfos)) + assert.Equal(t, uint64(1), m.oomInfos["testcontainer1"].OomEvents) + + // update oom info, increase oom event counts + assert.NoError(t, m.addOrUpdateOomInfo(cont, time.Now())) + assert.Equal(t, uint64(2), m.oomInfos["testcontainer1"].OomEvents) +} diff --git a/metrics/metrics.go b/metrics/metrics.go index f314f631d7..bf7daab72c 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -19,6 +19,7 @@ import ( info "github.com/google/cadvisor/info/v1" v2 "github.com/google/cadvisor/info/v2" + "github.com/google/cadvisor/utils/oomparser" ) // metricValue describes a single metric value for a given set of label values @@ -39,4 +40,6 @@ type infoProvider interface { GetVersionInfo() (*info.VersionInfo, error) // GetMachineInfo provides information about the machine. GetMachineInfo() (*info.MachineInfo, error) + // GetOOMInfos provides information about OOMs. + GetOOMInfos() map[string]*oomparser.ContainerOomInfo } diff --git a/metrics/prometheus.go b/metrics/prometheus.go index a6fc3dff4c..fc09bd2a8f 100644 --- a/metrics/prometheus.go +++ b/metrics/prometheus.go @@ -1712,16 +1712,6 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc, includedMetri }, }...) } - if includedMetrics.Has(container.OOMMetrics) { - c.containerMetrics = append(c.containerMetrics, containerMetric{ - name: "container_oom_events_total", - help: "Count of out of memory events observed for the container", - valueType: prometheus.CounterValue, - getValues: func(s *info.ContainerStats) metricValues { - return metricValues{{value: float64(s.OOMEvents), timestamp: s.Timestamp}} - }, - }) - } return c } @@ -1832,24 +1822,21 @@ func (c *PrometheusCollector) collectContainersInfo(ch chan<- prometheus.Metric) } } + if c.includedMetrics.Has(container.OOMMetrics) { + for _, oomInfo := range c.infoProvider.GetOOMInfos() { + labels, values := genLabelValues(rawLabels, oomInfo.MetricLabels) + ch <- prometheus.NewMetricWithTimestamp(oomInfo.TimeOfDeath, prometheus.MustNewConstMetric( + prometheus.NewDesc("container_oom_events_total", "Count of out of memory events observed for the container", labels, nil), + prometheus.CounterValue, + float64(oomInfo.OomEvents), + values..., + )) + } + } + for _, cont := range containers { - values := make([]string, 0, len(rawLabels)) - labels := make([]string, 0, len(rawLabels)) containerLabels := c.containerLabelsFunc(cont) - for l := range rawLabels { - duplicate := false - sl := sanitizeLabelName(l) - for _, x := range labels { - if sl == x { - duplicate = true - break - } - } - if !duplicate { - labels = append(labels, sl) - values = append(values, containerLabels[l]) - } - } + labels, values := genLabelValues(rawLabels, containerLabels) // Container spec desc := prometheus.NewDesc("container_start_time_seconds", "Start time of the container since unix epoch in seconds.", labels, nil) @@ -1940,6 +1927,26 @@ func sanitizeLabelName(name string) string { return invalidNameCharRE.ReplaceAllString(name, "_") } +func genLabelValues(rawLabels map[string]struct{}, sourceLabels map[string]string) ([]string, []string) { + values := make([]string, 0, len(rawLabels)) + labels := make([]string, 0, len(rawLabels)) + for l := range rawLabels { + duplicate := false + sl := sanitizeLabelName(l) + for _, x := range labels { + if sl == x { + duplicate = true + break + } + } + if !duplicate { + labels = append(labels, sl) + values = append(values, sourceLabels[l]) + } + } + return labels, values +} + func getNumaStatsPerNode(nodeStats map[uint8]uint64, labels []string, timestamp time.Time) metricValues { mValues := make(metricValues, 0, len(nodeStats)) for node, stat := range nodeStats { diff --git a/metrics/prometheus_fake.go b/metrics/prometheus_fake.go index 7b7399778d..3e0e1bc928 100644 --- a/metrics/prometheus_fake.go +++ b/metrics/prometheus_fake.go @@ -20,6 +20,7 @@ import ( info "github.com/google/cadvisor/info/v1" v2 "github.com/google/cadvisor/info/v2" + "github.com/google/cadvisor/utils/oomparser" ) type testSubcontainersInfoProvider struct{} @@ -739,6 +740,23 @@ func (p testSubcontainersInfoProvider) GetRequestedContainersInfo(string, v2.Req }, nil } +func (p testSubcontainersInfoProvider) GetOOMInfos() map[string]*oomparser.ContainerOomInfo { + return map[string]*oomparser.ContainerOomInfo{ + "testcontainer": { + OomEvents: 0, + TimeOfDeath: time.Unix(1395066363, 0), + MetricLabels: map[string]string{ + "container_env_foo+env": "prod", + "container_label_foo.label": "bar", + "id": "testcontainer", + "image": "test", + "name": "testcontaineralias", + "zone.name": "hello", + }, + }, + } +} + type erroringSubcontainersInfoProvider struct { successfulProvider testSubcontainersInfoProvider shouldFail bool @@ -765,3 +783,10 @@ func (p *erroringSubcontainersInfoProvider) GetRequestedContainersInfo( } return p.successfulProvider.GetRequestedContainersInfo(a, opt) } + +func (p *erroringSubcontainersInfoProvider) GetOOMInfos() map[string]*oomparser.ContainerOomInfo { + if p.shouldFail { + return map[string]*oomparser.ContainerOomInfo{} + } + return p.successfulProvider.GetOOMInfos() +} diff --git a/metrics/prometheus_test.go b/metrics/prometheus_test.go index a9ef6f8899..69894e8668 100644 --- a/metrics/prometheus_test.go +++ b/metrics/prometheus_test.go @@ -23,6 +23,7 @@ import ( "github.com/google/cadvisor/container" info "github.com/google/cadvisor/info/v1" v2 "github.com/google/cadvisor/info/v2" + "github.com/google/cadvisor/utils/oomparser" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/testutil" @@ -150,6 +151,10 @@ func (m *mockInfoProvider) GetMachineInfo() (*info.MachineInfo, error) { return nil, errors.New("not supported") } +func (m *mockInfoProvider) GetOOMInfos() map[string]*oomparser.ContainerOomInfo { + return nil +} + func mockLabelFunc(*info.ContainerInfo) map[string]string { return map[string]string{} } diff --git a/utils/oomparser/oomparser.go b/utils/oomparser/oomparser.go index 32dc838fb9..958a91dbcc 100644 --- a/utils/oomparser/oomparser.go +++ b/utils/oomparser/oomparser.go @@ -21,7 +21,6 @@ import ( "time" "github.com/euank/go-kmsg-parser/kmsgparser" - "k8s.io/klog/v2" ) @@ -39,6 +38,12 @@ type OomParser struct { parser kmsgparser.Parser } +type ContainerOomInfo struct { + MetricLabels map[string]string + OomEvents uint64 + TimeOfDeath time.Time +} + // struct that contains information related to an OOM kill instance type OomInstance struct { // process id of the killed process