@@ -37,6 +37,7 @@ import (
 	info "github.com/google/cadvisor/info/v1"
 	v2 "github.com/google/cadvisor/info/v2"
 	"github.com/google/cadvisor/machine"
+	"github.com/google/cadvisor/metrics"
 	"github.com/google/cadvisor/nvm"
 	"github.com/google/cadvisor/perf"
 	"github.com/google/cadvisor/resctrl"
@@ -144,6 +145,8 @@ type Manager interface {
 	AllPodmanContainers(c *info.ContainerInfoRequest) (map[string]info.ContainerInfo, error)

 	PodmanContainer(containerName string, query *info.ContainerInfoRequest) (info.ContainerInfo, error)
+
+	GetOOMInfos() map[string]*oomparser.ContainerOomInfo
 }

 // Housekeeping configuration for the manager
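The new `GetOOMInfos` accessor gives exporters read access to the retained OOM records. Below is a minimal sketch of a caller, assuming a started `Manager` named `m` and imports of `fmt`, `sync/atomic`, and `time`; the `ContainerOomInfo` fields (`MetricLabels`, `TimeOfDeath`, `OomEvents`) are the ones this change introduces, and `reportOOMs` is a hypothetical helper, not part of the PR:

```go
// reportOOMs is a hypothetical helper: log every OOM record still inside
// the retention window. Assumes m is a started Manager from this package.
func reportOOMs(m Manager) {
	for id, oom := range m.GetOOMInfos() {
		// OomEvents is bumped with atomic.AddUint64, so read it atomically.
		events := atomic.LoadUint64(&oom.OomEvents)
		fmt.Printf("container %s: %d OOM kill(s), last at %s, labels=%v\n",
			id, events, oom.TimeOfDeath.Format(time.RFC3339), oom.MetricLabels)
	}
}
```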
@@ -153,7 +156,9 @@ type HouskeepingConfig = struct {
}

// New takes a memory storage and returns a new manager.
-func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, houskeepingConfig HouskeepingConfig, includedMetricsSet container.MetricSet, collectorHTTPClient *http.Client, rawContainerCgroupPathPrefixWhiteList, containerEnvMetadataWhiteList []string, perfEventsFile string, resctrlInterval time.Duration) (Manager, error) {
+func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, houskeepingConfig HouskeepingConfig, includedMetricsSet container.MetricSet,
+	collectorHTTPClient *http.Client, rawContainerCgroupPathPrefixWhiteList, containerEnvMetadataWhiteList []string,
+	perfEventsFile string, resctrlInterval time.Duration, f metrics.ContainerLabelsFunc, oomRetainDuration *time.Duration) (Manager, error) {
 	if memoryCache == nil {
 		return nil, fmt.Errorf("manager requires memory storage")
 	}
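Call sites must now pass two extra trailing arguments: a `metrics.ContainerLabelsFunc` that decides which labels are stored with each OOM record, and a pointer to the retention duration for those records. A hedged sketch of an updated call site; every earlier argument is a placeholder assumed to be in scope with its previous meaning, and `metrics.DefaultContainerLabels` is used here on the assumption that the existing default labels helper is an acceptable choice:

```go
// Sketch only: earlier arguments keep their previous meaning; just the
// last two are new in this change.
retain := 5 * time.Minute
mgr, err := manager.New(
	memoryCache,        // *memory.InMemoryCache
	sysFs,              // sysfs.SysFs
	housekeepingConfig, // HouskeepingConfig
	includedMetrics,    // container.MetricSet
	http.DefaultClient, // collector HTTP client
	nil,                // raw cgroup path prefix whitelist
	nil,                // container env metadata whitelist
	"",                 // perf events config file
	10*time.Second,     // resctrl polling interval
	metrics.DefaultContainerLabels, // new: labels stored with OOM records
	&retain,                        // new: how long OOM records are retained
)
if err != nil {
	klog.Fatalf("failed to create manager: %v", err)
}
_ = mgr
```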
@@ -208,6 +213,9 @@ func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, houskeepingConfig
 		collectorHTTPClient:                   collectorHTTPClient,
 		rawContainerCgroupPathPrefixWhiteList: rawContainerCgroupPathPrefixWhiteList,
 		containerEnvMetadataWhiteList:         containerEnvMetadataWhiteList,
+		oomInfos:                              map[string]*oomparser.ContainerOomInfo{},
+		containerLabelFunc:                    f,
+		oomRetainDuration:                     oomRetainDuration,
 	}

 	machineInfo, err := machine.Info(sysfs, fsInfo, inHostNamespace)
@@ -247,6 +255,7 @@ type namespacedContainerName struct {
 }

 type manager struct {
+	oomInfos          map[string]*oomparser.ContainerOomInfo
 	containers        map[namespacedContainerName]*containerData
 	containersLock    sync.RWMutex
 	memoryCache       *memory.InMemoryCache
@@ -271,6 +280,8 @@ type manager struct {
 	rawContainerCgroupPathPrefixWhiteList []string
 	// List of container env prefix whitelist, the matched container envs would be collected into metrics as extra labels.
 	containerEnvMetadataWhiteList []string
+	containerLabelFunc            metrics.ContainerLabelsFunc
+	oomRetainDuration             *time.Duration
 }

 func (m *manager) PodmanContainer(containerName string, query *info.ContainerInfoRequest) (info.ContainerInfo, error) {
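The new `containerLabelFunc` field controls which labels end up on each retained OOM record; `metrics.ContainerLabelsFunc` takes a `*info.ContainerInfo` and returns a label map. A hedged sketch of a custom implementation (the function name `oomMetricLabels` is hypothetical):

```go
// oomMetricLabels is a hypothetical metrics.ContainerLabelsFunc that keeps
// only the container name and image, trimming label cardinality.
func oomMetricLabels(c *info.ContainerInfo) map[string]string {
	labels := map[string]string{"name": c.Name}
	if c.Spec.Image != "" {
		labels["image"] = c.Spec.Image
	}
	return labels
}
```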
@@ -318,7 +329,7 @@ func (m *manager) Start() error {
 		return err
 	}
 	klog.V(2).Infof("Starting recovery of all containers")
-	err = m.detectSubcontainers("/")
+	err = m.detectSubContainers("/")
 	if err != nil {
 		return err
 	}
@@ -340,6 +351,7 @@ func (m *manager) Start() error {
 	quitUpdateMachineInfo := make(chan error)
 	m.quitChannels = append(m.quitChannels, quitUpdateMachineInfo)
 	go m.updateMachineInfo(quitUpdateMachineInfo)
+	go m.cleanUpOomInfos()

 	return nil
 }
@@ -363,6 +375,61 @@ func (m *manager) Stop() error {
 	return nil
 }

+func (m *manager) GetOOMInfos() map[string]*oomparser.ContainerOomInfo {
+	m.containersLock.RLock()
+	defer m.containersLock.RUnlock()
+	oomInfos := make(map[string]*oomparser.ContainerOomInfo)
+	for k, v := range m.oomInfos {
+		if time.Since(v.TimeOfDeath) > *m.oomRetainDuration {
+			continue
+		}
+		oomInfos[k] = v
+	}
+	return oomInfos
+}
+
+func (m *manager) cleanUpOomInfos() {
+	ticker := time.NewTicker(time.Minute)
+	defer ticker.Stop()
+
+	for {
+		select {
+		case <-ticker.C:
+			m.containersLock.Lock()
+			for k, v := range m.oomInfos {
+				if time.Since(v.TimeOfDeath) > *m.oomRetainDuration {
+					delete(m.oomInfos, k)
+				}
+			}
+			m.containersLock.Unlock()
+		}
+	}
+}
+
+func (m *manager) addOrUpdateOomInfo(cont *containerData, timeOfDeath time.Time) error {
+	m.containersLock.Lock()
+	defer m.containersLock.Unlock()
+
+	contInfo, err := m.containerDataToContainerInfo(cont, &info.ContainerInfoRequest{
+		NumStats: 60,
+	})
+	if err != nil {
+		return err
+	}
+	if oomInfo, ok := m.oomInfos[contInfo.Id]; ok {
+		atomic.AddUint64(&oomInfo.OomEvents, 1)
+		return nil
+	}
+	containerLabels := m.containerLabelFunc(contInfo)
+	newOomInfo := &oomparser.ContainerOomInfo{
+		MetricLabels: containerLabels,
+		TimeOfDeath:  timeOfDeath,
+	}
+	atomic.AddUint64(&newOomInfo.OomEvents, 1)
+	m.oomInfos[contInfo.Id] = newOomInfo
+	return nil
+}
+
 func (m *manager) destroyCollectors() {
 	for _, container := range m.containers {
 		container.perfCollector.Destroy()
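Note that `GetOOMInfos` applies the retention check at read time as well, so a record that has expired but not yet been swept by `cleanUpOomInfos` is never returned. The sweep itself is a standard ticker-plus-lock pattern; below is a self-contained sketch of the same idea with the tick interval and a quit channel made explicit for demonstration (the manager's goroutine instead runs on a fixed one-minute tick for the life of the process):

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

// pruneLoop mirrors cleanUpOomInfos: on every tick, drop entries older than
// retain while holding the lock. quit is an illustrative addition.
func pruneLoop(mu *sync.Mutex, seen map[string]time.Time, retain, interval time.Duration, quit <-chan struct{}) {
	ticker := time.NewTicker(interval)
	defer ticker.Stop()
	for {
		select {
		case <-ticker.C:
			mu.Lock()
			for k, t := range seen {
				if time.Since(t) > retain {
					delete(seen, k) // deleting while ranging a map is safe in Go
				}
			}
			mu.Unlock()
		case <-quit:
			return
		}
	}
}

func main() {
	var mu sync.Mutex
	seen := map[string]time.Time{
		"fresh": time.Now(),
		"stale": time.Now().Add(-2 * time.Hour),
	}
	quit := make(chan struct{})
	go pruneLoop(&mu, seen, time.Hour, 5*time.Millisecond, quit)
	time.Sleep(20 * time.Millisecond)
	close(quit)
	mu.Lock()
	fmt.Println("remaining entries:", len(seen)) // "stale" has been pruned
	mu.Unlock()
}
```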
@@ -406,7 +473,7 @@ func (m *manager) globalHousekeeping(quit chan error) {
			start := time.Now()

			// Check for new containers.
-			err := m.detectSubcontainers("/")
+			err := m.detectSubContainers("/")
			if err != nil {
				klog.Errorf("Failed to detect containers: %s", err)
			}
@@ -1056,7 +1123,7 @@ func (m *manager) destroyContainerLocked(containerName string) error {

 // Detect all containers that have been added or deleted from the specified container.
 func (m *manager) getContainersDiff(containerName string) (added []info.ContainerReference, removed []info.ContainerReference, err error) {
-	// Get all subcontainers recursively.
+	// Get all subContainers recursively.
 	m.containersLock.RLock()
 	cont, ok := m.containers[namespacedContainerName{
 		Name: containerName,
@@ -1103,8 +1170,8 @@ func (m *manager) getContainersDiff(containerName string) (added []info.Containe
 	return
 }

-// Detect the existing subcontainers and reflect the setup here.
-func (m *manager) detectSubcontainers(containerName string) error {
+// Detect the existing subContainers and reflect the setup here.
+func (m *manager) detectSubContainers(containerName string) error {
 	added, removed, err := m.getContainersDiff(containerName)
 	if err != nil {
 		return err
@@ -1147,7 +1214,7 @@ func (m *manager) watchForNewContainers(quit chan error) error {
 	}

 	// There is a race between starting the watch and new container creation so we do a detection before we read new containers.
-	err := m.detectSubcontainers("/")
+	err := m.detectSubContainers("/")
 	if err != nil {
 		return err
 	}
@@ -1247,7 +1314,9 @@ func (m *manager) watchForNewOoms() error {
				continue
			}
			for _, cont := range conts {
-				atomic.AddUint64(&cont.oomEvents, 1)
+				if err := m.addOrUpdateOomInfo(cont, oomInstance.TimeOfDeath); err != nil {
+					klog.Errorf("failed to add OOM info for %q: %v", oomInstance.ContainerName, err)
+				}
			}
		}
	}()
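For context, the loop above sits inside `watchForNewOoms`, which is fed by cadvisor's existing `oomparser`. A hedged, abridged reconstruction of the delivery side as a fragment inside that method (the lookup that resolves `oomInstance.ContainerName` to the matching containers is elided):

```go
// Abridged sketch of watchForNewOoms' plumbing: oomparser tails the kernel
// log and delivers one OomInstance per OOM kill.
outStream := make(chan *oomparser.OomInstance, 10)
oomLog, err := oomparser.New()
if err != nil {
	return err
}
go oomLog.StreamOoms(outStream)
go func() {
	for oomInstance := range outStream {
		// oomInstance carries ContainerName and TimeOfDeath, which the hunk
		// above forwards to m.addOrUpdateOomInfo for every matching
		// container (lookup elided).
		_ = oomInstance
	}
}()
```

Because `addOrUpdateOomInfo` deduplicates on `contInfo.Id`, repeated kills in the same container increment `OomEvents` on one retained record instead of creating new entries.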