@@ -255,18 +255,20 @@ objects:
255
255
miqTarget: "ContainerNode"
256
256
severity: "HIGH"
257
257
message: "{{$labels.instance}} is down"
258
+
258
259
recording.rules : |
259
260
groups:
260
261
- name: aggregate_container_resources
261
262
rules:
262
263
- record: container_cpu_usage_rate
263
- expr: sum without (cpu) (rate(container_cpu_usage_seconds_total[3m ]))
264
+ expr: sum without (cpu) (rate(container_cpu_usage_seconds_total[5m ]))
264
265
- record: container_memory_rss_by_type
265
266
expr: container_memory_rss{id=~"/|/system.slice|/kubepods.slice"} > 0
266
267
- record: container_cpu_usage_percent_by_host
267
- expr: sum by (hostname,type)(rate(container_cpu_usage_seconds_total{id="/"}[3m ])) / on (hostname,type) machine_cpu_cores
268
+ expr: sum by (hostname,type)(rate(container_cpu_usage_seconds_total{id="/"}[5m ])) / on (hostname,type) machine_cpu_cores
268
269
- record: apiserver_request_count_by_resources
269
270
expr: sum without (client,instance,contentType) (rate(apiserver_request_count[5m]))
271
+
270
272
prometheus.yml : |
271
273
rule_files:
272
274
- '*.rules'
@@ -304,24 +306,6 @@ objects:
304
306
action: keep
305
307
regex: default;kubernetes;https
306
308
307
- # Scrape config for nodes.
308
- #
309
- # Each node exposes a /metrics endpoint that contains operational metrics for
310
- # the Kubelet and other components.
311
- - job_name: 'kubernetes-nodes'
312
-
313
- scheme: https
314
- tls_config:
315
- ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
316
- bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
317
-
318
- kubernetes_sd_configs:
319
- - role: node
320
-
321
- relabel_configs:
322
- - action: labelmap
323
- regex: __meta_kubernetes_node_label_(.+)
324
-
325
309
# Scrape config for controllers.
326
310
#
327
311
# Each master node exposes a /metrics endpoint on :8444 that contains operational metrics for
@@ -352,6 +336,31 @@ objects:
352
336
regex: (.+)(?::\d+)
353
337
replacement: $1:8444
354
338
339
+ # Scrape config for nodes.
340
+ #
341
+ # Each node exposes a /metrics endpoint that contains operational metrics for
342
+ # the Kubelet and other components.
343
+ - job_name: 'kubernetes-nodes'
344
+
345
+ scheme: https
346
+ tls_config:
347
+ ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
348
+ bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
349
+
350
+ kubernetes_sd_configs:
351
+ - role: node
352
+
353
+ # Drop a very high cardinality metric that is incorrect in 3.7. It will be
354
+ # fixed in 3.9.
355
+ metric_relabel_configs:
356
+ - source_labels: [__name__]
357
+ action: drop
358
+ regex: 'openshift_sdn_pod_(setup|teardown)_latency(.*)'
359
+
360
+ relabel_configs:
361
+ - action: labelmap
362
+ regex: __meta_kubernetes_node_label_(.+)
363
+
355
364
# Scrape config for cAdvisor.
356
365
#
357
366
# Beginning in Kube 1.7, each node exposes a /metrics/cadvisor endpoint that
@@ -368,6 +377,14 @@ objects:
368
377
kubernetes_sd_configs:
369
378
- role: node
370
379
380
+ # Exclude a set of high cardinality metrics that can contribute to significant
381
+ # memory use in large clusters. These can be selectively enabled as necessary
382
+ # for medium or small clusters.
383
+ metric_relabel_configs:
384
+ - source_labels: [__name__]
385
+ action: drop
386
+ regex: 'container_(cpu_user_seconds_total|cpu_cfs_periods_total|memory_usage_bytes|memory_swap|memory_working_set_bytes|memory_cache|last_seen|fs_(read_seconds_total|write_seconds_total|sector_(.*)|io_(.*)|reads_merged_total|writes_merged_total)|tasks_state|memory_failcnt|memory_failures_total|spec_memory_swap_limit_bytes|fs_(.*)_bytes_total|spec_(.*))'
387
+
371
388
relabel_configs:
372
389
- action: labelmap
373
390
regex: __meta_kubernetes_node_label_(.+)
0 commit comments