diff --git a/examples/prometheus/README.md b/examples/prometheus/README.md index 3764808fa488..07d078fdd914 100644 --- a/examples/prometheus/README.md +++ b/examples/prometheus/README.md @@ -53,6 +53,7 @@ $ oc process -f prometheus-standalone.yaml | oc apply -f - You can find the Prometheus route by invoking `oc get routes` and then browsing in your web console. Users who are granted `view` access on the namespace will have access to login to Prometheus. +To load rules, see the [rules README](/examples/prometheus/rules/README.md). ## Useful metrics queries @@ -175,4 +176,4 @@ Returns the number of successfully completed builds. > openshift_build_total{phase="Failed"} offset 5m -Returns the failed builds totals, per failure reason, from 5 minutes ago. \ No newline at end of file +Returns the failed builds totals, per failure reason, from 5 minutes ago. diff --git a/examples/prometheus/prometheus.yaml b/examples/prometheus/prometheus.yaml index d2eb77a75536..6198b377167d 100644 --- a/examples/prometheus/prometheus.yaml +++ b/examples/prometheus/prometheus.yaml @@ -366,7 +366,7 @@ objects: expr: up{job="kubernetes-nodes"} == 0 annotations: miqTarget: "ContainerNode" - severity: "HIGH" + severity: error message: "{{$labels.instance}} is down" recording.rules: | @@ -385,6 +385,7 @@ objects: prometheus.yml: | rule_files: - '*.rules' + - 'rules/*.rules' # A scrape configuration for running Prometheus on a Kubernetes cluster. # This uses separate scrape configs for cluster components (i.e. API server, node) diff --git a/examples/prometheus/rules/README.md b/examples/prometheus/rules/README.md new file mode 100644 index 000000000000..60d6d94da24b --- /dev/null +++ b/examples/prometheus/rules/README.md @@ -0,0 +1,31 @@ +# Prometheus and Alertmanager Rules + +## Loading Rules + +With this deployment method, all files in the rules directory are mounted into the pod from a configmap. + +1. 
Create a configmap from the rules directory + + oc create configmap base-rules --from-file=rules/ +1. Attach the configmap to the Prometheus statefulset as a volume + + oc volume statefulset/prometheus --add \ + --configmap-name=base-rules --name=base-rules -t configmap \ + --mount-path=/etc/prometheus/rules +1. Delete the pod so that it restarts with the new configuration + + oc delete $(oc get pods -o name --selector='app=prometheus') + +## Updating Rules + +1. Edit or add a local rules file +1. Validate the rules directory. (`promtool` may be downloaded from the [Prometheus web site](https://prometheus.io/download/).) + + promtool check rules rules/*.rules +1. Update the configmap + + oc create configmap base-rules --from-file=rules/ --dry-run -o yaml | oc apply -f - +1. Delete the pod so that it restarts with the new configuration + + oc delete $(oc get pods -o name --selector='app=prometheus') + diff --git a/examples/prometheus/rules/kube.rules b/examples/prometheus/rules/kube.rules new file mode 100644 index 000000000000..7c1f7a6b067d --- /dev/null +++ b/examples/prometheus/rules/kube.rules @@ -0,0 +1,73 @@ +groups: +- name: kubernetes-rules + rules: + + - alert: DockerLatencyHigh + expr: max(kubelet_docker_operations_latency_microseconds{quantile="0.9"}) / 1e+06 > 1 + for: 5m + labels: + severity: warning + annotations: + summary: Docker latency is high + description: "Docker latency is {{ $value }} seconds for 90% of kubelet operations" + alertType: latency + miqTarget: ContainerNode + component: container runtime + selfHealing: false + url: + + - alert: KubernetesAPIDown + expr: up{job="kubernetes-apiservers"} == 0 + for: 10m + labels: + severity: error + annotations: + summary: Kubernetes API server unreachable + description: "Kubernetes API server unreachable on {{ $labels.cluster }} instance {{ $labels.instance }}" + alertType: availability + miqTarget: ContainerNode + component: kubernetes + selfHealing: false + url: + + - alert: KubernetesAPIAbsent + expr: 
absent(up{job="kubernetes-apiservers"}) + for: 5m + labels: + severity: error + annotations: + summary: Kubernetes API server absent + description: Kubernetes API server absent + alertType: availability + miqTarget: ContainerNode + component: kubernetes + selfHealing: false + url: + + - alert: KubernetesAPIErrorsHigh + expr: sum by (instance) (rate(apiserver_request_count{code=~"^(?:5..)$"}[5m])) / sum by (instance) (rate(apiserver_request_count[5m])) * 100 > 5 + for: 5m + labels: + severity: warning + annotations: + summary: Kubernetes API server errors high + description: "Kubernetes API server errors (response code 5xx) are {{ $value }}% of total requests" + alertType: errors + miqTarget: ContainerNode + component: kubernetes + selfHealing: false + url: + + - alert: KubernetesAPILatencyHigh + expr: apiserver_request_latencies_summary{quantile="0.9",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} / 1e+06 > .5 + for: 10m + labels: + severity: warning + annotations: + summary: Kubernetes API server latency high + description: "Kubernetes API server request latency is {{ $value }} seconds for 90% of requests. NOTE: long-standing requests have been removed from alert query." + alertType: latency + miqTarget: ContainerNode + component: kubernetes + selfHealing: false + url: