add example etcd rules and a doc

aweiteka · aweiteka · commit 951d15c459aa · 2018-01-08T10:01:28.000-05:00
Signed-off-by: Aaron Weitekamp &lt;aweiteka@redhat.com&gt;
diff --git a/examples/prometheus/README.md b/examples/prometheus/README.md
@@ -53,6 +53,7 @@ $ oc process -f prometheus-standalone.yaml | oc apply -f -
 
 You can find the Prometheus route by invoking `oc get routes` and then browsing in your web console. Users who are granted `view` access on the namespace will have access to login to Prometheus.
 
+To load rules see [rules README](/examples/prometheus/rules/README.md).
 
 ## Useful metrics queries
 
@@ -175,4 +176,4 @@ Returns the number of successfully completed builds.
 
 > openshift_build_total{phase="Failed"} offset 5m
 
-Returns the failed builds totals, per failure reason, from 5 minutes ago.
+Returns the failed builds totals, per failure reason, from 5 minutes ago.
diff --git a/examples/prometheus/prometheus.yaml b/examples/prometheus/prometheus.yaml
@@ -273,6 +273,7 @@ objects:
     prometheus.yml: |
       rule_files:
         - '*.rules'
+        - 'rules/*.rules'
 
       # A scrape configuration for running Prometheus on a Kubernetes cluster.
       # This uses separate scrape configs for cluster components (i.e. API server, node)
diff --git a/examples/prometheus/rules/README.md b/examples/prometheus/rules/README.md
@@ -0,0 +1,31 @@
+# Prometheus and Alertmanager Rules
+
+## Loading Rules
+
+With this deployment method all files in the rules directory are mounted into the pod as a configmap.
+
+1. Create a configmap of the rules directory
+
+        oc create configmap base-rules --from-file=rules/
+1. Attach the configmap to the prometheus statefulset as a volume
+
+        oc volume statefulset/prometheus --add \
+           --configmap-name=base-rules --name=base-rules -t configmap \
+           --mount-path=/etc/prometheus/rules
+1. Delete pod to restart with new configuration
+
+        oc delete $(oc get pods -o name --selector='app=prometheus')
+
+## Updating Rules
+
+1. Edit or add a local rules file
+1. Validate the rules directory. ('promtool' may be downloaded from the [Prometheus web site](https://prometheus.io/download/).)
+
+        promtool check rules rules/*.rules
+1. Update the configmap
+
+        oc create configmap base-rules --from-file=rules/ --dry-run -o yaml | oc apply -f -
+1. Delete pod to restart with new configuration
+
+        oc delete $(oc get pods -o name --selector='app=prometheus')
+
diff --git a/examples/prometheus/rules/kube.rules b/examples/prometheus/rules/kube.rules
@@ -0,0 +1,68 @@
+groups:
+- name: kubernetes-rules  # alerting rules for kubelet Docker latency and the Kubernetes API server
+  rules:
+
+    - alert: DockerLatencyHigh  # highest 0.9-quantile kubelet Docker operation latency, converted to seconds
+      expr: max(kubelet_docker_operations_latency_microseconds{quantile="0.9"}) / 1e+06 > 1
+      for: 5m  # the expression must stay true this long before the alert fires
+      labels:
+        severity: warning
+      annotations:
+        alertType: latency
+        component: container runtime
+        selfHealing: false
+        url:  # placeholder; no link is set
+        summary: 'Docker latency is high'
+        description: 'Docker latency is {{ $value }} seconds for 90% of kubelet operations'
+
+    - alert: KubernetesAPIDown  # a scrape target of the kubernetes-apiservers job reports up == 0
+      expr: up{job="kubernetes-apiservers"} == 0
+      for: 10m  # the expression must stay true this long before the alert fires
+      labels:
+        severity: critical
+      annotations:
+        alertType: availability
+        component: kubernetes
+        selfHealing: false
+        url:  # placeholder; no link is set
+        summary: 'Kubernetes API server unreachable'
+        description: 'Kubernetes API server unreachable on {{ $labels.cluster }} instance {{ $labels.instance }}'
+
+    - alert: KubernetesAPIAbsent  # no `up` series exists at all for the kubernetes-apiservers job
+      expr: absent(up{job="kubernetes-apiservers"})
+      for: 5m  # the expression must stay true this long before the alert fires
+      labels:
+        severity: critical
+      annotations:
+        alertType: availability
+        component: kubernetes
+        selfHealing: false
+        url:  # placeholder; no link is set
+        summary: 'Kubernetes API server absent'
+        description: 'Kubernetes API server absent'
+
+    - alert: KubernetesAPIErrorsHigh  # 5xx % of all requests; sum() keeps a series from being divided by itself
+      expr: sum(rate(apiserver_request_count{code=~"^(?:5..)$"}[5m])) / sum(rate(apiserver_request_count[5m])) * 100 > 5
+      for: 5m  # the expression must stay true this long before the alert fires
+      labels:
+        severity: warning
+      annotations:
+        alertType: errors
+        component: kubernetes
+        selfHealing: false
+        url:  # placeholder; no link is set
+        summary: 'Kubernetes API server errors high'
+        description: 'Kubernetes API server errors (response code 5xx) are {{ $value }}% of total requests'
+
+    - alert: KubernetesAPILatencyHigh  # 0.9-quantile request latency in seconds; log subresource and long-running verbs excluded
+      expr: apiserver_request_latencies_summary{quantile="0.9",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} / 1e+06 > .5
+      for: 10m  # the expression must stay true this long before the alert fires
+      labels:
+        severity: warning
+      annotations:
+        alertType: latency
+        component: kubernetes
+        selfHealing: false
+        url:  # placeholder; no link is set
+        summary: 'Kubernetes API server latency high'
+        description: 'Kubernetes API server request latency is {{ $value }} seconds for 90% of requests. NOTE: long-standing requests have been removed from alert query.'