Skip to content

Commit 7a98b17

Browse files
devoxeljaimeyh
authored andcommitted
collector: add tasks API collection (prometheus-community#778)
* collector: add tasks API collection This commit adds simple aggregation of the Elasticsearch Tasks API. There are 4 new metrics; though 3 are just bookkeeping. elasticsearch_task_stats_action_total is a gauge reporting the total number of tasks running for a given action. Because there are no stats endpoints available for this, this change introduces an aggregation step to group the number of tasks by action name. This metric is useful for ensuring long-running actions of a specific kind stay within a specific limit. Of particular use to me is the action: 'indices:data/write/delete/byquery'. In my use case, our ES access patterns mean we have a predefined limit of these actions running on the cluster. This change also adds two new CLI flags to manage the collection of tasks API: --es.tasks (to enable task collection) --es.tasks.actions (to filter tasks by action param) Issue prometheus-community#525 proposed addition of collection of these tasks. Signed-off-by: Aaron Delaney <[email protected]> * collector: use collector interface for tasks Signed-off-by: Aaron Delaney <[email protected]> * all: fix issues reported by golangci-lint Signed-off-by: Aaron Delaney <[email protected]> * collector: make task structs private to package Signed-off-by: Aaron Delaney <[email protected]> * Fix task stats metric name Signed-off-by: Aaron Delaney <[email protected]> * Fix tasks test Signed-off-by: Aaron Delaney <[email protected]> --------- Signed-off-by: Aaron Delaney <[email protected]>
1 parent 67ce83f commit 7a98b17

File tree

7 files changed

+260
-16
lines changed

7 files changed

+260
-16
lines changed

collector/cluster_info.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ type VersionInfo struct {
7777
LuceneVersion semver.Version `json:"lucene_version"`
7878
}
7979

80-
func (c *ClusterInfoCollector) Update(ctx context.Context, ch chan<- prometheus.Metric) error {
80+
func (c *ClusterInfoCollector) Update(_ context.Context, ch chan<- prometheus.Metric) error {
8181
resp, err := c.hc.Get(c.u.String())
8282
if err != nil {
8383
return err

collector/cluster_settings_test.go

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414
package collector
1515

1616
import (
17-
"context"
1817
"io"
1918
"net/http"
2019
"net/http/httptest"
@@ -24,21 +23,9 @@ import (
2423
"testing"
2524

2625
"github.com/go-kit/log"
27-
"github.com/prometheus/client_golang/prometheus"
2826
"github.com/prometheus/client_golang/prometheus/testutil"
2927
)
3028

31-
type wrapCollector struct {
32-
c Collector
33-
}
34-
35-
func (w wrapCollector) Describe(ch chan<- *prometheus.Desc) {
36-
}
37-
38-
func (w wrapCollector) Collect(ch chan<- prometheus.Metric) {
39-
w.c.Update(context.Background(), ch)
40-
}
41-
4229
func TestClusterSettingsStats(t *testing.T) {
4330
// Testcases created using:
4431
// docker run -d -p 9200:9200 elasticsearch:VERSION-alpine

collector/collector_test.go

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
// Copyright 2023 The Prometheus Authors
2+
// Licensed under the Apache License, Version 2.0 (the "License");
3+
// you may not use this file except in compliance with the License.
4+
// You may obtain a copy of the License at
5+
//
6+
// http://www.apache.org/licenses/LICENSE-2.0
7+
//
8+
// Unless required by applicable law or agreed to in writing, software
9+
// distributed under the License is distributed on an "AS IS" BASIS,
10+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
// See the License for the specific language governing permissions and
12+
// limitations under the License.
13+
14+
package collector
15+
16+
import (
17+
"context"
18+
19+
"github.com/prometheus/client_golang/prometheus"
20+
)
21+
22+
// wrapCollector is a util to let you test your Collector implementation.
23+
//
24+
// Use this with prometheus/client_golang/prometheus/testutil to test metric output, for example:
25+
//
26+
// testutil.CollectAndCompare(wrapCollector{c}, strings.NewReader(want))
27+
type wrapCollector struct {
28+
c Collector
29+
}
30+
31+
func (w wrapCollector) Describe(_ chan<- *prometheus.Desc) {
32+
}
33+
34+
func (w wrapCollector) Collect(ch chan<- prometheus.Metric) {
35+
w.c.Update(context.Background(), ch)
36+
}

collector/nodes_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,7 @@ type basicAuth struct {
138138
Next http.Handler
139139
}
140140

141-
func (h *basicAuth) checkAuth(w http.ResponseWriter, r *http.Request) bool {
141+
func (h *basicAuth) checkAuth(_ http.ResponseWriter, r *http.Request) bool {
142142
s := strings.SplitN(r.Header.Get("Authorization"), " ", 2)
143143
if len(s) != 2 {
144144
return false

collector/tasks.go

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
// Copyright 2023 The Prometheus Authors
2+
// Licensed under the Apache License, Version 2.0 (the "License");
3+
// you may not use this file except in compliance with the License.
4+
// You may obtain a copy of the License at
5+
//
6+
// http://www.apache.org/licenses/LICENSE-2.0
7+
//
8+
// Unless required by applicable law or agreed to in writing, software
9+
// distributed under the License is distributed on an "AS IS" BASIS,
10+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
// See the License for the specific language governing permissions and
12+
// limitations under the License.
13+
14+
package collector
15+
16+
import (
17+
"context"
18+
"encoding/json"
19+
"fmt"
20+
"io"
21+
"net/http"
22+
"net/url"
23+
24+
"github.com/alecthomas/kingpin/v2"
25+
"github.com/go-kit/log"
26+
"github.com/go-kit/log/level"
27+
"github.com/prometheus/client_golang/prometheus"
28+
)
29+
30+
// filterByTask global required because collector interface doesn't expose any way to take
31+
// constructor args.
32+
var actionFilter string
33+
34+
var taskActionDesc = prometheus.NewDesc(
35+
prometheus.BuildFQName(namespace, "task_stats", "action"),
36+
"Number of tasks of a certain action",
37+
[]string{"action"}, nil)
38+
39+
func init() {
40+
kingpin.Flag("tasks.actions",
41+
"Filter on task actions. Used in same way as Task API actions param").
42+
Default("indices:*").StringVar(&actionFilter)
43+
registerCollector("tasks", defaultDisabled, NewTaskCollector)
44+
}
45+
46+
// Task Information Struct
47+
type TaskCollector struct {
48+
logger log.Logger
49+
hc *http.Client
50+
u *url.URL
51+
}
52+
53+
// NewTaskCollector defines Task Prometheus metrics
54+
func NewTaskCollector(logger log.Logger, u *url.URL, hc *http.Client) (Collector, error) {
55+
level.Info(logger).Log("msg", "task collector created",
56+
"actionFilter", actionFilter,
57+
)
58+
59+
return &TaskCollector{
60+
logger: logger,
61+
hc: hc,
62+
u: u,
63+
}, nil
64+
}
65+
66+
func (t *TaskCollector) Update(ctx context.Context, ch chan<- prometheus.Metric) error {
67+
tasks, err := t.fetchTasks(ctx)
68+
if err != nil {
69+
return fmt.Errorf("failed to fetch and decode task stats: %w", err)
70+
}
71+
72+
stats := AggregateTasks(tasks)
73+
for action, count := range stats.CountByAction {
74+
ch <- prometheus.MustNewConstMetric(
75+
taskActionDesc,
76+
prometheus.GaugeValue,
77+
float64(count),
78+
action,
79+
)
80+
}
81+
return nil
82+
}
83+
84+
func (t *TaskCollector) fetchTasks(_ context.Context) (tasksResponse, error) {
85+
u := t.u.ResolveReference(&url.URL{Path: "_tasks"})
86+
q := u.Query()
87+
q.Set("group_by", "none")
88+
q.Set("actions", actionFilter)
89+
u.RawQuery = q.Encode()
90+
91+
var tr tasksResponse
92+
res, err := t.hc.Get(u.String())
93+
if err != nil {
94+
return tr, fmt.Errorf("failed to get data stream stats health from %s://%s:%s%s: %s",
95+
u.Scheme, u.Hostname(), u.Port(), u.Path, err)
96+
}
97+
98+
defer func() {
99+
err = res.Body.Close()
100+
if err != nil {
101+
level.Warn(t.logger).Log(
102+
"msg", "failed to close http.Client",
103+
"err", err,
104+
)
105+
}
106+
}()
107+
108+
if res.StatusCode != http.StatusOK {
109+
return tr, fmt.Errorf("HTTP Request to %v failed with code %d", u.String(), res.StatusCode)
110+
}
111+
112+
bts, err := io.ReadAll(res.Body)
113+
if err != nil {
114+
return tr, err
115+
}
116+
117+
err = json.Unmarshal(bts, &tr)
118+
return tr, err
119+
}
120+
121+
type (
	// tasksResponse models the top-level body returned by the Task
	// management API; only the "tasks" array is decoded.
	tasksResponse struct {
		Tasks []taskResponse `json:"tasks"`
	}

	// taskResponse models a single entry of the "tasks" array.
	//
	// Only the action name is decoded, since that is all the aggregation needs.
	taskResponse struct {
		Action string `json:"action"`
	}

	// aggregatedTaskStats holds the number of tasks seen for each action name.
	aggregatedTaskStats struct {
		CountByAction map[string]int64
	}
)

// AggregateTasks counts how many tasks in t share each action name. The
// returned CountByAction map is always non-nil, and is empty when t contains
// no tasks.
func AggregateTasks(t tasksResponse) aggregatedTaskStats {
	counts := make(map[string]int64)
	for i := range t.Tasks {
		name := t.Tasks[i].Action
		counts[name]++
	}
	return aggregatedTaskStats{CountByAction: counts}
}

collector/tasks_test.go

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
// Copyright 2023 The Prometheus Authors
2+
// Licensed under the Apache License, Version 2.0 (the "License");
3+
// you may not use this file except in compliance with the License.
4+
// You may obtain a copy of the License at
5+
//
6+
// http://www.apache.org/licenses/LICENSE-2.0
7+
//
8+
// Unless required by applicable law or agreed to in writing, software
9+
// distributed under the License is distributed on an "AS IS" BASIS,
10+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
// See the License for the specific language governing permissions and
12+
// limitations under the License.
13+
14+
package collector
15+
16+
import (
17+
"fmt"
18+
"net/http"
19+
"net/http/httptest"
20+
"net/url"
21+
"strings"
22+
"testing"
23+
24+
"github.com/go-kit/log"
25+
"github.com/prometheus/client_golang/prometheus/testutil"
26+
)
27+
28+
func TestTasks(t *testing.T) {
29+
// Test data was collected by running the following:
30+
// # create container
31+
// docker run -d --name elasticsearch -p 9200:9200 -p 9300:9300 -e "discovery.type=single-node" elasticsearch:7.17.11
32+
// sleep 15
33+
// # start some busy work in background
34+
// for i in $(seq 1 500)
35+
// do
36+
// curl -o /dev/null -sX POST "localhost:9200/a1/_doc" -H 'Content-Type: application/json' -d'{"a1": "'"$i"'"}'
37+
// sleep .01
38+
// curl -o /dev/null -sX POST "localhost:9200/a1/_doc" -H 'Content-Type: application/json' -d'{"a2": "'"$i"'"}'
39+
// sleep .01
40+
// curl -o /dev/null -sX POST "localhost:9200/a1/_doc" -H 'Content-Type: application/json' -d'{"a3": "'"$i"'"}'
41+
// sleep .01
42+
// done &
43+
// # try and collect a good sample
44+
// curl -X GET 'localhost:9200/_tasks?group_by=none&actions=indices:*'
45+
// # cleanup
46+
// docker rm --force elasticsearch
47+
tcs := map[string]string{
48+
"7.17": `{"tasks":[{"node":"9lWCm1y_QkujaAg75bVx7A","id":70,"type":"transport","action":"indices:admin/index_template/put","start_time_in_millis":1695900464655,"running_time_in_nanos":308640039,"cancellable":false,"headers":{}},{"node":"9lWCm1y_QkujaAg75bVx7A","id":73,"type":"transport","action":"indices:admin/index_template/put","start_time_in_millis":1695900464683,"running_time_in_nanos":280672000,"cancellable":false,"headers":{}},{"node":"9lWCm1y_QkujaAg75bVx7A","id":76,"type":"transport","action":"indices:admin/index_template/put","start_time_in_millis":1695900464711,"running_time_in_nanos":253247906,"cancellable":false,"headers":{}},{"node":"9lWCm1y_QkujaAg75bVx7A","id":93,"type":"transport","action":"indices:admin/index_template/put","start_time_in_millis":1695900464904,"running_time_in_nanos":60230460,"cancellable":false,"headers":{}},{"node":"9lWCm1y_QkujaAg75bVx7A","id":50,"type":"transport","action":"indices:data/write/index","start_time_in_millis":1695900464229,"running_time_in_nanos":734480468,"cancellable":false,"headers":{}},{"node":"9lWCm1y_QkujaAg75bVx7A","id":51,"type":"transport","action":"indices:admin/auto_create","start_time_in_millis":1695900464235,"running_time_in_nanos":729223933,"cancellable":false,"headers":{}}]}`,
49+
}
50+
want := `# HELP elasticsearch_task_stats_action Number of tasks of a certain action
51+
# TYPE elasticsearch_task_stats_action gauge
52+
elasticsearch_task_stats_action{action="indices:admin/auto_create"} 1
53+
elasticsearch_task_stats_action{action="indices:admin/index_template/put"} 4
54+
elasticsearch_task_stats_action{action="indices:data/write/index"} 1
55+
`
56+
for ver, out := range tcs {
57+
t.Run(ver, func(t *testing.T) {
58+
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
59+
fmt.Fprintln(w, out)
60+
}))
61+
defer ts.Close()
62+
63+
u, err := url.Parse(ts.URL)
64+
if err != nil {
65+
t.Fatalf("Failed to parse URL: %s", err)
66+
}
67+
68+
c, err := NewTaskCollector(log.NewNopLogger(), u, ts.Client())
69+
if err != nil {
70+
t.Fatalf("Failed to create collector: %v", err)
71+
}
72+
73+
if err := testutil.CollectAndCompare(wrapCollector{c}, strings.NewReader(want)); err != nil {
74+
t.Fatalf("Metrics did not match: %v", err)
75+
}
76+
})
77+
}
78+
}

pkg/clusterinfo/clusterinfo_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ const (
4444

4545
type mockES struct{}
4646

47-
func (mockES) ServeHTTP(w http.ResponseWriter, r *http.Request) {
47+
func (mockES) ServeHTTP(w http.ResponseWriter, _ *http.Request) {
4848

4949
fmt.Fprintf(w, `{
5050
"name" : "%s",

0 commit comments

Comments
 (0)