Commit 0314a4b

Fix the naming for Metrics as per convention
Some metrics aren't named as per convention. This fixes them in a backward-compatible way. See https://prometheus.io/docs/practices/naming/
1 parent ab47f4e commit 0314a4b
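
The change follows a simple backward-compatible pattern: each legacy metric keeps its existing OpenCensus measure and view, and a convention-compliant measure is registered alongside it and recorded with the same value, so existing dashboards keep working while new ones can move to the new names. Below is a minimal sketch of that pattern, not the actual Tekton source: it uses plain OpenCensus `stats.Record` rather than Tekton's `knative.dev/pkg/metrics` wrapper, and only the two counter names are taken from this commit.

```go
package main

import (
	"context"
	"log"

	"go.opencensus.io/stats"
	"go.opencensus.io/stats/view"
)

var (
	// Old, non-conforming name kept for backward compatibility.
	trCount = stats.Float64("taskrun_count", "number of taskruns", stats.UnitDimensionless)
	// New name: counters end in _total per https://prometheus.io/docs/practices/naming/.
	trTotal = stats.Float64("taskrun_total", "Number of taskruns", stats.UnitDimensionless)
)

func main() {
	// Register a Count view for each measure so both metric names are exported.
	err := view.Register(
		&view.View{Measure: trCount, Description: trCount.Description(), Aggregation: view.Count()},
		&view.View{Measure: trTotal, Description: trTotal.Description(), Aggregation: view.Count()},
	)
	if err != nil {
		log.Fatal(err)
	}

	// Wherever the old counter was recorded, record both so the values stay in sync.
	stats.Record(context.Background(), trCount.M(1), trTotal.M(1))
}
```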

File tree

5 files changed: +136 -18 lines

docs/metrics.md

Lines changed: 14 additions & 12 deletions
@@ -15,18 +15,20 @@ We expose several kinds of exporters, including Prometheus, Google Stackdriver,
 |-----------------------------------------------------------------------------------------| ----------- | ----------- | ----------- |
 | `tekton_pipelines_controller_pipelinerun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `*pipeline`=&lt;pipeline_name&gt; <br> `*pipelinerun`=&lt;pipelinerun_name&gt; <br> `status`=&lt;status&gt; <br> `namespace`=&lt;pipelinerun-namespace&gt; | experimental |
 | `tekton_pipelines_controller_pipelinerun_taskrun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `*pipeline`=&lt;pipeline_name&gt; <br> `*pipelinerun`=&lt;pipelinerun_name&gt; <br> `status`=&lt;status&gt; <br> `*task`=&lt;task_name&gt; <br> `*taskrun`=&lt;taskrun_name&gt;<br> `namespace`=&lt;pipelineruns-taskruns-namespace&gt;| experimental |
-| `tekton_pipelines_controller_pipelinerun_count` | Counter | `status`=&lt;status&gt; <br> `*reason`=&lt;reason&gt; | experimental |
-| `tekton_pipelines_controller_running_pipelineruns_count` | Gauge | | experimental |
-| `tekton_pipelines_controller_taskrun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `status`=&lt;status&gt; <br> `*task`=&lt;task_name&gt; <br> `*taskrun`=&lt;taskrun_name&gt;<br> `namespace`=&lt;pipelineruns-taskruns-namespace&gt; | experimental |
-| `tekton_pipelines_controller_taskrun_count` | Counter | `status`=&lt;status&gt; <br> `*reason`=&lt;reason&gt; | experimental |
-| `tekton_pipelines_controller_running_taskruns_count` | Gauge | | experimental |
-| `tekton_pipelines_controller_running_taskruns_throttled_by_quota_count` | Gauge | | experimental |
-| `tekton_pipelines_controller_running_taskruns_throttled_by_node_count` | Gauge | | experimental |
-| `tekton_pipelines_controller_running_taskruns_waiting_on_task_resolution_count` | Gauge | | experimental |
-| `tekton_pipelines_controller_running_pipelineruns_waiting_on_pipeline_resolution_count` | Gauge | | experimental |
-| `tekton_pipelines_controller_running_pipelineruns_waiting_on_task_resolution_count` | Gauge | | experimental |
-| `tekton_pipelines_controller_taskruns_pod_latency_milliseconds` | Gauge | `namespace`=&lt;taskruns-namespace&gt; <br> `pod`= &lt; taskrun_pod_name&gt; <br> `*task`=&lt;task_name&gt; <br> `*taskrun`=&lt;taskrun_name&gt;<br> | experimental |
-| `tekton_pipelines_controller_client_latency_[bucket, sum, count]` | Histogram | | experimental |
+| `tekton_pipelines_controller_pipelinerun_count` | Counter | `status`=&lt;status&gt; | deprecate |
+| `tekton_pipelines_controller_pipelinerun_total` | Counter | `status`=&lt;status&gt; | experimental |
+| `tekton_pipelines_controller_running_pipelineruns_count` | Gauge | | deprecate |
+| `tekton_pipelines_controller_running_pipelineruns` | Gauge | | experimental |
+| `tekton_pipelines_controller_taskrun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `status`=&lt;status&gt; <br> `*task`=&lt;task_name&gt; <br> `*taskrun`=&lt;taskrun_name&gt;<br> `namespace`=&lt;pipelineruns-taskruns-namespace&gt; | experimental |
+| `tekton_pipelines_controller_taskrun_count` | Counter | `status`=&lt;status&gt; | deprecate |
+| `tekton_pipelines_controller_taskrun_total` | Counter | `status`=&lt;status&gt; | experimental |
+| `tekton_pipelines_controller_running_taskruns_count` | Gauge | | deprecate |
+| `tekton_pipelines_controller_running_taskruns` | Gauge | | experimental |
+| `tekton_pipelines_controller_running_taskruns_throttled_by_quota_count` | Gauge | | deprecate |
+| `tekton_pipelines_controller_running_taskruns_throttled_by_node_count` | Gauge | | deprecate |
+| `tekton_pipelines_controller_running_taskruns_throttled_by_quota` | Gauge | | experimental |
+| `tekton_pipelines_controller_running_taskruns_throttled_by_node` | Gauge | | experimental |
+| `tekton_pipelines_controller_client_latency_[bucket, sum, count]` | Histogram | | experimental |

The Labels/Tag marked as "*" are optional. And there's a choice between Histogram and LastValue(Gauge) for pipelinerun and taskrun duration metrics.
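
The Histogram vs LastValue(Gauge) choice mentioned above corresponds to which OpenCensus aggregation the duration view is registered with. A hedged sketch follows (illustrative only, not Tekton's actual code: the measure name comes from the table, while the bucket boundaries and the `useHistogram` flag are made up for the example).

```go
package main

import (
	"go.opencensus.io/stats"
	"go.opencensus.io/stats/view"
)

var prDuration = stats.Float64("pipelinerun_duration_seconds",
	"The pipelinerun's execution time in seconds", stats.UnitDimensionless)

// durationView exports the same measure either as a histogram or as a
// last-value gauge, depending on configuration.
func durationView(useHistogram bool) *view.View {
	agg := view.LastValue() // the "LastValue(Gauge)" flavour
	if useHistogram {
		agg = view.Distribution(10, 30, 60, 300, 900, 1800, 3600) // illustrative buckets
	}
	return &view.View{
		Description: prDuration.Description(),
		Measure:     prDuration,
		Aggregation: agg,
	}
}

func main() {
	if err := view.Register(durationView(true)); err != nil {
		panic(err)
	}
}
```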

pkg/pipelinerunmetrics/metrics.go

Lines changed: 56 additions & 4 deletions
@@ -56,20 +56,40 @@ var (
 		stats.UnitDimensionless)
 	prCountView *view.View

+	prTotal = stats.Float64("pipelinerun_total",
+		"Number of pipelineruns",
+		stats.UnitDimensionless)
+	prTotalView *view.View
+
 	runningPRsCount = stats.Float64("running_pipelineruns_count",
 		"Number of pipelineruns executing currently",
 		stats.UnitDimensionless)
 	runningPRsCountView *view.View

+	runningPRs = stats.Float64("running_pipelineruns",
+		"Number of pipelineruns executing currently",
+		stats.UnitDimensionless)
+	runningPRsView *view.View
+
 	runningPRsWaitingOnPipelineResolutionCount = stats.Float64("running_pipelineruns_waiting_on_pipeline_resolution_count",
 		"Number of pipelineruns executing currently that are waiting on resolution requests for their pipeline references.",
 		stats.UnitDimensionless)
 	runningPRsWaitingOnPipelineResolutionCountView *view.View

+	runningPRsWaitingOnPipelineResolution = stats.Float64("running_pipelineruns_waiting_on_pipeline_resolution",
+		"Number of pipelineruns executing currently that are waiting on resolution requests for their pipeline references.",
+		stats.UnitDimensionless)
+	runningPRsWaitingOnPipelineResolutionView *view.View
+
 	runningPRsWaitingOnTaskResolutionCount = stats.Float64("running_pipelineruns_waiting_on_task_resolution_count",
 		"Number of pipelineruns executing currently that are waiting on resolution requests for the task references of their taskrun children.",
 		stats.UnitDimensionless)
 	runningPRsWaitingOnTaskResolutionCountView *view.View
+
+	runningPRsWaitingOnTaskResolution = stats.Float64("running_pipelineruns_waiting_on_task_resolution",
+		"Number of pipelineruns executing currently that are waiting on resolution requests for the task references of their taskrun children.",
+		stats.UnitDimensionless)
+	runningPRsWaitingOnTaskResolutionView *view.View
 )

 const (

@@ -171,33 +191,61 @@ func viewRegister(cfg *config.Metrics) error {
 		Aggregation: view.Count(),
 		TagKeys:     prCountViewTags,
 	}
+	prTotalView = &view.View{
+		Description: prTotal.Description(),
+		Measure:     prTotal,
+		Aggregation: view.Count(),
+		TagKeys:     []tag.Key{statusTag},
+	}
+
 	runningPRsCountView = &view.View{
 		Description: runningPRsCount.Description(),
 		Measure:     runningPRsCount,
 		Aggregation: view.LastValue(),
 	}
+	runningPRsView = &view.View{
+		Description: runningPRs.Description(),
+		Measure:     runningPRs,
+		Aggregation: view.LastValue(),
+	}
+
 	runningPRsWaitingOnPipelineResolutionCountView = &view.View{
 		Description: runningPRsWaitingOnPipelineResolutionCount.Description(),
 		Measure:     runningPRsWaitingOnPipelineResolutionCount,
 		Aggregation: view.LastValue(),
 	}
+	runningPRsWaitingOnPipelineResolutionView = &view.View{
+		Description: runningPRsWaitingOnPipelineResolution.Description(),
+		Measure:     runningPRsWaitingOnPipelineResolution,
+		Aggregation: view.LastValue(),
+	}
+
 	runningPRsWaitingOnTaskResolutionCountView = &view.View{
 		Description: runningPRsWaitingOnTaskResolutionCount.Description(),
 		Measure:     runningPRsWaitingOnTaskResolutionCount,
 		Aggregation: view.LastValue(),
 	}
+	runningPRsWaitingOnTaskResolutionView = &view.View{
+		Description: runningPRsWaitingOnTaskResolution.Description(),
+		Measure:     runningPRsWaitingOnTaskResolution,
+		Aggregation: view.LastValue(),
+	}

 	return view.Register(
 		prDurationView,
 		prCountView,
+		prTotalView,
 		runningPRsCountView,
+		runningPRsView,
 		runningPRsWaitingOnPipelineResolutionCountView,
+		runningPRsWaitingOnPipelineResolutionView,
 		runningPRsWaitingOnTaskResolutionCountView,
+		runningPRsWaitingOnTaskResolutionView,
 	)
 }

 func viewUnregister() {
-	view.Unregister(prDurationView, prCountView, runningPRsCountView, runningPRsWaitingOnPipelineResolutionCountView, runningPRsWaitingOnTaskResolutionCountView)
+	view.Unregister(prDurationView, prCountView, prTotalView, runningPRsCountView, runningPRsView, runningPRsWaitingOnPipelineResolutionCountView, runningPRsWaitingOnPipelineResolutionView, runningPRsWaitingOnTaskResolutionCountView, runningPRsWaitingOnTaskResolutionView)
 }

 // MetricsOnStore returns a function that checks if metrics are configured for a config.Store, and registers it if so

@@ -282,6 +330,7 @@ func (r *Recorder) DurationAndCount(pr *v1.PipelineRun, beforeCondition *apis.Co

 	metrics.Record(ctx, prDuration.M(duration.Seconds()))
 	metrics.Record(ctx, prCount.M(1))
+	metrics.Record(ctx, prTotal.M(1))

 	return nil
 }

@@ -301,13 +350,13 @@ func (r *Recorder) RunningPipelineRuns(lister listers.PipelineRunLister) error {
 		return fmt.Errorf("failed to list pipelineruns while generating metrics : %w", err)
 	}

-	var runningPRs int
+	var runningPipelineRuns int
 	var trsWaitResolvingTaskRef int
 	var prsWaitResolvingPipelineRef int

 	for _, pr := range prs {
 		if !pr.IsDone() {
-			runningPRs++
+			runningPipelineRuns++
 			succeedCondition := pr.Status.GetCondition(apis.ConditionSucceeded)
 			if succeedCondition != nil && succeedCondition.Status == corev1.ConditionUnknown {
 				switch succeedCondition.Reason {

@@ -324,9 +373,12 @@ func (r *Recorder) RunningPipelineRuns(lister listers.PipelineRunLister) error {
 	if err != nil {
 		return err
 	}
-	metrics.Record(ctx, runningPRsCount.M(float64(runningPRs)))
 	metrics.Record(ctx, runningPRsWaitingOnPipelineResolutionCount.M(float64(prsWaitResolvingPipelineRef)))
+	metrics.Record(ctx, runningPRsWaitingOnPipelineResolution.M(float64(prsWaitResolvingPipelineRef)))
 	metrics.Record(ctx, runningPRsWaitingOnTaskResolutionCount.M(float64(trsWaitResolvingTaskRef)))
+	metrics.Record(ctx, runningPRsWaitingOnTaskResolution.M(float64(trsWaitResolvingTaskRef)))
+	metrics.Record(ctx, runningPRsCount.M(float64(runningPipelineRuns)))
+	metrics.Record(ctx, runningPRs.M(float64(runningPipelineRuns)))

 	return nil
 }

pkg/pipelinerunmetrics/metrics_test.go

Lines changed: 7 additions & 1 deletion
@@ -404,8 +404,11 @@ func TestRecordPipelineRunDurationCount(t *testing.T) {
 			}
 			if test.expectedCountTags != nil {
 				metricstest.CheckCountData(t, "pipelinerun_count", test.expectedCountTags, test.expectedCount)
+				delete(test.expectedCountTags, "reason")
+				metricstest.CheckCountData(t, "pipelinerun_total", test.expectedCountTags, test.expectedCount)
 			} else {
 				metricstest.CheckStatsNotReported(t, "pipelinerun_count")
+				metricstest.CheckStatsNotReported(t, "pipelinerun_total")
 			}
 		})
 	}

@@ -451,6 +454,7 @@ func TestRecordRunningPipelineRunsCount(t *testing.T) {
 		t.Errorf("RunningPipelineRuns: %v", err)
 	}
 	metricstest.CheckLastValueData(t, "running_pipelineruns_count", map[string]string{}, 1)
+	metricstest.CheckLastValueData(t, "running_pipelineruns", map[string]string{}, 1)
 }

 func TestRecordRunningPipelineRunsResolutionWaitCounts(t *testing.T) {

@@ -532,11 +536,13 @@ func TestRecordRunningPipelineRunsResolutionWaitCounts(t *testing.T) {
 		}
 		metricstest.CheckLastValueData(t, "running_pipelineruns_waiting_on_pipeline_resolution_count", map[string]string{}, tc.prWaitCount)
 		metricstest.CheckLastValueData(t, "running_pipelineruns_waiting_on_task_resolution_count", map[string]string{}, tc.trWaitCount)
+		metricstest.CheckLastValueData(t, "running_pipelineruns_waiting_on_pipeline_resolution", map[string]string{}, tc.prWaitCount)
+		metricstest.CheckLastValueData(t, "running_pipelineruns_waiting_on_task_resolution", map[string]string{}, tc.trWaitCount)
 	}
 }

 func unregisterMetrics() {
-	metricstest.Unregister("pipelinerun_duration_seconds", "pipelinerun_count", "running_pipelineruns_waiting_on_pipeline_resolution_count", "running_pipelineruns_waiting_on_task_resolution_count", "running_pipelineruns_count")
+	metricstest.Unregister("pipelinerun_duration_seconds", "pipelinerun_count", "pipelinerun_total", "running_pipelineruns_waiting_on_pipeline_resolution_count", "running_pipelineruns_waiting_on_pipeline_resolution", "running_pipelineruns_waiting_on_task_resolution_count", "running_pipelineruns_waiting_on_task_resolution", "running_pipelineruns_count", "running_pipelineruns")

 	// Allow the recorder singleton to be recreated.
 	once = sync.Once{}

pkg/taskrunmetrics/metrics.go

Lines changed: 55 additions & 0 deletions
@@ -56,9 +56,13 @@ var (
 	trDurationView *view.View
 	prTRDurationView *view.View
 	trCountView *view.View
+	trTotalView *view.View
 	runningTRsCountView *view.View
+	runningTRsView *view.View
 	runningTRsThrottledByQuotaCountView *view.View
 	runningTRsThrottledByNodeCountView *view.View
+	runningTRsThrottledByQuotaView *view.View
+	runningTRsThrottledByNodeView *view.View
 	runningTRsWaitingOnTaskResolutionCountView *view.View
 	podLatencyView *view.View


@@ -76,10 +80,18 @@ var (
 		"number of taskruns",
 		stats.UnitDimensionless)

+	trTotal = stats.Float64("taskrun_total",
+		"Number of taskruns",
+		stats.UnitDimensionless)
+
 	runningTRsCount = stats.Float64("running_taskruns_count",
 		"Number of taskruns executing currently",
 		stats.UnitDimensionless)

+	runningTRs = stats.Float64("running_taskruns",
+		"Number of taskruns executing currently",
+		stats.UnitDimensionless)
+
 	runningTRsThrottledByQuotaCount = stats.Float64("running_taskruns_throttled_by_quota_count",
 		"Number of taskruns executing currently, but whose underlying Pods or Containers are suspended by k8s because of defined ResourceQuotas. Such suspensions can occur as part of initial scheduling of the Pod, or scheduling of any of the subsequent Container(s) in the Pod after the first Container is started",
 		stats.UnitDimensionless)

@@ -92,6 +104,14 @@ var (
 		"Number of taskruns executing currently that are waiting on resolution requests for their task references.",
 		stats.UnitDimensionless)

+	runningTRsThrottledByQuota = stats.Float64("running_taskruns_throttled_by_quota",
+		"Number of taskruns executing currently, but whose underlying Pods or Containers are suspended by k8s because of defined ResourceQuotas. Such suspensions can occur as part of initial scheduling of the Pod, or scheduling of any of the subsequent Container(s) in the Pod after the first Container is started",
+		stats.UnitDimensionless)
+
+	runningTRsThrottledByNode = stats.Float64("running_taskruns_throttled_by_node",
+		"Number of taskruns executing currently, but whose underlying Pods or Containers are suspended by k8s because of Node level constraints. Such suspensions can occur as part of initial scheduling of the Pod, or scheduling of any of the subsequent Container(s) in the Pod after the first Container is started",
+		stats.UnitDimensionless)
+
 	podLatency = stats.Float64("taskruns_pod_latency_milliseconds",
 		"scheduling latency for the taskruns pods",
 		stats.UnitMilliseconds)

@@ -215,11 +235,23 @@ func viewRegister(cfg *config.Metrics) error {
 		Aggregation: view.Count(),
 		TagKeys:     trCountViewTags,
 	}
+	trTotalView = &view.View{
+		Description: trTotal.Description(),
+		Measure:     trTotal,
+		Aggregation: view.Count(),
+		TagKeys:     []tag.Key{statusTag},
+	}
 	runningTRsCountView = &view.View{
 		Description: runningTRsCount.Description(),
 		Measure:     runningTRsCount,
 		Aggregation: view.LastValue(),
 	}
+
+	runningTRsView = &view.View{
+		Description: runningTRs.Description(),
+		Measure:     runningTRs,
+		Aggregation: view.LastValue(),
+	}
 	runningTRsThrottledByQuotaCountView = &view.View{
 		Description: runningTRsThrottledByQuotaCount.Description(),
 		Measure:     runningTRsThrottledByQuotaCount,

@@ -235,6 +267,17 @@ func viewRegister(cfg *config.Metrics) error {
 		Measure:     runningTRsWaitingOnTaskResolutionCount,
 		Aggregation: view.LastValue(),
 	}
+
+	runningTRsThrottledByQuotaView = &view.View{
+		Description: runningTRsThrottledByQuota.Description(),
+		Measure:     runningTRsThrottledByQuota,
+		Aggregation: view.LastValue(),
+	}
+	runningTRsThrottledByNodeView = &view.View{
+		Description: runningTRsThrottledByNode.Description(),
+		Measure:     runningTRsThrottledByNode,
+		Aggregation: view.LastValue(),
+	}
 	podLatencyView = &view.View{
 		Description: podLatency.Description(),
 		Measure:     podLatency,

@@ -245,10 +288,14 @@ func viewRegister(cfg *config.Metrics) error {
 		trDurationView,
 		prTRDurationView,
 		trCountView,
+		trTotalView,
 		runningTRsCountView,
+		runningTRsView,
 		runningTRsThrottledByQuotaCountView,
 		runningTRsThrottledByNodeCountView,
 		runningTRsWaitingOnTaskResolutionCountView,
+		runningTRsThrottledByQuotaView,
+		runningTRsThrottledByNodeView,
 		podLatencyView,
 	)
 }

@@ -258,10 +305,14 @@ func viewUnregister() {
 		trDurationView,
 		prTRDurationView,
 		trCountView,
+		trTotalView,
 		runningTRsCountView,
+		runningTRsView,
 		runningTRsThrottledByQuotaCountView,
 		runningTRsThrottledByNodeCountView,
 		runningTRsWaitingOnTaskResolutionCountView,
+		runningTRsThrottledByQuotaView,
+		runningTRsThrottledByNodeView,
 		podLatencyView,
 	)
 }

@@ -356,6 +407,7 @@ func (r *Recorder) DurationAndCount(ctx context.Context, tr *v1.TaskRun, beforeC

 	metrics.Record(ctx, durationStat.M(duration.Seconds()))
 	metrics.Record(ctx, trCount.M(1))
+	metrics.Record(ctx, trTotal.M(1))

 	return nil
 }

@@ -402,9 +454,12 @@ func (r *Recorder) RunningTaskRuns(ctx context.Context, lister listers.TaskRunLi
 		return err
 	}
 	metrics.Record(ctx, runningTRsCount.M(float64(runningTrs)))
+	metrics.Record(ctx, runningTRs.M(float64(runningTrs)))
 	metrics.Record(ctx, runningTRsThrottledByNodeCount.M(float64(trsThrottledByNode)))
 	metrics.Record(ctx, runningTRsThrottledByQuotaCount.M(float64(trsThrottledByQuota)))
 	metrics.Record(ctx, runningTRsWaitingOnTaskResolutionCount.M(float64(trsWaitResolvingTaskRef)))
+	metrics.Record(ctx, runningTRsThrottledByNode.M(float64(trsThrottledByNode)))
+	metrics.Record(ctx, runningTRsThrottledByQuota.M(float64(trsThrottledByQuota)))

 	return nil
 }

pkg/taskrunmetrics/metrics_test.go

Lines changed: 4 additions & 1 deletion
@@ -415,8 +415,11 @@ func TestRecordTaskRunDurationCount(t *testing.T) {
 			}
 			if c.expectedCountTags != nil {
 				metricstest.CheckCountData(t, "taskrun_count", c.expectedCountTags, c.expectedCount)
+				delete(c.expectedCountTags, "reason")
+				metricstest.CheckCountData(t, "taskrun_total", c.expectedCountTags, c.expectedCount)
 			} else {
 				metricstest.CheckStatsNotReported(t, "taskrun_count")
+				metricstest.CheckStatsNotReported(t, "taskrun_total")
 			}
 			if c.expectedDurationTags != nil {
 				metricstest.CheckLastValueData(t, c.metricName, c.expectedDurationTags, c.expectedDuration)

@@ -680,7 +683,7 @@ func TestTaskRunIsOfPipelinerun(t *testing.T) {
 }

 func unregisterMetrics() {
-	metricstest.Unregister("taskrun_duration_seconds", "pipelinerun_taskrun_duration_seconds", "taskrun_count", "running_taskruns_count", "running_taskruns_throttled_by_quota_count", "running_taskruns_throttled_by_node_count", "running_taskruns_waiting_on_task_resolution_count", "taskruns_pod_latency_milliseconds")
+	metricstest.Unregister("taskrun_duration_seconds", "pipelinerun_taskrun_duration_seconds", "taskrun_count", "running_taskruns_count", "running_taskruns_throttled_by_quota_count", "running_taskruns_throttled_by_node_count", "running_taskruns_waiting_on_task_resolution_count", "taskruns_pod_latency_milliseconds", "taskrun_total", "running_taskruns", "running_taskruns_throttled_by_quota", "running_taskruns_throttled_by_node", "running_taskruns_waiting_on_task_resolution")

 	// Allow the recorder singleton to be recreated.
 	once = sync.Once{}
