Skip to content

Add HTTP/TCP connections and Swap/Recovery metrics #314

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,9 @@ Further Information
| elasticsearch_indices_translog_size_in_bytes | counter | 1 | Total translog size in bytes
| elasticsearch_indices_warmer_time_seconds_total | counter | 1 | Total warmer time in seconds
| elasticsearch_indices_warmer_total | counter | 1 | Total warmer count
| elasticsearch_indices_recovery_current_as_source | gauge | 1 | Number of ongoing recoveries for which a shard serves as a source
| elasticsearch_indices_recovery_current_as_target | gauge | 1 | Number of ongoing recoveries for which a shard serves as a target
| elasticsearch_indices_recovery_throttle_time_seconds_total | counter | 1 | Time in seconds recovery operations were delayed due to throttling
| elasticsearch_jvm_gc_collection_seconds_count | counter | 2 | Count of JVM GC runs
| elasticsearch_jvm_gc_collection_seconds_sum | counter | 2 | GC run time in seconds
| elasticsearch_jvm_memory_committed_bytes | gauge | 2 | JVM memory currently committed by area
Expand All @@ -174,6 +177,9 @@ Further Information
| elasticsearch_os_load1 | gauge | 1 | Shortterm load average
| elasticsearch_os_load5 | gauge | 1 | Midterm load average
| elasticsearch_os_load15 | gauge | 1 | Longterm load average
| elasticsearch_os_swap_in_bytes_used | gauge | 1 | Amount of used swap space in bytes
| elasticsearch_os_swap_in_bytes_free | gauge | 1 | Amount of free swap space in bytes
| elasticsearch_os_swap_in_bytes_total | gauge | 1 | Total amount of swap space in bytes
| elasticsearch_process_cpu_percent | gauge | 1 | Percent CPU used by process
| elasticsearch_process_cpu_time_seconds_sum | counter | 3 | Process CPU time in seconds
| elasticsearch_process_mem_resident_size_bytes | gauge | 1 | Resident memory in use by process in bytes
Expand All @@ -195,10 +201,13 @@ Further Information
| elasticsearch_thread_pool_queue_count | gauge | 14 | Thread Pool operations queued
| elasticsearch_thread_pool_rejected_count | counter | 14 | Thread Pool operations rejected
| elasticsearch_thread_pool_threads_count | gauge | 14 | Thread Pool current threads count
| elasticsearch_transport_tcp_connections_opened_total | counter | 1 | Number of connections opened for cluster communication
| elasticsearch_transport_rx_packets_total | counter | 1 | Count of packets received
| elasticsearch_transport_rx_size_bytes_total | counter | 1 | Total number of bytes received
| elasticsearch_transport_tx_packets_total | counter | 1 | Count of packets sent
| elasticsearch_transport_tx_size_bytes_total | counter | 1 | Total number of bytes sent
| elasticsearch_http_connections_opened_current | counter | 1 | Current number of opened connections
| elasticsearch_http_connections_opened_total | counter | 1 | Total number of opened connections
| elasticsearch_clusterinfo_last_retrieval_success_ts | gauge | 1 | Timestamp of the last successful cluster info retrieval
| elasticsearch_clusterinfo_up | gauge | 1 | Up metric for the cluster info collector
| elasticsearch_clusterinfo_version_info | gauge | 6 | Constant metric with ES version information as labels
Expand Down
36 changes: 36 additions & 0 deletions collector/indices.go
Original file line number Diff line number Diff line change
Expand Up @@ -465,6 +465,42 @@ func NewIndices(logger log.Logger, client *http.Client, url *url.URL, shards boo
},
Labels: indexLabels,
},
{
Type: prometheus.GaugeValue,
Desc: prometheus.NewDesc(
prometheus.BuildFQName(namespace, "indices", "recovery_current_as_source"),
"Number of ongoing recoveries for which a shard serves as a source",
indexLabels.keys(), nil,
),
Value: func(indexStats IndexStatsIndexResponse) float64 {
return float64(indexStats.Total.Recovery.CurrentAsSource)
},
Labels: indexLabels,
},
{
Type: prometheus.GaugeValue,
Desc: prometheus.NewDesc(
prometheus.BuildFQName(namespace, "indices", "recovery_current_as_target"),
"Number of ongoing recoveries for which a shard serves as a target",
indexLabels.keys(), nil,
),
Value: func(indexStats IndexStatsIndexResponse) float64 {
return float64(indexStats.Total.Recovery.CurrentAsTarget)
},
Labels: indexLabels,
},
{
Type: prometheus.CounterValue,
Desc: prometheus.NewDesc(
prometheus.BuildFQName(namespace, "indices", "recovery_throttle_time_seconds_total"),
"Time in seconds recovery operations were delayed due to throttling",
indexLabels.keys(), nil,
),
Value: func(indexStats IndexStatsIndexResponse) float64 {
return float64(indexStats.Total.Recovery.ThrottleTimeInMillis) / 1000
},
Labels: indexLabels,
},
{
Type: prometheus.CounterValue,
Desc: prometheus.NewDesc(
Expand Down
74 changes: 73 additions & 1 deletion collector/nodes.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ func getRoles(node NodeStatsNodeResponse) map[string]bool {
}
}
}
if len(node.HTTP) == 0 {
if node.HTTP == nil {
roles["client"] = false
}
return roles
Expand Down Expand Up @@ -287,6 +287,42 @@ func NewNodes(logger log.Logger, client *http.Client, url *url.URL, all bool, no
},
Labels: defaultNodeLabelValues,
},
{
Type: prometheus.GaugeValue,
Desc: prometheus.NewDesc(
prometheus.BuildFQName(namespace, "os", "swap_in_bytes_used"),
"Amount of used swap space in bytes",
defaultNodeLabels, nil,
),
Value: func(node NodeStatsNodeResponse) float64 {
return float64(node.OS.Swap.Used)
},
Labels: defaultNodeLabelValues,
},
{
Type: prometheus.GaugeValue,
Desc: prometheus.NewDesc(
prometheus.BuildFQName(namespace, "os", "swap_in_bytes_free"),
"Amount of free swap space in bytes",
defaultNodeLabels, nil,
),
Value: func(node NodeStatsNodeResponse) float64 {
return float64(node.OS.Swap.Free)
},
Labels: defaultNodeLabelValues,
},
{
Type: prometheus.GaugeValue,
Desc: prometheus.NewDesc(
prometheus.BuildFQName(namespace, "os", "swap_in_bytes_total"),
"Total amount of swap space in bytes",
defaultNodeLabels, nil,
),
Value: func(node NodeStatsNodeResponse) float64 {
return float64(node.OS.Swap.Total)
},
Labels: defaultNodeLabelValues,
},
Comment on lines +290 to +325

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think these metrics should be named swap_used_bytes, swap_free_bytes & swap_total_bytes, similarly to the mem_* metrics.

{
Type: prometheus.GaugeValue,
Desc: prometheus.NewDesc(
Expand Down Expand Up @@ -1462,6 +1498,18 @@ func NewNodes(logger log.Logger, client *http.Client, url *url.URL, all bool, no
return append(defaultNodeLabelValues(cluster, node), "user")
},
},
{
Type: prometheus.CounterValue,
Desc: prometheus.NewDesc(
prometheus.BuildFQName(namespace, "transport", "tcp_connections_opened_total"),
"Number of connections opened for cluster communication",
defaultNodeLabels, nil,
),
Value: func(node NodeStatsNodeResponse) float64 {
return float64(node.Transport.ServerOpen)
},
Labels: defaultNodeLabelValues,
},
{
Type: prometheus.CounterValue,
Desc: prometheus.NewDesc(
Expand Down Expand Up @@ -1510,6 +1558,30 @@ func NewNodes(logger log.Logger, client *http.Client, url *url.URL, all bool, no
},
Labels: defaultNodeLabelValues,
},
{
Type: prometheus.CounterValue,
Copy link

@anti-social anti-social Nov 24, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Number of current open connections should be a gauge.

Desc: prometheus.NewDesc(
prometheus.BuildFQName(namespace, "http", "connections_opened_current"),
"Current number of opened connections",
defaultNodeLabels, nil,
),
Value: func(node NodeStatsNodeResponse) float64 {
return float64(node.HTTP.CurrentOpen)
},
Labels: defaultNodeLabelValues,
},
{
Type: prometheus.CounterValue,
Desc: prometheus.NewDesc(
prometheus.BuildFQName(namespace, "http", "connections_opened_total"),
"Total number of opened connections",
defaultNodeLabels, nil,
),
Value: func(node NodeStatsNodeResponse) float64 {
return float64(node.HTTP.TotalOpen)
},
Labels: defaultNodeLabelValues,
},
Comment on lines +1563 to +1584

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do you think it's worth renaming Elasticsearch's metrics? This makes it harder to match the exporter's metrics to their origin.

},
gcCollectionMetrics: []*gcCollectionMetric{
{
Expand Down
9 changes: 5 additions & 4 deletions collector/nodes_response.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ type NodeStatsNodeResponse struct {
ThreadPool map[string]NodeStatsThreadPoolPoolResponse `json:"thread_pool"`
JVM NodeStatsJVMResponse `json:"jvm"`
Breakers map[string]NodeStatsBreakersResponse `json:"breakers"`
HTTP map[string]int `json:"http"`
HTTP *NodeStatsHTTPResponse `json:"http"`
Transport NodeStatsTransportResponse `json:"transport"`
Process NodeStatsProcessResponse `json:"process"`
}
Expand Down Expand Up @@ -277,8 +277,9 @@ type NodeStatsOSMemResponse struct {

// NodeStatsOSSwapResponse defines node stats operating system swap usage structure
type NodeStatsOSSwapResponse struct {
Used int64 `json:"used_in_bytes"`
Free int64 `json:"free_in_bytes"`
Used int64 `json:"used_in_bytes"`
Free int64 `json:"free_in_bytes"`
Total int64 `json:"total_in_bytes"`
}

// NodeStatsOSCPUResponse defines node stats operating system CPU usage structure
Expand Down Expand Up @@ -325,7 +326,7 @@ type NodeStatsProcessCPUResponse struct {
// NodeStatsHTTPResponse defines node stats HTTP connections structure
type NodeStatsHTTPResponse struct {
CurrentOpen int64 `json:"current_open"`
TotalOpen int64 `json:"total_open"`
TotalOpen int64 `json:"total_opened"`
}

// NodeStatsFSResponse is a representation of a file system information, data path, free disk space, read/write stats
Expand Down