Update the existing Grafana Dashboard to include the new reconnect metrics
poulok opened this issue · comments
There is an existing "Reconnect" platform Grafana Dashboard:
Preproduction: https://preproduction.grafana.hedera-ops.com/d/a11737aa-3390-4951-8ddb-0b1b3f06aebc/reconnects?from=now-3h&to=now&var-network=performance&var-NodeID=All&orgId=1
Production: https://production.grafana.hedera-ops.com/d/a11737aa-3390-4951-8ddb-0b1b3f06aebc/reconnects?from=1707501036987&to=1707688643869&var-network=mainnet&var-NodeID=All&orgId=1
The definitions for the production instances are kept in the repo under the "infrastructure" folder. The dashboard needs to be updated to include the new reconnect metrics with appropriate and useful visualizations. This can be done interactively in Grafana and the JSON exported for inclusion in the repo. Preproduction must be updated by hand (I think; need to verify whether these are kept in the repo or not).
The best strategy is to experiment on the preproduction instance, then convert it to production. @beeradb is a good contact for help with this. `engnet1` runs `develop` nightly.
Depends on all other tickets in #12412.
There exists a non-trivial diff between the JSON dashboard definition currently in production Grafana and the `reconnect.json` file in hedera-services under infrastructure/.../production/... as shown below. I'm unsure which version is "correct" or "latest". If I had to guess, I'd choose the .json in the repository as the correct and latest definition, assuming that we do maintain our infrastructure as code.
The diff (~700 LOC):
2,398d1
< "__inputs": [
< {
< "name": "DS_GRAFANACLOUD-SWIRLDSLABSPREPRODUCTION-PROM",
< "label": "grafanacloud-swirldslabspreproduction-prom",
< "description": "",
< "type": "datasource",
< "pluginId": "prometheus",
< "pluginName": "Prometheus"
< },
< {
< "name": "DS_GRAFANACLOUD-SWIRLDSLABSPREPRODUCTION-PROM-FOR-LIBRARY-PANEL",
< "label": "grafanacloud-swirldslabspreproduction-prom",
< "description": "",
< "type": "datasource",
< "pluginId": "prometheus",
< "pluginName": "Prometheus",
< "usage": {
< "libraryPanels": [
< {
< "name": "Platform Status (new)",
< "uid": "bdcky36wafe9se"
< },
< {
< "name": "conns Changes",
< "uid": "ef64b104-def4-4374-ba11-df4fb2a8ec2d"
< }
< ]
< }
< }
< ],
< "__elements": {
< "bdcky36wafe9se": {
< "name": "Platform Status (new)",
< "uid": "bdcky36wafe9se",
< "kind": 1,
< "model": {
< "datasource": {
< "type": "prometheus",
< "uid": "${DS_GRAFANACLOUD-SWIRLDSLABSPREPRODUCTION-PROM-FOR-LIBRARY-PANEL}"
< },
< "description": "",
< "fieldConfig": {
< "defaults": {
< "color": {
< "mode": "continuous-GrYlRd"
< },
< "custom": {
< "fillOpacity": 61,
< "hideFrom": {
< "legend": false,
< "tooltip": false,
< "viz": false
< },
< "insertNulls": false,
< "lineWidth": 0,
< "spanNulls": false
< },
< "mappings": [
< {
< "options": {
< "1": {
< "color": "super-light-green",
< "index": 0,
< "text": "STARTING_UP"
< },
< "2": {
< "color": "green",
< "index": 1,
< "text": "ACTIVE"
< },
< "4": {
< "color": "red",
< "index": 2,
< "text": "BEHIND"
< },
< "5": {
< "color": "light-blue",
< "index": 3,
< "text": "FREEZING"
< },
< "6": {
< "color": "dark-blue",
< "index": 4,
< "text": "FREEZE_COMPLETE"
< },
< "7": {
< "color": "super-light-yellow",
< "index": 5,
< "text": "REPLAYING_EVENTS"
< },
< "8": {
< "color": "text",
< "index": 6,
< "text": "OBSERVING"
< },
< "9": {
< "color": "orange",
< "index": 7,
< "text": "CHECKING"
< },
< "10": {
< "color": "purple",
< "index": 8,
< "text": "RECONNECT_COMPLETE"
< },
< "11": {
< "color": "#250707",
< "index": 9,
< "text": "CATASTROPHIC_FAILURE"
< }
< },
< "type": "value"
< }
< ],
< "thresholds": {
< "mode": "absolute",
< "steps": [
< {
< "color": "green",
< "value": null
< },
< {
< "color": "red",
< "value": 80
< }
< ]
< },
< "unitScale": true
< },
< "overrides": []
< },
< "libraryPanel": {
< "name": "Platform Status (new)",
< "uid": "bdcky36wafe9se"
< },
< "options": {
< "alignValue": "left",
< "legend": {
< "displayMode": "list",
< "placement": "bottom",
< "showLegend": false
< },
< "mergeValues": true,
< "rowHeight": 0.9,
< "showValue": "auto",
< "tooltip": {
< "mode": "single",
< "sort": "none"
< }
< },
< "targets": [
< {
< "datasource": {
< "type": "prometheus",
< "uid": "grafanacloud-prom"
< },
< "disableTextWrap": false,
< "editorMode": "builder",
< "expr": "platform_PlatformStatus{environment=\"$network\", node_id=~\"$NodeID\"}",
< "fullMetaSearch": false,
< "includeNullMetadata": true,
< "instant": false,
< "legendFormat": "node {{node_id}}",
< "range": true,
< "refId": "A",
< "useBackend": false
< }
< ],
< "title": "Platform Status",
< "type": "state-timeline"
< }
< },
< "a5c5c524-a234-4292-85ae-9fb58f7a863a": {
< "name": "syncGenDiff",
< "uid": "a5c5c524-a234-4292-85ae-9fb58f7a863a",
< "kind": 1,
< "model": {
< "datasource": {
< "type": "prometheus",
< "uid": "${DS_GRAFANACLOUD-SWIRLDSLABSPREPRODUCTION-PROM}"
< },
< "description": "The number of generation ahead (positive) or behind (negative) when syncing.",
< "fieldConfig": {
< "defaults": {
< "color": {
< "mode": "palette-classic"
< },
< "custom": {
< "axisBorderShow": false,
< "axisCenteredZero": false,
< "axisColorMode": "text",
< "axisLabel": "generations",
< "axisPlacement": "auto",
< "barAlignment": 0,
< "drawStyle": "line",
< "fillOpacity": 0,
< "gradientMode": "none",
< "hideFrom": {
< "legend": false,
< "tooltip": false,
< "viz": false
< },
< "insertNulls": false,
< "lineInterpolation": "linear",
< "lineWidth": 1,
< "pointSize": 5,
< "scaleDistribution": {
< "type": "linear"
< },
< "showPoints": "auto",
< "spanNulls": false,
< "stacking": {
< "group": "A",
< "mode": "none"
< },
< "thresholdsStyle": {
< "mode": "area"
< }
< },
< "mappings": [],
< "thresholds": {
< "mode": "absolute",
< "steps": [
< {
< "color": "semi-dark-red",
< "value": null
< },
< {
< "color": "semi-dark-yellow",
< "value": -5000
< },
< {
< "color": "semi-dark-green",
< "value": -3000
< }
< ]
< }
< },
< "overrides": []
< },
< "options": {
< "legend": {
< "calcs": [],
< "displayMode": "list",
< "placement": "bottom",
< "showLegend": true
< },
< "tooltip": {
< "mode": "single",
< "sort": "none"
< }
< },
< "targets": [
< {
< "datasource": {
< "type": "prometheus",
< "uid": "grafanacloud-prom"
< },
< "disableTextWrap": false,
< "editorMode": "builder",
< "expr": "internal_syncGenDiff{environment=\"$network\", node_id=~\"$NodeID\"}",
< "fullMetaSearch": false,
< "includeNullMetadata": true,
< "instant": false,
< "legendFormat": "node {{node_id}}",
< "range": true,
< "refId": "A",
< "useBackend": false
< }
< ],
< "title": "syncGenDiff",
< "type": "timeseries"
< }
< },
< "ef64b104-def4-4374-ba11-df4fb2a8ec2d": {
< "name": "conns Changes",
< "uid": "ef64b104-def4-4374-ba11-df4fb2a8ec2d",
< "kind": 1,
< "model": {
< "datasource": {
< "type": "prometheus",
< "uid": "${DS_GRAFANACLOUD-SWIRLDSLABSPREPRODUCTION-PROM-FOR-LIBRARY-PANEL}"
< },
< "description": "The increase in the number of TLS connections established by a node per second.",
< "fieldConfig": {
< "defaults": {
< "color": {
< "mode": "palette-classic"
< },
< "custom": {
< "axisBorderShow": false,
< "axisCenteredZero": false,
< "axisColorMode": "text",
< "axisLabel": "",
< "axisPlacement": "auto",
< "barAlignment": 0,
< "drawStyle": "line",
< "fillOpacity": 0,
< "gradientMode": "none",
< "hideFrom": {
< "legend": false,
< "tooltip": false,
< "viz": false
< },
< "insertNulls": false,
< "lineInterpolation": "linear",
< "lineWidth": 1,
< "pointSize": 5,
< "scaleDistribution": {
< "type": "linear"
< },
< "showPoints": "auto",
< "spanNulls": false,
< "stacking": {
< "group": "A",
< "mode": "none"
< },
< "thresholdsStyle": {
< "mode": "off"
< }
< },
< "mappings": [],
< "thresholds": {
< "mode": "absolute",
< "steps": [
< {
< "color": "green",
< "value": null
< },
< {
< "color": "red",
< "value": 80
< }
< ]
< }
< },
< "overrides": []
< },
< "libraryPanel": {
< "name": "conns Changes",
< "uid": "ef64b104-def4-4374-ba11-df4fb2a8ec2d"
< },
< "options": {
< "legend": {
< "calcs": [],
< "displayMode": "list",
< "placement": "bottom",
< "showLegend": true
< },
< "tooltip": {
< "mode": "single",
< "sort": "none"
< }
< },
< "targets": [
< {
< "datasource": {
< "type": "prometheus",
< "uid": "grafanacloud-prom"
< },
< "disableTextWrap": false,
< "editorMode": "builder",
< "expr": "increase(platform_conns{environment=\"$network\", type=\"mean\", node_id=~\"$NodeID\"}[$__rate_interval])",
< "fullMetaSearch": false,
< "includeNullMetadata": true,
< "instant": false,
< "legendFormat": "node {{node_id}}",
< "range": true,
< "refId": "A",
< "useBackend": false
< }
< ],
< "title": "conns Changes",
< "type": "timeseries"
< }
< }
< },
< "__requires": [
< {
< "type": "grafana",
< "id": "grafana",
< "name": "Grafana",
< "version": "10.4.0-66955"
< },
< {
< "type": "datasource",
< "id": "prometheus",
< "name": "Prometheus",
< "version": "1.0.0"
< },
< {
< "type": "panel",
< "id": "timeseries",
< "name": "Time series",
< "version": ""
< }
< ],
418c21
< "id": null,
---
> "id": 113,
446,448c49,52
< "uid": "bdcky36wafe9se",
< "name": "Platform Status (new)"
< }
---
> "name": "Platform Status (new)",
> "uid": "bdcky36wafe9se"
> },
> "title": "Platform Status"
459,461c63,66
< "uid": "a5c5c524-a234-4292-85ae-9fb58f7a863a",
< "name": "syncGenDiff"
< }
---
> "name": "syncIndicatorDiff",
> "uid": "a5c5c524-a234-4292-85ae-9fb58f7a863a"
> },
> "title": "syncIndicatorDiff"
466c71
< "uid": "${DS_GRAFANACLOUD-SWIRLDSLABSPREPRODUCTION-PROM}"
---
> "uid": "grafanacloud-prom"
479a85
> "barWidthFactor": 0.6,
546c152
< "uid": "${DS_GRAFANACLOUD-SWIRLDSLABSPREPRODUCTION-PROM}"
---
> "uid": "grafanacloud-prom"
566c172
< "uid": "${DS_GRAFANACLOUD-SWIRLDSLABSPREPRODUCTION-PROM}"
---
> "uid": "grafanacloud-prom"
579a186
> "barWidthFactor": 0.6,
646c253
< "uid": "${DS_GRAFANACLOUD-SWIRLDSLABSPREPRODUCTION-PROM}"
---
> "uid": "grafanacloud-prom"
666c273
< "uid": "${DS_GRAFANACLOUD-SWIRLDSLABSPREPRODUCTION-PROM}"
---
> "uid": "grafanacloud-prom"
679a287
> "barWidthFactor": 0.6,
710c318,319
< "color": "green"
---
> "color": "green",
> "value": null
744c353
< "uid": "${DS_GRAFANACLOUD-SWIRLDSLABSPREPRODUCTION-PROM}"
---
> "uid": "grafanacloud-prom"
760c369
< "uid": "${DS_GRAFANACLOUD-SWIRLDSLABSPREPRODUCTION-PROM}"
---
> "uid": "grafanacloud-prom"
781c390
< "uid": "${DS_GRAFANACLOUD-SWIRLDSLABSPREPRODUCTION-PROM}"
---
> "uid": "grafanacloud-prom"
794a404
> "barWidthFactor": 0.6,
825c435,436
< "color": "green"
---
> "color": "green",
> "value": null
859c470
< "uid": "${DS_GRAFANACLOUD-SWIRLDSLABSPREPRODUCTION-PROM}"
---
> "uid": "grafanacloud-prom"
875c486
< "uid": "${DS_GRAFANACLOUD-SWIRLDSLABSPREPRODUCTION-PROM}"
---
> "uid": "grafanacloud-prom"
894,953d504
< "datasource": {
< "type": "prometheus",
< "uid": "${DS_GRAFANACLOUD-SWIRLDSLABSPREPRODUCTION-PROM}"
< },
< "fieldConfig": {
< "defaults": {
< "color": {
< "mode": "palette-classic"
< },
< "custom": {
< "axisBorderShow": false,
< "axisCenteredZero": false,
< "axisColorMode": "text",
< "axisLabel": "",
< "axisPlacement": "auto",
< "barAlignment": 0,
< "drawStyle": "line",
< "fillOpacity": 0,
< "gradientMode": "none",
< "hideFrom": {
< "legend": false,
< "tooltip": false,
< "viz": false
< },
< "insertNulls": false,
< "lineInterpolation": "linear",
< "lineWidth": 1,
< "pointSize": 5,
< "scaleDistribution": {
< "type": "linear"
< },
< "showPoints": "auto",
< "spanNulls": false,
< "stacking": {
< "group": "A",
< "mode": "none"
< },
< "thresholdsStyle": {
< "mode": "off"
< }
< },
< "mappings": [],
< "thresholds": {
< "mode": "absolute",
< "steps": [
< {
< "color": "green",
< "value": null
< },
< {
< "color": "red",
< "value": 80
< }
< ]
< },
< "unit": "s",
< "unitScale": true
< },
< "overrides": []
< },
960,1093d510
< "id": 15,
< "options": {
< "legend": {
< "calcs": [],
< "displayMode": "list",
< "placement": "bottom",
< "showLegend": true
< },
< "tooltip": {
< "mode": "single",
< "sort": "none"
< }
< },
< "targets": [
< {
< "datasource": {
< "type": "prometheus",
< "uid": "${DS_GRAFANACLOUD-SWIRLDSLABSPREPRODUCTION-PROM}"
< },
< "editorMode": "code",
< "expr": "Reconnect_senderReconnectDurationSeconds{environment=\"$network\"}",
< "instant": false,
< "legendFormat": "{{node_id}}",
< "range": true,
< "refId": "A"
< }
< ],
< "title": "Duration of reconnect as a teacher, seconds",
< "type": "timeseries"
< },
< {
< "datasource": {
< "type": "prometheus",
< "uid": "${DS_GRAFANACLOUD-SWIRLDSLABSPREPRODUCTION-PROM}"
< },
< "fieldConfig": {
< "defaults": {
< "color": {
< "mode": "palette-classic"
< },
< "custom": {
< "axisBorderShow": false,
< "axisCenteredZero": false,
< "axisColorMode": "text",
< "axisLabel": "",
< "axisPlacement": "auto",
< "barAlignment": 0,
< "drawStyle": "line",
< "fillOpacity": 0,
< "gradientMode": "none",
< "hideFrom": {
< "legend": false,
< "tooltip": false,
< "viz": false
< },
< "insertNulls": false,
< "lineInterpolation": "linear",
< "lineWidth": 1,
< "pointSize": 5,
< "scaleDistribution": {
< "type": "linear"
< },
< "showPoints": "auto",
< "spanNulls": false,
< "stacking": {
< "group": "A",
< "mode": "none"
< },
< "thresholdsStyle": {
< "mode": "off"
< }
< },
< "mappings": [],
< "thresholds": {
< "mode": "absolute",
< "steps": [
< {
< "color": "green",
< "value": null
< },
< {
< "color": "red",
< "value": 80
< }
< ]
< },
< "unit": "s",
< "unitScale": true
< },
< "overrides": []
< },
< "gridPos": {
< "h": 8,
< "w": 24,
< "x": 0,
< "y": 65
< },
< "id": 16,
< "options": {
< "legend": {
< "calcs": [],
< "displayMode": "list",
< "placement": "bottom",
< "showLegend": true
< },
< "tooltip": {
< "mode": "single",
< "sort": "none"
< }
< },
< "targets": [
< {
< "datasource": {
< "type": "prometheus",
< "uid": "${DS_GRAFANACLOUD-SWIRLDSLABSPREPRODUCTION-PROM}"
< },
< "editorMode": "code",
< "expr": "Reconnect_receiverReconnectDurationSeconds{environment=\"$network\"}",
< "instant": false,
< "legendFormat": "{{node_id}}",
< "range": true,
< "refId": "A"
< }
< ],
< "title": "Duration of reconnect as a learner, seconds",
< "type": "timeseries"
< },
< {
< "gridPos": {
< "h": 8,
< "w": 24,
< "x": 0,
< "y": 73
< },
1096,1098c513,516
< "uid": "ef64b104-def4-4374-ba11-df4fb2a8ec2d",
< "name": "conns Changes"
< }
---
> "name": "conns Changes",
> "uid": "ef64b104-def4-4374-ba11-df4fb2a8ec2d"
> },
> "title": "conns Changes"
1108c526,530
< "current": {},
---
> "current": {
> "selected": false,
> "text": "mainnet",
> "value": "mainnet"
> },
1111c533
< "uid": "${DS_GRAFANACLOUD-SWIRLDSLABSPREPRODUCTION-PROM}"
---
> "uid": "grafanacloud-prom"
1133c555,563
< "current": {},
---
> "current": {
> "selected": false,
> "text": [
> "All"
> ],
> "value": [
> "$__all"
> ]
> },
1136c566
< "uid": "${DS_GRAFANACLOUD-SWIRLDSLABSPREPRODUCTION-PROM}"
---
> "uid": "grafanacloud-prom"
1166c596
< "version": 21,
---
> "version": 9,
Note that the preproduction dashboard also has a diff of a similar size. So neither the prod nor the pre-prod dashboard matches the definition in the repository.
While I'm trying to figure out how to proceed, I managed to find all the new metrics that we're now emitting in pre-prod:
Apparently, I was looking at a simple JSON definition of the dashboard. However, Grafana offers an "Export" option, and when I use that I get a full JSON definition, including those `__inputs` and `__elements` objects, and the diff compared to the file in the repository now boils down to just a few lines which all make sense.