medic / cht-watchdog

Configuration for deploying a monitoring/alerting stack for CHT

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

Treat monitoring API response of `-1` as an error

mrjones-plip opened this issue · comments

Right now the API will return -1 for a bunch of metrics when errors are thrown by the CouchDB queries that retrieve the required values. However, when this happened on a production instance, `date.uptime` and `date.current` were still incrementing correctly, so Watchdog didn't think anything was wrong.

Instead, we should fire an alert to indicate that something is wrong.

In the outage itself, the monitoring API returned this:

{
  "version": {
    "app": "",
    "node": "v16.20.0",
    "couchdb": ""
  },
  "couchdb": {
    "medic": {
      "name": "",
      "update_sequence": -1,
      "doc_count": -1,
      "doc_del_count": -1,
      "fragmentation": -1
    },
    "sentinel": {
      "name": "",
      "update_sequence": -1,
      "doc_count": -1,
      "doc_del_count": -1,
      "fragmentation": -1
    },
    "usersmeta": {
      "name": "",
      "update_sequence": -1,
      "doc_count": -1,
      "doc_del_count": -1,
      "fragmentation": -1
    },
    "users": {
      "name": "",
      "update_sequence": -1,
      "doc_count": -1,
      "doc_del_count": -1,
      "fragmentation": -1
    }
  },
  "date": {
    "current": 1698210046488,
    "uptime": 967259.799207221
  },
  "sentinel": {
    "backlog": -1
  },
  "messaging": {
    "outgoing": {
      "total": {
        "due": -1,
        "scheduled": -1,
        "muted": -1,
        "failed": -1,
        "delivered": -1
      },
      "seven_days": {
        "due": -1,
        "scheduled": -1,
        "muted": -1,
        "failed": -1,
        "delivered": -1
      },
      "last_hundred": {
        "pending": {
          "pending": -1,
          "forwarded-to-gateway": -1,
          "received-by-gateway": -1,
          "forwarded-by-gateway": -1
        },
        "final": {
          "sent": -1,
          "delivered": -1,
          "failed": -1
        },
        "muted": {
          "denied": -1,
          "cleared": -1,
          "muted": -1,
          "duplicate": -1
        }
      }
    }
  },
  "outbound_push": {
    "backlog": -1
  },
  "feedback": {
    "count": -1
  },
  "conflict": {
    "count": -1
  },
  "replication_limit": {
    "count": -1
  },
  "connected_users": {
    "count": -1
  }
}

which in turn looked like a value of 0 for everything instead of -1:

image