stream
|from()
|eval(lambda: if(((weekday("time") >= 1 AND weekday("time") <= 5) AND (hour("time") >= 8 AND hour("time") <= 17)), 'true', 'false'))
.as('business_hours')
.tags('business_hours')
.keep()
|log()
|influxDBOut()
.database('telegraf')
.retentionPolicy('autogen')
.tag('kapacitor_augmented', 'true')
alert.tick
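// Window 5s of cpu data from the stream and POST each batch to a local HTTP endpoint.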
var data = stream
|from()
.measurement('cpu')
|window()
.period(5s)
.every(5s)
|httpPost('http://localhost:6060')
.header('example','b')
any_influxdb_write_errors.tick
// SELECT writeError FROM telegraf..influxdb_write WHERE tmpltime() group by host
// This assumes that you're using Telegraf and have the InfluxDB plugin enabled
// https://github.com/influxdata/telegraf/tree/master/plugins/inputs/influxdb
batch
|query('SELECT writeError, writeOk, writeTimeout FROM "telegraf"."default".influxdb_write')
.period(10s)
.every(10s)
.groupBy('host')
|derivative('writeError')
.nonNegative()
|alert()
.crit(lambda: "derivative" > 2)
.details('')
.log('/dev/stdout')
b.tick
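// Query the last 10s of cpu data every 10s and log the resulting batches.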
var data = batch
|query('select * from "telegraf".autogen.cpu')
.every(10s)
.period(10s)
|log()
batch.tick
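// Query the last minute of the _internal write statistics every 5s, aligned to the interval, and log the results.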
batch
|query('SELECT * from "_internal"."monitor"."write"')
.period(1m)
.every(5s)
.align()
|log()
c.tick
// Query the last year's worth of data, but include a `mean` call so that the timestamp
// returned will be the lower boundary of the query's time range. Without it,
// the timestamp would be variable.
var last_year = batch
|query('SELECT last("my_field") as my_field, mean("my_field") FROM "mydb"."myrp"."mymeasurement"')
.period(56w)
.every(10m)
// Shift the data up to the current time so that we can join on that timestamp
|shift(56w)
// Query the average over the last 10m
var mean = batch
|query('SELECT mean("my_field") as my_field FROM "mydb"."myrp"."mymeasurement"')
.period(10m)
.every(10m)
// Shift the data up to the current time so that we can join on that timestamp
|shift(10m)
// Join the last year data with the current average
last_year
|join(mean)
.as('last_year','mean')
.tolerance(5m)
|alert()
.crit(lambda: "last_year.my_field" > 0 AND "mean.my_field" < 10)
// SELECT used_percent FROM telegraf..disk WHERE tmpltime() group by host
// This TICKscript assumes that you're using Telegraf's standard system plugin
batch
|query('SELECT used_percent FROM "telegraf"."default".disk')
.period(10s)
.every(10s)
.groupBy('host')
|alert()
.crit(lambda: "used_percent" > 80)
.details('')
.stateChangesOnly()
.log('/dev/stdout')
high_host_mem.tick
// This TICKscript assumes that you're using Telegraf's standard system plugin
batch
|query('SELECT used_percent FROM "telegraf"."default".mem')
.period(10s)
.every(10s)
.groupBy('host')
|alert()
.crit(lambda: "used_percent" > 80)
.stateChangesOnly()
.details('')
.log('/dev/stdout')
.victorOps()
.routingKey('awstest')
idk.tick
// See what the count is when Kapacitor counts the raw data
batch
|query('SELECT * FROM "demonstration"."telemetry_retention"."telemetry"')
.period(3d)
.cron('0 55 13 * * * *')
|count('WK_Zen')
.as('a_WKZen')
|log()
// See what the count is when the count query is issued directly to InfluxDB
batch
|query('SELECT count(*) AS b FROM "demonstration"."telemetry_retention"."telemetry"')
.period(3d)
.cron('0 55 13 * * * *')
|log()
// The old query
batch
|query('SELECT * FROM "demonstration"."telemetry_retention"."telemetry"')
.period(3d)
.cron('0 55 13 * * * *')
.groupBy(*)
|shift(24h)
|influxDBOut()
.database('demowork')
.retentionPolicy('telemetry_retention')
.measurement('telemetry')
.precision('ms')
idk_1.tick
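// Count HTTP responses with a critical (>= 500) status code per minute and page via PagerDuty when the rate crosses the threshold.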
var warn = 401
var crit = 500
var threshold = 2
stream
|from()
.measurement('http_response')
.where(lambda: "server" == 'xxx')
.groupBy('http_response_code', 'server', 'host')
|where(lambda: "http_response_code" >= crit)
|stats(1m)
|derivative('emitted')
.unit(1m)
.nonNegative()
.as('emitted')
|alert()
.crit(lambda: "emitted" >= threshold)
.warn(lambda: "http_response_code" > warn AND "http_response_code" != 408)
.id('{{ index .Tags "host" }}')
.message('{{ .Level }}: HTTP Response is {{ index .Fields "http_response_code" }} for HEALTHCHECK {{ index .Tags "server"}}')
.pagerDuty()
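// Compute a per-minute success rate and response-time statistics (mean, p50, p95, p99) for http_response and write them to the telegraf_derived database.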
var points = 12.0
var period = 1m
var every = 1m
var data = stream
|from()
.measurement('http_response')
.groupBy(*)
|eval()
.keep('response_time')
|window()
.period(period)
.every(every)
.align()
var success_rate = data
|where(lambda: "http_response_code" < 302 AND "response_string_match" == 1)
|count('response_time')
.as('count')
|eval(lambda: (float("count") / points) * 100.0)
.as('rate')
var mean_data = data
|mean('response_time')
.as('response_time')
|eval(lambda: "response_time" * 1000.0)
.as('response_time')
var p50_data = data
|percentile('response_time', 50.0)
.as('response_time')
|eval(lambda: "response_time" * 1000.0)
.as('response_time')
var p95_data = data
|percentile('response_time', 95.0)
.as('response_time')
|eval(lambda: "response_time" * 1000.0)
.as('response_time')
var p99_data = data
|percentile('response_time', 99.0)
.as('response_time')
|eval(lambda: "response_time" * 1000.0)
.as('response_time')
success_rate
|join(mean_data, p50_data, p95_data, p99_data)
.as('success', 'mean', 'p50', 'p95', 'p99')
|influxDBOut()
.create()
.database('telegraf_derived')
.retentionPolicy('autogen')
.measurement('http_response_rates')
kap_stream.tick
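// Compute the percentage of 'ok' points per 10s window, grouped by hname, and alert when it is greater than zero.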
var period = 10s
var every = 10s
var data = stream
|from()
.measurement('m')
.groupBy('hname')
|window()
.period(period)
.every(every)
.fillPeriod()
var stata = data
|where(lambda: "sts" == 'ok')
|sum('count')
.as('value')
var statb = data
|sum('count')
.as('value')
stata
|join(statb)
.as('stata', 'statb')
|eval(lambda: (float("stata.value") / float("statb.value")) * 100.0)
.as('stat')
|where(lambda: "stat" > 0.0)
|log()
|alert()
// .id('{{ index .Tags "host"}}/rmqh_d-fail-ratio')
.id('{{ index .Tags "hname"}}')
.message('xxx')
.crit(lambda: "stat" > 0.0)
.stateChangesOnly()
// .flapping(0.25, 0.5)
.log('xxx')
l.tick
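// Compare the measurement count in _internal against the same minute 24 hours earlier and POST an alert when it has increased.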
var message = 'batch alerting {{.Time}} {{ index .Fields "value" }}'
var current = batch
|query('select mean(nummeasurements) as value from "_internal"."monitor"."database"')
.period(1m)
.every(1m)
.align()
var past = batch
|query('select mean(nummeasurements) as value from "_internal"."monitor"."database"')
.period(1m)
.every(1m)
.align()
.offset(24h)
|shift(24h)
var trigger = past
|join(current)
.as('past', 'current')
|eval(lambda: float("current.value" - "past.value"))
.keep()
.as('value')
|alert()
.crit(lambda: "value" > 0)
.stateChangesOnly()
.message(message)
.post('http://xxxxx')
local_write_failures.tick
// Fire an alert when write errors occur (grouped by database)
var period = 10s
var every = 10s
var delta = 1s
stream
|from()
.database('_internal')
.measurement('shard')
.groupBy('database')
|window()
.period(period)
.every(every)
|max('writePointsFail')
.as('max')
|derivative('max')
.unit(delta)
.nonNegative()
.as('derivative')
|alert()
.id('write_failures/{{ index .Tags "database" }}')
.message('There have been write failures on database: {{ index .Tags "database" }}')
.details('')
.warn(lambda: "derivative" >= 1)
.stateChangesOnly()
.log('/var/log/kapacitor/write_failures.log')
missing_isPresent.tick
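// Examples of isPresent(): flag points that are missing a field, and guard derivatives so they only run when the field exists.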
var data = stream
|from()
.measurement('cpu')
|eval(lambda: isPresent("usage_funny"))
.as('missing')
var data = stream
|from()
.measurement('mysql')
.retentionPolicy('default')
.database('performance')
.groupBy(*)
|where(lambda: "host" == 'xxx')
var c1 = data
|where(lambda: isPresent("commands_select"))
|derivative('commands_select')
.unit(1s)
.nonNegative()
.as('div1')
var c2 = data
|where(lambda: isPresent("commands_insert"))
|derivative('commands_insert')
.unit(1s)
.nonNegative()
.as('div2')
c1
|join(c2)
.as('c1', 'c2')
|eval()
.keep('c1.div1', 'c2.div2')
|httpOut('out')
old_example.tick
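// Minimal stream task: log raw cpu points.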
var data = stream
|from()
.measurement('cpu')
|log()
other.tick
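// Compute x as a percentage of y for each point and log the result.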
var data = stream
|from()
.measurement('example')
|eval(lambda: (float("x")/float("y")) * 100)
.as('result')
|log()
other_d.tick
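// Relative alert on temp_f (Chronograf-style variables): join the windowed max and min, alert when the change exceeds crit, and write the results to several outputs.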
var db = 'iotdata'
var rp = 'autogen'
var measurement = 'influxdata_sensors'
var groupBy = [*]
var whereFilter = lambda: ("location" == 'TravelingWilbury')
var name = 'relative-temp'
var idVar = name + ':{{.Group}}'
var message = ' {{ index .Fields "value" }}'
var idTag = 'alertID'
var levelTag = 'level'
var messageField = 'message'
var durationField = 'duration'
var outputDB = 'chronograf'
var outputRP = 'autogen'
var outputMeasurement = 'alerts'
var triggerType = 'relative'
var shift = 5s
var crit = 0.5
var data = stream
|from()
.database(db)
.retentionPolicy(rp)
.measurement(measurement)
.groupBy(groupBy)
.where(whereFilter)
|eval(lambda: "temp_f")
.as('value')
|window()
.period(3s)
.every(1s)
.align()
var max = data|max('value').as('value')
var min = data|min('value').as('value')
var trigger = max
|join(min)
.as('max', 'min')
.tolerance(100ms)
|eval(lambda: float("max.value" - "min.value"))
.keep()
.as('chng')
|log()
.level('ERROR')
trigger
|influxDBOut()
.create()
.database('derived_iotdata')
.measurement('influxdata_sensors')
trigger
|alert()
.crit(lambda: abs("chng") > crit)
.stateChangesOnly()
.message(message)
.id(idVar)
.idTag(idTag)
.levelTag(levelTag)
.messageField(messageField)
.durationField(durationField)
.post('http://davidgs.com:1880/sensor-reading')
trigger
|influxDBOut()
.create()
.database(outputDB)
.retentionPolicy(outputRP)
.measurement(outputMeasurement)
.tag('alertName', name)
.tag('triggerType', triggerType)
trigger
|httpOut('output')
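// Compute the mean and per-interval derivative of an event count and write the joined result to an analytics database.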
var measurement = 'event_trend2'
var period = 1m
var every = 20s
var data = batch
|query('SELECT count(event_id) as alert_count FROM "a"."autogen".State')
.period(period)
.every(every)
.groupBy(time(every), *)
.fill(0)
var mean_data = data
|mean('alert_count')
.as('mean')
var deriv_data = data
|derivative('alert_count')
.unit(every)
.nonNegative()
.as('derivative')
|last('derivative')
.as('derivative')
mean_data
|join(deriv_data)
.as('mean', 'deriv')
|influxDBOut()
.database('_a_analytics')
.measurement(measurement)
mean_data
|httpOut('result')
record_series_count.tick
// Fire an alert when series count exceeds threshold
// Also rewrite downsampled points back to _internal database
var period = 10s
var every = 10s
var delta = 1s
var warn_threshold = 800000
var data = stream
|from()
.database('_internal')
.measurement('database')
.groupBy('hostname')
|window()
.period(period)
.every(every)
|default()
.field('numSeries', 0)
|sum('numSeries')
.as('totalNumSeries')
data
|alert()
.id('high_series_count/{{ index .Tags "hostname" }}')
.message('High series count for host: {{ index .Tags "hostname" }}')
.details('')
.warn(lambda: "totalNumSeries" > warn_threshold)
.stateChangesOnly()
.log('/var/log/kapacitor/high_series_count.log')
data
|influxDBOut()
.database('_internal')
.retentionPolicy('monitor')
.measurement('database')
.tag('source', 'kapacitor')
record_throughput.tick
// Fire an alert when write throughput exceeds threshold
// Also rewrite downsampled points back to _internal database
var period = 10s
var every = 10s
var delta = 1s
var warn_threshold = 500000
var data = stream
|from()
.database('_internal')
.measurement('write')
|window()
.period(period)
.every(every)
|default()
.field('pointReq', 0)
|max('pointReq')
.as('max')
|derivative('max')
.unit(delta)
.nonNegative()
.as('pointsPerSecond')
data
|alert()
.id('high_throughput/{{ index .Tags "hostname" }}')
.message('High write throughput on host: {{ index .Tags "hostname" }}')
.details('')
.warn(lambda: "pointsPerSecond" > warn_threshold)
.stateChangesOnly()
.log('/var/log/kapacitor/high_throughput.log')
data
|influxDBOut()
.database('_internal')
.retentionPolicy('monitor')
.measurement('write')
.tag('source', 'kapacitor')
rename.tick
///////////////////////////////
// writes are failing
var data = stream
|from()
.measurement('shard')
|groupBy('database')
|window()
.period(10s)
.every(10s)
var writeOk = data
|max('writePointsOk')
.as('maxWritePointsOk')
|derivative('maxWritePointsOk')
.unit(1s)
.nonNegative()
.as('derivativeMaxWritePointsOk')
var writeFail = data
|max('writePointsFail')
.as('maxWritePointsFail')
|derivative('maxWritePointsFail')
.unit(1s)
.nonNegative()
.as('derivativeMaxWritePointsFail')
writeOk
|join(writeFail)
.as('ok', 'fail')
.tolerance(1s)
.fill(0.0)
.streamName('write_ok_fail')
|eval(lambda: "fail.derivativeMaxWritePointsFail" / ("ok.derivativeMaxWritePointsOk" + "fail.derivativeMaxWritePointsFail"))
.as('percentFailedWrites')
|alert()
.id('kapacitor/{{ index .Tags "datbase" }}') // ??
.message('{{ .ID }} is {{ .Level }} value:{{index .Fields "value" }}') // ?
.info(lambda: "derivativeMeanWritePointOk" > 10) // ??
.warn(lambda: "derivativeMeanWritePointOk" > 30) // ??
.crit(lambda: "derivativeMeanWritePointOk" > 90) // ??
.slack() // ??
.channel('#alerts')
|influxDBOut()
.database('internal_stats') // ??
.retentionPolicy('myrp') // ??
.measurement('percent_failed_disk_writes')
.tag('kapacitor', 'true')
/////////////////////////////////////////////////////////
// you currently have high throughput
stream
|from()
.measurement('write')
|max('pointReq')
.as('maxPointReq')
|derivative('maxPointReq')
.unit(1s)
.nonNegative()
.as('pointsPerSecond')
|alert()
.message('idk?')
.info(lambda: "pointsPerSecond" > 150000)
.warn(lambda: "pointsPerSecond" > 300000)
.crit(lambda: "pointsPerSecond" > 500000)
|influxDBOut()
.database('internal_stats') // ??
.retentionPolicy('myrp') // ??
.measurement('throughput')
.tag('kapacitor', 'true')
////////////////////////////////////////////////////////////
///// too many series
stream
|from()
.measurement('database')
|sum('numSeries')
.as('totalNumSeries')
|alert()
.message('you have this many series')
.warn(lambda: "totalNumSeries" > 1000000)
.crit(lambda: "totalNumSeries" > 10000000)
|influxDBOut()
.database('internal_stats') // ??
.retentionPolicy('myrp') // ??
.measurement('series')
.tag('kapacitor', 'true')
rollup.tick
// DEPLOYAS:stream
// DBRP:telegraf.default
// Which measurement to consume
var measurement = 'cpu'
// Which database to consume
var db = 'telegraf'
// Which retention policy to consume
var rp = 'autogen'
// Which field to process
var field = 'usage_user'
var data = stream
|from()
.database(db)
.retentionPolicy(rp)
.measurement(measurement)
.groupBy(*)
// 1m rollup
data
|window()
.period(1m)
.every(1m)
|mean(field)
.as(field)
|influxDBOut()
.database('telegraf')
.retentionPolicy('autogen')
.measurement(measurement)
.tag('resolution', '1m')
.create()
// 5m rollup
data
|window()
.period(5m)
.every(5m)
|mean(field)
.as(field)
|influxDBOut()
.database('telegraf')
.retentionPolicy('autogen')
.measurement(measurement)
.tag('resolution', '5m')
.create()
s.tick
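// Downsample the non-negative derivative of the net measurement every 20s into the chronograf database.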
var db = 'telegraf'
var rp = 'rp_2w'
var measurement = 'net'
var outputDB = 'chronograf'
var outputRP = 'rp_2w'
var outputMeasurement = 'nnd_net'
batch
|query('select non_negative_derivative(*) as nnd from telegraf.rp_2w.net')
.period(20s)
.every(20s)
.groupBy(*)
|influxDBOut()
.database(outputDB)
.retentionPolicy(outputRP)
.measurement(outputMeasurement)
.precision('s')
shift.tick
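// Copy cpu data into a different retention policy, shifting timestamps forward by 10m.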
dbrp "telegraf"."autogen"
batch
|query('SELECT * FROM "telegraf"."autogen"."cpu" GROUP BY *')
.period(1m)
.every(1m)
.groupBy(*)
|shift(10m)
|influxDBOut()
.database('telegraf')
.retentionPolicy('newrp2')
// This TICKscript assumes that you have Telegraf running and the Kapacitor Slack alert handler enabled
var period = 120s
var every = 60s
var sys_data = stream
|from()
.database('telegraf')
.measurement('system')
.groupBy('host')
|window()
.period(period)
.every(every)
sys_data|deadman(1.0, period)
.id('deadman/{{ index .Tags "host" }}')
.message('Node {{ index .Tags "host" }} {{ if ne .Level "OK" }}has not responded in 120s!{{ else }}is back online.{{ end }}')
.stateChangesOnly()
.slack()
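// Time arithmetic in a lambda: subtract the usage_time field from the point's timestamp (in nanoseconds) and log the result.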
var secondsPerMinute = 60
var secondsPerHour = 60 * 60
var secondsPerDay = 24 * secondsPerHour
var secondsPerWeek = 7 * secondsPerDay
stream
|from()
.measurement('cpu')
|eval(lambda: unixNano("time") - "usage_time")
.as('t2')
|log()
week_on_week.tick
// Query the current minute
var this_minute = batch
|query('''
SELECT count("usage_user") FROM "telegraf"."autogen"."cpu"
''')
.period(1m)
.every(1m)
var last_minute = batch
|query('''
SELECT count("usage_user") FROM "telegraf"."autogen"."cpu"
''')
.period(1m)
.every(1m)
// This is what gives us the previous minute of data
.offset(24h)
// we need to shift the previous minute forward
|shift(1m)
this_minute
|join(last_minute)
.as('this', 'last')
|log()
working-issue1192.tick
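// Workaround for a missing group-by tag in batch queries (issue 1192): fill it with default() so the alert ID template still renders.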
batch
|query('select mean(used_percent) from "telegraf"."autogen"."mem"')
.groupBy('tag1','missing-tag')
.period(30s)
.every(30s)
// By using last here, the edge type switches from batch to stream; the bug only exists in the batch handling, so this is a hack workaround.
// Using last doesn't change the result, since the batch query only returns a single aggregated point anyway.
|last('mean')
.as('mean')
|eval(lambda: trunc("mean"))
.as('mean')
|default()
.tag('missing-tag', 'unknown')
|log()
|alert()
.id('kapacitor/{{ index .Tags "missing-tag" }}/...')