mirror of
https://github.com/openimsdk/open-im-server.git
synced 2025-04-05 20:11:14 +08:00
* fix:fix error values&logs * modify: add logs * feature:add redis io retry logic * feature:add redis error alert rule * test:for test alert * fix:fix prometheus rules * del:del test code --------- Co-authored-by: lin.huang <lin.huang@apulis.com>
23 lines
968 B
YAML
23 lines
968 B
YAML
groups:
|
|
- name: instance_down
|
|
rules:
|
|
- alert: InstanceDown
|
|
expr: up == 0
|
|
for: 1m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Instance {{ $labels.instance }} down"
|
|
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minutes."
|
|
|
|
- name: database_insert_failure_alerts
|
|
rules:
|
|
- alert: DatabaseInsertFailed
|
|
expr: (increase(msg_insert_redis_failed_total[5m]) > 0) or (increase(msg_insert_mongo_failed_total[5m]) > 0)
|
|
for: 1m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Increase in MsgInsertRedisFailedCounter or MsgInsertMongoFailedCounter detected"
|
|
description: "Either MsgInsertRedisFailedCounter or MsgInsertMongoFailedCounter has increased in the last 5 minutes, indicating failures in message insert operations to Redis or MongoDB,maybe the redis or mongodb is crash."
|