### node_exporter_rules.yml
```yaml
# Prometheus alerting rules for Linux hosts scraped through node_exporter.
# Each rule attaches a `severity` label and, where the expression yields a
# meaningful number, a `value` annotation carrying the evaluated result.
groups:
  - name: HOST
    rules:
      # Any scrape target, whatever its job, that reports up == 0.
      - alert: Node實例已宕機
        expr: up == 0
        for: 10s
        labels:
          user: root
          severity: Warning
        annotations:
          summary: "Instance {{ $labels.instance }} Down"
          description: "xxx系統 {{ $labels.instance }} of job {{ $labels.job }} has been Down."

      # Targets of the `federate` job that have been down for 10 minutes.
      - alert: MasterDown
        expr: up{job='federate'} == 0
        for: 10m
        labels:
          severity: info
        annotations:
          summary: "Master 主機服務異常"
          description: "xxx系統{{ $labels.instance }} Master 9090 端口服務異常"

      # `node` job target down while its uptime exceeds 30 minutes.
      # NOTE(review): when up == 0 the scrape has failed, so the uptime series
      # from that same target are presumably stale and the `and` may never
      # match; confirm this rule can actually fire in the deployed topology.
      - alert: InstanceDown
        expr: (up{job='node'} == 0) and ((node_time_seconds - node_boot_time_seconds) > 1800)
        for: 5m
        labels:
          severity: info
        annotations:
          summary: "監控數據獲取異常"
          description: "xxx系統{{ $labels.instance }} 主機可能宕機,所在節點 Master 私網IP {{ $labels.master_private_ip }} "

      # Uptime below 10 minutes; no `for`, so it fires on the first evaluation.
      - alert: InstanceReboot
        expr: (node_time_seconds - node_boot_time_seconds) < 600
        labels:
          severity: info
        annotations:
          summary: "重新啟動"
          description: "xxx系統{{ $labels.instance }} 重新啟動"
          value: "{{ $value }}"

      # CPU utilisation tiers (100 - idle %). `rate` replaces `irate` so that a
      # single short dip between two samples does not reset the `for` timer.
      - alert: CPU
        expr: round(100 - avg by (instance, job) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100, 0.01) > 80
        for: 7m
        labels:
          severity: Warn
        annotations:
          summary: "CPU 使用率高"
          description: "xxx系統{{ $labels.instance }} CPU 使用率 {{ $value }}%"
          value: "{{ $value }}"

      - alert: CPU
        expr: round(100 - avg by (instance, job) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100, 0.01) > 96
        for: 7m
        labels:
          severity: Error
        annotations:
          summary: "CPU 使用率很高"
          description: "xxx系統{{ $labels.instance }} CPU 使用率 {{ $value }}%"
          value: "{{ $value }}"

      # 5-minute load average divided by the number of CPU series per instance
      # (one `mode="system"` series exists per core).
      - alert: LOAD
        expr: node_load5 / on (instance) count by (instance) (node_cpu_seconds_total{mode="system"}) > 7
        for: 7m
        labels:
          severity: critical
        annotations:
          summary: "overload"
          description: "xxx系統{{ $labels.instance }} 負載/CPU核數比 {{ $value }}"
          value: "{{ $value }}"

      # Memory usage tiers. MemAvailable is preferred; the `or` branch falls
      # back to Free + Buffers + Cached where MemAvailable is not exported.
      # The 80% tier uses `Warn` (mirroring the CPU tiers): with both tiers at
      # `critical` the two rules emitted identical label sets above 90%.
      - alert: MEM
        expr: >-
          round((1 - ((node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)
          or ((node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)
          / node_memory_MemTotal_bytes))) * 100, 0.01) > 80
        for: 10m
        labels:
          severity: Warn
        annotations:
          summary: "主機內存使用率高"
          description: "xxx系統{{ $labels.instance }} MEM 使用占比 {{ $value }}%"
          value: "{{ $value }}"

      - alert: MEM
        expr: >-
          round((1 - ((node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)
          or ((node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)
          / node_memory_MemTotal_bytes))) * 100, 0.01) > 90
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "主機內存不足"
          description: "xxx系統{{ $labels.instance }} MEM 使用占比 {{ $value }}%"
          value: "{{ $value }}"

      # Filesystem more than 90% used AND less than 10 GiB (1073741824 bytes
      # per GiB) available. /boot and non ext*/ocfs*/xfs filesystems are skipped.
      - alert: DISK
        expr: >-
          round((100 - (node_filesystem_avail_bytes{mountpoint!='/boot',fstype=~'ext.+|ocfs.+|xfs'}
          / node_filesystem_size_bytes{mountpoint!='/boot',fstype=~'ext.+|ocfs.+|xfs'}) * 100 > 90
          and node_filesystem_avail_bytes{mountpoint!='/boot',fstype=~'ext.+|ocfs.+|xfs'} / 1073741824 < 10), 0.01)
        for: 28m
        labels:
          severity: info
        annotations:
          summary: "存儲空間不足"
          description: "xxx系統{{ $labels.instance }} {{ $labels.mountpoint }} 存儲空間使用占比 {{ $value }}%"
          value: "{{ $value }}"

      # Share of CPU time spent in iowait, averaged per instance.
      - alert: IOWAIT
        expr: round(avg by (instance, job) (rate(node_cpu_seconds_total{mode="iowait"}[3m])) * 100, 0.01) > 80
        for: 7m
        labels:
          severity: info
        annotations:
          summary: "CPU IOWAIT 過高"
          description: "xxx系統{{ $labels.instance }} CPU IOWAIT {{ $value }}%"
          value: "{{ $value }}"

      # Average disk busy time per instance. The previous form alerted on
      # `100 - busy% < 60`, i.e. the same threshold, but then reported the
      # non-busy percentage as "I/O time". It also referenced a `mountpoint`
      # label that the aggregation by (instance, job) removes.
      - alert: IO
        expr: round(avg by (instance, job) (rate(node_disk_io_time_seconds_total[3m])) * 100, 0.01) > 40
        for: 7m
        labels:
          severity: info
        annotations:
          summary: "磁盤 I/O 性能低"
          description: "xxx系統{{ $labels.instance }} 磁盤 I/O 時間占比 {{ $value }}%"
          value: "{{ $value }}"

      # Open file descriptors above 80% of the process limit; the reported
      # value is a 0-1 ratio, not a percentage.
      - alert: ProcessNearFDLimits
        expr: process_open_fds / process_max_fds > 0.8
        for: 3m
        labels:
          severity: critical
        annotations:
          summary: "A process hits 80% of the limit"
          description: "xxx系統{{ $labels.instance }} 進程使用的文件描述符數占比 {{ $value }}"
          value: "{{ $value }}"

      # More than 20000 currently established TCP connections.
      - alert: TCP_ESTAB
        expr: node_netstat_Tcp_CurrEstab > 20000
        for: 3m
        labels:
          severity: info
        annotations:
          summary: "TCP 會話數很多"
          description: "xxx系統{{ $labels.instance }} TCP 會話數為 {{ $value }}"
          value: "{{ $value }}"
```
### windows_exporter_rules.yml
```yaml
# Prometheus alerting rules for Windows hosts scraped through windows_exporter.
# Usage rules come in tiers that share an alert name and differ by threshold
# and `severity` label.
groups:
  - name: WINDOWS_EXPORTER
    rules:
      # A service whose status="ok" series is not 1.
      - alert: WindowsServerServiceStatus
        expr: windows_service_status{status="ok"} != 1
        for: 1m
        labels:
          severity: Error
        annotations:
          summary: "Windows Server service Status "
          description: "xxx系統{{ $labels.instance }}Windows服務狀態不正常"

      # CPU utilisation (100 - idle %), averaged per instance. `for: 0m` means
      # the alert fires on the first evaluation that crosses the threshold.
      - alert: WindowsServerCpuUsage
        expr: 100 - (avg by (instance) (rate(windows_cpu_time_total{mode="idle"}[2m])) * 100) > 80
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: "Windows Server CPU Usage"
          description: "xxx系統{{ $labels.instance }}CPU使用率超過80%"
          value: "{{ $value }}"

      - alert: WindowsServerCpuUsage
        expr: 100 - (avg by (instance) (rate(windows_cpu_time_total{mode="idle"}[2m])) * 100) > 90
        for: 0m
        labels:
          severity: Error
        annotations:
          summary: "Windows Server CPU Usage"
          description: "xxx系統{{ $labels.instance }}CPU使用率超過90%"
          value: "{{ $value }}"

      # Physical memory usage: 100 - free / total * 100.
      - alert: WindowsServerMemoryUsage
        expr: 100 - ((windows_os_physical_memory_free_bytes / windows_cs_physical_memory_bytes) * 100) > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Windows Server memory Usage"
          description: "xxx系統{{ $labels.instance }}內存使用率超過80%"
          value: "{{ $value }}"

      - alert: WindowsServerMemoryUsage
        expr: 100 - ((windows_os_physical_memory_free_bytes / windows_cs_physical_memory_bytes) * 100) > 90
        for: 2m
        labels:
          severity: Error
        annotations:
          summary: "Windows Server memory Usage"
          description: "xxx系統{{ $labels.instance }}內存使用率超過90%"
          value: "{{ $value }}"

      # Logical disk usage. The ratio is taken directly on bytes: dividing
      # both operands by 1024 / 1024 cancelled out and did not change the result.
      - alert: WindowsServerDiskSpaceUsage
        expr: 100 - 100 * (windows_logical_disk_free_bytes / windows_logical_disk_size_bytes) > 80
        for: 2m
        labels:
          severity: Error
        annotations:
          summary: "Windows Server disk Space Usage"
          description: "xxx系統{{ $labels.instance }}磁盤使用率超過80%"
          value: "{{ $value }}"
```