在之前的博客中,已经说明了报警的一般步骤和前置条件。
1 编写Prometheus配置
配置信息:
# my global config
global:
  scrape_interval:     15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).
# Alertmanager configuration
alerting:
  alertmanagers:
  - static_configs:
    - targets:
       - 192.168.156.135:9093
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"
  - "/opt/prometheus/prometheus-2.6.1.linux-amd64/rules/*.rules"
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
    - targets: ['localhost:9090']
  - job_name: 'agent1'
    static_configs: 
    - targets: ['192.168.156.135:9100']
  - job_name: pushgateway
    honor_labels: true
    static_configs:
      - targets: ['192.168.156.135:9091']
        labels:
          instance: pushgateway
实际操作:
[root@localhost prometheus-2.6.1.linux-amd64]# vim prometheus.yml 
# my global config
global:
  scrape_interval:     15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).
# Alertmanager configuration
alerting:
  alertmanagers:
  - static_configs:
    - targets:
       - 192.168.156.135:9093
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"
  - "/opt/prometheus/prometheus-2.6.1.linux-amd64/rules/*.rules"
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
    - targets: ['localhost:9090']
  - job_name: 'agent1'
    static_configs: 
    - targets: ['192.168.156.135:9100']
  - job_name: pushgateway
    honor_labels: true
    static_configs:
      - targets: ['192.168.156.135:9091']
        labels:
          instance: pushgateway
[root@localhost prometheus-2.6.1.linux-amd64]# 
2 编写报警规则
配置信息:
groups:
- name: node-up
  rules:
  - alert: node-up
    expr: up{job="agent1"} == 0
    for: 15s
    labels:
      severity: 1
      team: node
    annotations:
      summary: "{{ $labels.instance }} 已停止运行超过 15s!"
      description: "{{ $labels.instance }} 检测到异常停止!请重点关注!!!"
实际操作:
[root@localhost rules]# vim node-up.rules 
groups:
- name: node-up
  rules:
  - alert: node-up
    expr: up{job="agent1"} == 0
    for: 15s
    labels:
      severity: 1
      team: node
    annotations:
      summary: "{{ $labels.instance }} 已停止运行超过 15s!"
      description: "{{ $labels.instance }} 检测到异常停止!请重点关注!!!"
[root@localhost rules]# 
[root@localhost rules]# pwd
/opt/prometheus/prometheus-2.6.1.linux-amd64/rules
[root@localhost rules]# ls
node-up.rules
[root@localhost rules]# 
3 编写alertmanager配置
配置信息:
global:
  resolve_timeout: 5m
route:
  group_by: ['alertname']
  group_wait: 5s
  group_interval: 5s
  repeat_interval: 5m
  receiver: 'prometheusalert-feishu'
#  receiver: 'web.hook.prometheusalert'
#  routes:
#  - receiver: 'prometheusalert-feishu'
#    group_wait: 10s
#    match:
#      level: '2'
receivers:
#- name: 'web.hook.prometheusalert'
#  webhook_configs:
#  - url: "http://[prometheusalert_url]:8080/prometheusalert/alert"
- name: 'prometheusalert-feishu'
  webhook_configs:
  - url: "http://192.168.156.135:8080/prometheusalert?type=fs&tpl=prometheus-fsv2&fsurl=https://open.feishu.cn/open-apis/bot/v2/hook/xxx-3fba-4903-87d3-xxxx"
实际操作:
[root@localhost alertmanager]# vim  alertmanager2.yml 
global:
  resolve_timeout: 5m
route:
  group_by: ['alertname']
  group_wait: 5s
  group_interval: 5s
  repeat_interval: 5m
  receiver: 'prometheusalert-feishu'
#  receiver: 'web.hook.prometheusalert'
#  routes:
#  - receiver: 'prometheusalert-feishu'
#    group_wait: 10s
#    match:
#      level: '2'
receivers:
#- name: 'web.hook.prometheusalert'
#  webhook_configs:
#  - url: "http://[prometheusalert_url]:8080/prometheusalert/alert"
- name: 'prometheusalert-feishu'
  webhook_configs:
  - url: "http://192.168.156.135:8080/prometheusalert?type=fs&tpl=prometheus-fsv2&fsurl=https://open.feishu.cn/open-apis/bot/v2/hook/xxx-3fba-4903-87d3-xxx"
"alertmanager2.yml" 22L, 660C 已写入                                                                             
[root@localhost alertmanager]# ./amtool check-config  alertmanager2.yml
Checking 'alertmanager2.yml'  SUCCESS
Found:
 - global config
 - route
 - 0 inhibit rules
 - 1 receivers
 - 0 templates
[root@localhost alertmanager]# ls
alertmanager  alertmanager1.yml  alertmanager2.yml  alertmanager.yml  amtool  data  LICENSE  NOTICE  template
[root@localhost alertmanager]# pwd
/opt/prometheus/alertmanager
[root@localhost alertmanager]# 
4 重启服务
4.1 启动PrometheusAlert
在PrometheusAlert的安装目录下面启动
./PrometheusAlert
4.2 重启Prometheus
在安装目录下按如下步骤操作
[root@localhost prometheus-2.6.1.linux-amd64]# pwd
/opt/prometheus/prometheus-2.6.1.linux-amd64
[root@localhost prometheus-2.6.1.linux-amd64]# ls
console_libraries  consoles  data  LICENSE  NOTICE  prometheus  prometheus.yml  promtool  rules
启动服务
[root@localhost prometheus-2.6.1.linux-amd64]# 
[root@localhost prometheus-2.6.1.linux-amd64]# pkill prometheus
[root@localhost prometheus-2.6.1.linux-amd64]# lsof -i:9090
[root@localhost prometheus-2.6.1.linux-amd64]# ./prometheus --config.file=prometheus.yml & 
4.3 重启alertmanager服务
[root@localhost alertmanager]# pwd
/opt/prometheus/alertmanager
[root@localhost alertmanager]# ls
alertmanager  alertmanager1.yml  alertmanager2.yml  alertmanager.yml  amtool  data  LICENSE  NOTICE  template
[root@localhost alertmanager]# 
启动服务
[root@localhost alertmanager]# ./alertmanager --config.file=alertmanager2.yml
5 关闭node节点制造错误
[root@localhost node_export]# pwd
/opt/node_export
[root@localhost node_export]# ls
LICENSE  node_exporter  nohup.out  NOTICE
[root@localhost node_export]# 
5.1 关闭node
[root@localhost node_export]# lsof -i:9100
COMMAND     PID USER   FD   TYPE  DEVICE SIZE/OFF NODE NAME
prometheu 95405 root   19u  IPv4 1035389      0t0  TCP localhost.localdomain:59670->localhost.localdomain:jetdirect (ESTABLISHED)
node_expo 96011 root    3u  IPv6 1034103      0t0  TCP *:jetdirect (LISTEN)
node_expo 96011 root    5u  IPv6 1035390      0t0  TCP localhost.localdomain:jetdirect->localhost.localdomain:59670 (ESTABLISHED)
[root@localhost node_export]# kill 96011
[root@localhost node_export]# 
查看消息
 
5.2 重启node
[root@localhost node_export]# nohup ./node_exporter &
[7] 96267
[6]   已终止               nohup ./node_exporter
[root@localhost node_export]# nohup: 忽略输入并把输出追加到"nohup.out"
[root@localhost node_export]#
查看消息
 










