
update

branch: master
commit: 519999b6ab
author: F2256342 (Daniel de Oliveira Carvalho), 4 years ago
Changed files:
  1. alert.rules.yml        (2 changed lines)
  2. alertmanager.yml       (114 changed lines)
  3. docker-compose.bb.yml  (33 changed lines)
  4. docker-compose.yml     (46 changed lines)
  5. prometheus.yml         (26 changed lines)

alert.rules.yml  (2 changed lines)

@@ -5,7 +5,7 @@ groups:
     expr: up == 0
     for: 1m
     labels:
-      severity: "critical"
+      severity: critical
     annotations:
       summary: "Endpoint {{ $labels.instance }} down"
       description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minutes."

alertmanager.yml  (114 changed lines)

@@ -1,107 +1,15 @@
-global:
-  # The smarthost and SMTP sender used for mail notifications.
-  smtp_smarthost: 'localhost:25'
-  smtp_from: 'alertmanager@example.org'
-
-# The root route on which each incoming alert enters.
+---
 route:
-  # The root route must not have any matchers as it is the entry point for
-  # all alerts. It needs to have a receiver configured so alerts that do not
-  # match any of the sub-routes are sent to someone.
-  receiver: 'team-X-mails'
-
-  # The labels by which incoming alerts are grouped together. For example,
-  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
-  # be batched into a single group.
-  #
-  # To aggregate by all possible labels use '...' as the sole label name.
-  # This effectively disables aggregation entirely, passing through all
-  # alerts as-is. This is unlikely to be what you want, unless you have
-  # a very low alert volume or your upstream notification system performs
-  # its own grouping. Example: group_by: [...]
-  group_by: ['alertname', 'cluster']
-
-  # When a new group of alerts is created by an incoming alert, wait at
-  # least 'group_wait' to send the initial notification.
-  # This way ensures that you get multiple alerts for the same group that start
-  # firing shortly after another are batched together on the first
-  # notification.
+  receiver: telegram.bot
+  group_by: [...]
   group_wait: 30s
-
-  # When the first notification was sent, wait 'group_interval' to send a batch
-  # of new alerts that started firing for that group.
-  group_interval: 5m
-
-  # If an alert has successfully been sent, wait 'repeat_interval' to
-  # resend them.
-  repeat_interval: 3h
-
-  # All the above attributes are inherited by all child routes and can
-  # overwritten on each.
-
-  # The child route trees.
-  routes:
-    # This routes performs a regular expression match on alert labels to
-    # catch alerts that are related to a list of services.
-    - match_re:
-        service: ^(foo1|foo2|baz)$
-      receiver: team-X-mails
-      # The service has a sub-route for critical alerts, any alerts
-      # that do not match, i.e. severity != critical, fall-back to the
-      # parent node and are sent to 'team-X-mails'
-      routes:
-        - match:
-            severity: critical
-          receiver: team-X-pager
-    - match:
-        service: files
-      receiver: team-Y-mails
-      routes:
-        - match:
-            severity: critical
-          receiver: team-Y-pager
-    # This route handles all alerts coming from a database service. If there's
-    # no team to handle it, it defaults to the DB team.
-    - match:
-        service: database
-      receiver: team-DB-pager
-      # Also group alerts by affected database.
-      group_by: [alertname, cluster, database]
-      routes:
-        - match:
-            owner: team-X
-          receiver: team-X-pager
-        - match:
-            owner: team-Y
-          receiver: team-Y-pager
-
-# Inhibition rules allow to mute a set of alerts given that another alert is
-# firing.
-# We use this to mute any warning-level notifications if the same alert is
-# already critical.
-inhibit_rules:
-  - source_matchers:
-      - severity="critical"
-    target_matchers:
-      - severity="warning"
-    # Apply inhibition if the alertname is the same.
-    # CAUTION:
-    #   If all label names listed in `equal` are missing
-    #   from both the source and target alerts,
-    #   the inhibition rule will apply!
-    equal: ['alertname']
+  group_interval: 1m
+  repeat_interval: 1h
 receivers:
-  - name: 'telegram-bot'
+  - name: telegram.bot
     telegram_configs:
-      bot_token: "5209410321:AAGqy6WrQZQRQ0qx0pww8K6KPqzXIeRRosA"
-      chat_id: 65498889
+      - api_url: https://api.telegram.org
+        bot_token: 5209410321:AAGqy6WrQZQRQ0qx0pww8K6KPqzXIeRRosA
+        chat_id: 65498889
+        parse_mode: HTML
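
The rewritten receiver block can be sanity-checked locally before the alertmanager container is restarted. A minimal sketch, assuming amtool is available (it ships in the prom/alertmanager image and in the upstream release tarballs):

    amtool check-config alertmanager.yml

This parses the route tree and the telegram_configs entry and reports schema errors without sending anything.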

docker-compose.bb.yml  (33 changed lines; file deleted)

@@ -1,33 +0,0 @@
-services:
-  node-exporter:
-    container_name: node-exporter
-    image: prom/node-exporter:latest
-    network_mode: host
-
-  prometheus:
-    container_name: prometheus
-    image: prom/prometheus:latest
-    network_mode: host
-    volumes:
-      - /home/f2256342/forge/monitor/prometheus.yml:/etc/prometheus/prometheus.yml
-      - ./alert.rules.yml:/etc/prometheus/alert.rules.yml
-
-  grafana:
-    container_name: grafana
-    image: grafana/grafana:latest
-    network_mode: host
-
-  data-generator:
-    container_name: data-generator
-    build:
-      context: ../prometheus-data-generator/
-    network_mode: host
-    volumes:
-      - /home/f2256342/forge/prometheus-data-generator/config.yml:/config.yml
-
-  alertmanager:
-    container_name: alertmanager
-    image: prom/alertmanager
-    network_mode: host
-    volumes:
-      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml

docker-compose.yml  (46 changed lines)

@@ -2,44 +2,32 @@ services:
   node-exporter:
     container_name: node-exporter
     image: prom/node-exporter:latest
-    network_mode: bridge
-    ports:
-      - "9100:9100"
+    network_mode: host

   grafana:
     container_name: grafana
     image: grafana/grafana:latest
-    network_mode: bridge
-    ports:
-      - "9080:3000"
+    network_mode: host

-  data-generator:
-    container_name: data-generator
-    build:
-      context: ../prometheus-data-generator/
-    network_mode: bridge
-    ports:
-      - "9000:9000"
-    volumes:
-      # - ../prometheus-data-generator/config.yml:/config.yml
-      - /home/f2256342/forge/prometheus-data-generator/config.yml:/config.yml
-    command: --no-collector.rapl
+  # data-generator:
+  #   container_name: data-generator
+  #   build:
+  #     context: ../prometheus-data-generator/
+  #   network_mode: host
+  #   volumes:
+  #     - /home/f2256342/forge/prometheus-data-generator/config.yml:/config.yml

   prometheus:
     container_name: prometheus
     image: prom/prometheus:latest
-    depends_on:
-      - node-exporter
-      - data-generator
-    network_mode: bridge
-    ports:
-      - "9090:9090"
-    env_file:
-      - ./.env
+    network_mode: host
     volumes:
-      # - /home/yutsuo/forge/monitor/prometheus.yml:/etc/prometheus/prometheus.yml
       - /home/f2256342/forge/monitor/prometheus.yml:/etc/prometheus/prometheus.yml
-    # command: --enable-feature=expand-external-labels --config.file=/etc/prometheus/prometheus.yml
+      - ./alert.rules.yml:/etc/prometheus/alert.rules.yml
+
+  alertmanager:
+    container_name: alertmanager
+    image: prom/alertmanager
+    network_mode: host
+    volumes:
+      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
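
With several services moving to network_mode: host, it is worth confirming the merged file still parses before bringing the stack back up. A quick check, assuming a Docker CLI with the compose plugin:

    docker compose -f docker-compose.yml config

This only renders and validates the file; it does not start any containers.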

prometheus.yml  (26 changed lines)

@@ -1,37 +1,37 @@
 global:
   scrape_interval: 15s
   external_labels:
-    monitor: "codelab-monitor"
+    monitor: codelab-monitor

 rule_files:
   - alert.rules.yml

 alerting:
   alertmanagers:
     - static_configs:
         - targets:
-            - "localhost:9093"
+            - localhost:9093

 scrape_configs:
-  - job_name: "cfe-acesso"
-    scheme: "https"
+  - job_name: cfe-acesso
+    scheme: https
     scrape_interval: 5s
     static_configs:
-      - targets: ["mobi2.bb.com.br"]
-    metrics_path: "/cfe-acesso/api/v1/info/metrics"
+      - targets: [mobi2.bb.com.br]
+    metrics_path: /cfe-acesso/api/v1/info/metrics

-  - job_name: "node-exporter"
+  - job_name: node-exporter
     scrape_interval: 5s
     static_configs:
-      - targets: ["localhost:9100"]
+      - targets: [localhost:9100]

-  - job_name: "prometheus"
+  - job_name: prometheus
     relabel_configs:
     scrape_interval: 5s
     static_configs:
-      - targets: ["localhost:9090"]
+      - targets: [localhost:9090]

-  - job_name: "data-exporter"
+  - job_name: data-exporter
     scrape_interval: 5s
     static_configs:
-      - targets: ["localhost:9000"]
+      - targets: [localhost:9000]
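
The Prometheus side can be validated the same way. A sketch assuming promtool is on hand (it ships in the prom/prometheus image); the first command also loads alert.rules.yml through rule_files:

    promtool check config prometheus.yml
    promtool check rules alert.rules.yml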
