diff --git a/alert.rules.yml b/alert.rules.yml index 61c0740..984469c 100644 --- a/alert.rules.yml +++ b/alert.rules.yml @@ -5,7 +5,7 @@ groups: expr: up == 0 for: 1m labels: - severity: "critical" + severity: critical annotations: summary: "Endpoint {{ $labels.instance }} down" description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minutes." diff --git a/alertmanager.yml b/alertmanager.yml index 608e4a2..421d775 100644 --- a/alertmanager.yml +++ b/alertmanager.yml @@ -1,107 +1,15 @@ -global: - # The smarthost and SMTP sender used for mail notifications. - smtp_smarthost: 'localhost:25' - smtp_from: 'alertmanager@example.org' - -# The root route on which each incoming alert enters. +--- route: - # The root route must not have any matchers as it is the entry point for - # all alerts. It needs to have a receiver configured so alerts that do not - # match any of the sub-routes are sent to someone. - receiver: 'team-X-mails' - - # The labels by which incoming alerts are grouped together. For example, - # multiple alerts coming in for cluster=A and alertname=LatencyHigh would - # be batched into a single group. - # - # To aggregate by all possible labels use '...' as the sole label name. - # This effectively disables aggregation entirely, passing through all - # alerts as-is. This is unlikely to be what you want, unless you have - # a very low alert volume or your upstream notification system performs - # its own grouping. Example: group_by: [...] - group_by: ['alertname', 'cluster'] - - # When a new group of alerts is created by an incoming alert, wait at - # least 'group_wait' to send the initial notification. - # This way ensures that you get multiple alerts for the same group that start - # firing shortly after another are batched together on the first - # notification. + receiver: telegram.bot + group_by: [...] group_wait: 30s - - # When the first notification was sent, wait 'group_interval' to send a batch - # of new alerts that started firing for that group. - group_interval: 5m - - # If an alert has successfully been sent, wait 'repeat_interval' to - # resend them. - repeat_interval: 3h - - # All the above attributes are inherited by all child routes and can - # overwritten on each. - - # The child route trees. - routes: - # This routes performs a regular expression match on alert labels to - # catch alerts that are related to a list of services. - - match_re: - service: ^(foo1|foo2|baz)$ - receiver: team-X-mails - - # The service has a sub-route for critical alerts, any alerts - # that do not match, i.e. severity != critical, fall-back to the - # parent node and are sent to 'team-X-mails' - routes: - - match: - severity: critical - receiver: team-X-pager - - - match: - service: files - receiver: team-Y-mails - - routes: - - match: - severity: critical - receiver: team-Y-pager - - # This route handles all alerts coming from a database service. If there's - # no team to handle it, it defaults to the DB team. - - match: - service: database - - receiver: team-DB-pager - # Also group alerts by affected database. - group_by: [alertname, cluster, database] - - routes: - - match: - owner: team-X - receiver: team-X-pager - - - match: - owner: team-Y - receiver: team-Y-pager - - -# Inhibition rules allow to mute a set of alerts given that another alert is -# firing. -# We use this to mute any warning-level notifications if the same alert is -# already critical. -inhibit_rules: -- source_matchers: - - severity="critical" - target_matchers: - - severity="warning" - # Apply inhibition if the alertname is the same. - # CAUTION: - # If all label names listed in `equal` are missing - # from both the source and target alerts, - # the inhibition rule will apply! - equal: ['alertname'] - + group_interval: 1m + repeat_interval: 1h receivers: -- name: 'telegram-bot' - telegram_configs: - bot_token: "5209410321:AAGqy6WrQZQRQ0qx0pww8K6KPqzXIeRRosA" - chat_id: 65498889 \ No newline at end of file + - name: telegram.bot + telegram_configs: + - api_url: https://api.telegram.org + bot_token: 5209410321:AAGqy6WrQZQRQ0qx0pww8K6KPqzXIeRRosA + chat_id: 65498889 + parse_mode: HTML diff --git a/docker-compose.bb.yml b/docker-compose.bb.yml deleted file mode 100644 index 9c73e6a..0000000 --- a/docker-compose.bb.yml +++ /dev/null @@ -1,33 +0,0 @@ -services: - node-exporter: - container_name: node-exporter - image: prom/node-exporter:latest - network_mode: host - - prometheus: - container_name: prometheus - image: prom/prometheus:latest - network_mode: host - volumes: - - /home/f2256342/forge/monitor/prometheus.yml:/etc/prometheus/prometheus.yml - - ./alert.rules.yml:/etc/prometheus/alert.rules.yml - - grafana: - container_name: grafana - image: grafana/grafana:latest - network_mode: host - - data-generator: - container_name: data-generator - build: - context: ../prometheus-data-generator/ - network_mode: host - volumes: - - /home/f2256342/forge/prometheus-data-generator/config.yml:/config.yml - - alertmanager: - container_name: alertmanager - image: prom/alertmanager - network_mode: host - volumes: - - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml diff --git a/docker-compose.yml b/docker-compose.yml index f2cbd14..b756bfd 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -2,44 +2,32 @@ services: node-exporter: container_name: node-exporter image: prom/node-exporter:latest - network_mode: bridge - ports: - - "9100:9100" + network_mode: host grafana: container_name: grafana image: grafana/grafana:latest - network_mode: bridge - ports: - - "9080:3000" - - data-generator: - container_name: data-generator - build: - context: ../prometheus-data-generator/ - network_mode: bridge - ports: - - "9000:9000" - volumes: - # - ../prometheus-data-generator/config.yml:/config.yml - - /home/f2256342/forge/prometheus-data-generator/config.yml:/config.yml - command: --no-collector.rapl - + network_mode: host + # data-generator: + # container_name: data-generator + # build: + # context: ../prometheus-data-generator/ + # network_mode: host + # volumes: + # - /home/f2256342/forge/prometheus-data-generator/config.yml:/config.yml prometheus: container_name: prometheus image: prom/prometheus:latest - depends_on: - - node-exporter - - data-generator - network_mode: bridge - ports: - - "9090:9090" - env_file: - - ./.env + network_mode: host volumes: - # - /home/yutsuo/forge/monitor/prometheus.yml:/etc/prometheus/prometheus.yml - /home/f2256342/forge/monitor/prometheus.yml:/etc/prometheus/prometheus.yml - # command: --enable-feature=expand-external-labels --config.file=/etc/prometheus/prometheus.yml + - ./alert.rules.yml:/etc/prometheus/alert.rules.yml + alertmanager: + container_name: alertmanager + image: prom/alertmanager + network_mode: host + volumes: + - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml diff --git a/prometheus.yml b/prometheus.yml index f4c6347..f90dee3 100644 --- a/prometheus.yml +++ b/prometheus.yml @@ -1,37 +1,37 @@ global: scrape_interval: 15s external_labels: - monitor: "codelab-monitor" + monitor: codelab-monitor rule_files: - alert.rules.yml alerting: alertmanagers: - - static_configs: + - static_configs: - targets: - - "localhost:9093" + - localhost:9093 scrape_configs: - - job_name: "cfe-acesso" - scheme: "https" + - job_name: cfe-acesso + scheme: https scrape_interval: 5s static_configs: - - targets: ["mobi2.bb.com.br"] - metrics_path: "/cfe-acesso/api/v1/info/metrics" + - targets: [mobi2.bb.com.br] + metrics_path: /cfe-acesso/api/v1/info/metrics - - job_name: "node-exporter" + - job_name: node-exporter scrape_interval: 5s static_configs: - - targets: ["localhost:9100"] + - targets: [localhost:9100] - - job_name: "prometheus" + - job_name: prometheus relabel_configs: scrape_interval: 5s static_configs: - - targets: ["localhost:9090"] + - targets: [localhost:9090] - - job_name: "data-exporter" + - job_name: data-exporter scrape_interval: 5s static_configs: - - targets: ["localhost:9000"] + - targets: [localhost:9000]