From d981ef3f184ef6dbee9c068b89c7cbee18d3fb7a Mon Sep 17 00:00:00 2001 From: mishaschwartz <4380924+mishaschwartz@users.noreply.github.com> Date: Tue, 4 Jun 2024 13:57:54 -0400 Subject: [PATCH 01/10] add longterm monitoring components --- birdhouse/components/monitoring/default.env | 2 +- .../prometheus-longterm-metrics/.gitignore | 3 + .../config/magpie/config.yml.template | 28 ++++++ .../config/magpie/docker-compose-extra.yml | 7 ++ .../monitoring/docker-compose-extra.yml | 6 ++ .../monitoring/prometheus.rules.template | 16 ++++ .../monitoring.conf.template | 18 ++++ .../config/proxy/docker-compose-extra.yml | 6 ++ .../prometheus-longterm-metrics/default.env | 15 ++++ .../docker-compose-extra.yml | 32 +++++++ .../pre-docker-compose-up | 3 + .../prometheus.yml | 18 ++++ .../optional-components/thanos/.gitignore | 2 + .../thanos/config/magpie/config.yml.template | 28 ++++++ .../config/magpie/docker-compose-extra.yml | 7 ++ .../monitoring.conf.template | 38 +++++++++ .../config/proxy/docker-compose-extra.yml | 6 ++ .../optional-components/thanos/default.env | 50 +++++++++++ .../thanos/docker-compose-extra.yml | 85 +++++++++++++++++++ .../thanos/minio-entrypoint | 5 ++ 20 files changed, 374 insertions(+), 1 deletion(-) create mode 100644 birdhouse/optional-components/prometheus-longterm-metrics/.gitignore create mode 100644 birdhouse/optional-components/prometheus-longterm-metrics/config/magpie/config.yml.template create mode 100644 birdhouse/optional-components/prometheus-longterm-metrics/config/magpie/docker-compose-extra.yml create mode 100644 birdhouse/optional-components/prometheus-longterm-metrics/config/monitoring/docker-compose-extra.yml create mode 100644 birdhouse/optional-components/prometheus-longterm-metrics/config/monitoring/prometheus.rules.template create mode 100644 birdhouse/optional-components/prometheus-longterm-metrics/config/proxy/conf.extra-service.d/monitoring.conf.template create mode 100644 
birdhouse/optional-components/prometheus-longterm-metrics/config/proxy/docker-compose-extra.yml create mode 100644 birdhouse/optional-components/prometheus-longterm-metrics/default.env create mode 100644 birdhouse/optional-components/prometheus-longterm-metrics/docker-compose-extra.yml create mode 100755 birdhouse/optional-components/prometheus-longterm-metrics/pre-docker-compose-up create mode 100644 birdhouse/optional-components/prometheus-longterm-metrics/prometheus.yml create mode 100644 birdhouse/optional-components/thanos/.gitignore create mode 100644 birdhouse/optional-components/thanos/config/magpie/config.yml.template create mode 100644 birdhouse/optional-components/thanos/config/magpie/docker-compose-extra.yml create mode 100644 birdhouse/optional-components/thanos/config/proxy/conf.extra-service.d/monitoring.conf.template create mode 100644 birdhouse/optional-components/thanos/config/proxy/docker-compose-extra.yml create mode 100644 birdhouse/optional-components/thanos/default.env create mode 100644 birdhouse/optional-components/thanos/docker-compose-extra.yml create mode 100755 birdhouse/optional-components/thanos/minio-entrypoint diff --git a/birdhouse/components/monitoring/default.env b/birdhouse/components/monitoring/default.env index c6f781fde..a42c1a1a8 100644 --- a/birdhouse/components/monitoring/default.env +++ b/birdhouse/components/monitoring/default.env @@ -8,7 +8,7 @@ export GRAFANA_VERSION="7.0.3" export GRAFANA_DOCKER=grafana/grafana export GRAFANA_IMAGE='${GRAFANA_DOCKER}:${GRAFANA_VERSION}' -export PROMETHEUS_VERSION="v2.19.0" +export PROMETHEUS_VERSION="v2.52.0" export PROMETHEUS_DOCKER=prom/prometheus export PROMETHEUS_IMAGE='${PROMETHEUS_DOCKER}:${PROMETHEUS_VERSION}' diff --git a/birdhouse/optional-components/prometheus-longterm-metrics/.gitignore b/birdhouse/optional-components/prometheus-longterm-metrics/.gitignore new file mode 100644 index 000000000..352988cd5 --- /dev/null +++ 
b/birdhouse/optional-components/prometheus-longterm-metrics/.gitignore @@ -0,0 +1,3 @@ +config/monitoring/prometheus.rules +config/magpie/config.yml +config/proxy/conf.extra-service.d/monitoring.conf diff --git a/birdhouse/optional-components/prometheus-longterm-metrics/config/magpie/config.yml.template b/birdhouse/optional-components/prometheus-longterm-metrics/config/magpie/config.yml.template new file mode 100644 index 000000000..420685852 --- /dev/null +++ b/birdhouse/optional-components/prometheus-longterm-metrics/config/magpie/config.yml.template @@ -0,0 +1,28 @@ +providers: + prometheus-longterm-metrics: + # below URL is only used to fill in the required location in Magpie + # actual auth validation is performed with Twitcher 'verify' endpoint without accessing this proxied URL + url: http://proxy:80 + title: PrometheusLongtermMetrics + public: true + c4i: false + type: api + sync_type: api + +permissions: + - service: prometheus-longterm-metrics + permission: read + group: administrators + action: create + - service: prometheus-longterm-metrics + permission: write + group: administrators + action: create + - service: prometheus-longterm-metrics + permission: read + group: monitoring + action: create + - service: prometheus-longterm-metrics + permission: write + group: monitoring + action: create diff --git a/birdhouse/optional-components/prometheus-longterm-metrics/config/magpie/docker-compose-extra.yml b/birdhouse/optional-components/prometheus-longterm-metrics/config/magpie/docker-compose-extra.yml new file mode 100644 index 000000000..4278c611e --- /dev/null +++ b/birdhouse/optional-components/prometheus-longterm-metrics/config/magpie/docker-compose-extra.yml @@ -0,0 +1,7 @@ +version: "3.4" + +services: + magpie: + volumes: + - ./optional-components/prometheus-longterm-metrics/config/magpie/config.yml:${MAGPIE_PERMISSIONS_CONFIG_PATH}/prometheus-longterm-metrics.yml:ro + - 
./optional-components/prometheus-longterm-metrics/config/magpie/config.yml:${MAGPIE_PROVIDERS_CONFIG_PATH}/prometheus-longterm-metrics.yml:ro diff --git a/birdhouse/optional-components/prometheus-longterm-metrics/config/monitoring/docker-compose-extra.yml b/birdhouse/optional-components/prometheus-longterm-metrics/config/monitoring/docker-compose-extra.yml new file mode 100644 index 000000000..4eeb181ad --- /dev/null +++ b/birdhouse/optional-components/prometheus-longterm-metrics/config/monitoring/docker-compose-extra.yml @@ -0,0 +1,6 @@ +version: "3.4" + +services: + prometheus: + volumes: + - ./optional-components/prometheus-longterm-metrics/config/monitoring/prometheus.rules:/etc/prometheus/prometheus-longterm-metrics.rules:ro diff --git a/birdhouse/optional-components/prometheus-longterm-metrics/config/monitoring/prometheus.rules.template b/birdhouse/optional-components/prometheus-longterm-metrics/config/monitoring/prometheus.rules.template new file mode 100644 index 000000000..48a1cd26d --- /dev/null +++ b/birdhouse/optional-components/prometheus-longterm-metrics/config/monitoring/prometheus.rules.template @@ -0,0 +1,16 @@ +groups: + - name: longterm-metrics + interval: ${PROMETHEUS_LONGTERM_STORE_INTERVAL} + rules: + - record: cpu_instance:cpu_load_irate:avg${PROMETHEUS_LONGTERM_STORE_INTERVAL} + expr: avg by(cpu, instance) (irate(node_cpu_seconds_total{mode!="idle"}[${PROMETHEUS_LONGTERM_STORE_INTERVAL}])) + labels: + group: longterm-metrics + - record: instance:network_bytes_received_irate:sum${PROMETHEUS_LONGTERM_STORE_INTERVAL} + expr: sum by (instance) (irate(node_network_receive_bytes_total[${PROMETHEUS_LONGTERM_STORE_INTERVAL}])) + labels: + group: longterm-metrics + - record: instance:network_bytes_sent_irate:sum${PROMETHEUS_LONGTERM_STORE_INTERVAL} + expr: sum by (instance) (irate(node_network_transmit_bytes_total[${PROMETHEUS_LONGTERM_STORE_INTERVAL}])) + labels: + group: longterm-metrics diff --git 
a/birdhouse/optional-components/prometheus-longterm-metrics/config/proxy/conf.extra-service.d/monitoring.conf.template b/birdhouse/optional-components/prometheus-longterm-metrics/config/proxy/conf.extra-service.d/monitoring.conf.template new file mode 100644 index 000000000..67c25c053 --- /dev/null +++ b/birdhouse/optional-components/prometheus-longterm-metrics/config/proxy/conf.extra-service.d/monitoring.conf.template @@ -0,0 +1,18 @@ + location /prometheus-longterm-metrics { + auth_request /secure-prometheus-longterm-metrics-auth; + auth_request_set $auth_status $upstream_status; + proxy_pass http://prometheus-longterm-metrics:9090; + proxy_set_header Host $host; + } + + location = /secure-prometheus-longterm-metrics-auth { + internal; + proxy_pass https://${BIRDHOUSE_FQDN_PUBLIC}${TWITCHER_VERIFY_PATH}/prometheus-longterm-metrics$request_uri; + proxy_pass_request_body off; + proxy_set_header Host $host; + proxy_set_header Content-Length ""; + proxy_set_header X-Original-URI $request_uri; + proxy_set_header X-Forwarded-Proto $real_scheme; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Host $host:$server_port; + } diff --git a/birdhouse/optional-components/prometheus-longterm-metrics/config/proxy/docker-compose-extra.yml b/birdhouse/optional-components/prometheus-longterm-metrics/config/proxy/docker-compose-extra.yml new file mode 100644 index 000000000..b25d3e080 --- /dev/null +++ b/birdhouse/optional-components/prometheus-longterm-metrics/config/proxy/docker-compose-extra.yml @@ -0,0 +1,6 @@ +version: "3.4" + +services: + proxy: + volumes: + - ./optional-components/prometheus-longterm-metrics/config/proxy/conf.extra-service.d:/etc/nginx/conf.extra-service.d/prometheus-longterm-metrics:ro diff --git a/birdhouse/optional-components/prometheus-longterm-metrics/default.env b/birdhouse/optional-components/prometheus-longterm-metrics/default.env new file mode 100644 index 000000000..2f22ce078 --- /dev/null +++ 
b/birdhouse/optional-components/prometheus-longterm-metrics/default.env @@ -0,0 +1,15 @@ +export PROMETHEUS_LONGTERM_RETENTION_TIME=1y +export PROMETHEUS_LONGTERM_STORE_INTERVAL=1h + +# These are the prometheus defaults +export PROMETHEUS_LONGTERM_TSDB_MIN_BLOCK_DURATION=2h +export PROMETHEUS_LONGTERM_TSDB_MAX_BLOCK_DURATION=1d12h + +OPTIONAL_VARS=" + $OPTIONAL_VARS + \$PROMETHEUS_LONGTERM_STORE_INTERVAL +" + +COMPONENT_DEPENDENCIES=" + ./components/monitoring +" diff --git a/birdhouse/optional-components/prometheus-longterm-metrics/docker-compose-extra.yml b/birdhouse/optional-components/prometheus-longterm-metrics/docker-compose-extra.yml new file mode 100644 index 000000000..49c6f152c --- /dev/null +++ b/birdhouse/optional-components/prometheus-longterm-metrics/docker-compose-extra.yml @@ -0,0 +1,32 @@ +version: "3.4" + +x-logging: + &default-logging + driver: "json-file" + options: + max-size: "50m" + max-file: "10" + +services: + prometheus-longterm-metrics: + image: ${PROMETHEUS_IMAGE} + container_name: prometheus-longterm-metrics + volumes: + - ./optional-components/prometheus-longterm-metrics/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - prometheus_longterm_persistence:/prometheus:rw + command: + - --config.file=/etc/prometheus/prometheus.yml + - --storage.tsdb.path=/prometheus + - --web.console.libraries=/usr/share/prometheus/console_libraries + - --web.console.templates=/usr/share/prometheus/consoles + - --storage.tsdb.retention.time=${PROMETHEUS_LONGTERM_RETENTION_TIME} + - --web.external-url=https://${BIRDHOUSE_FQDN_PUBLIC}/prometheus-longterm-metrics/ + - --storage.tsdb.min-block-duration=${PROMETHEUS_LONGTERM_TSDB_MIN_BLOCK_DURATION} + - --storage.tsdb.max-block-duration=${PROMETHEUS_LONGTERM_TSDB_MAX_BLOCK_DURATION} + restart: always + logging: *default-logging + +volumes: + prometheus_longterm_persistence: + external: + name: prometheus_longterm_persistence diff --git 
a/birdhouse/optional-components/prometheus-longterm-metrics/pre-docker-compose-up b/birdhouse/optional-components/prometheus-longterm-metrics/pre-docker-compose-up new file mode 100755 index 000000000..76a44e2e8 --- /dev/null +++ b/birdhouse/optional-components/prometheus-longterm-metrics/pre-docker-compose-up @@ -0,0 +1,3 @@ +#!/bin/sh -x + +docker volume create prometheus_longterm_persistence # metrics db diff --git a/birdhouse/optional-components/prometheus-longterm-metrics/prometheus.yml b/birdhouse/optional-components/prometheus-longterm-metrics/prometheus.yml new file mode 100644 index 000000000..ba089ac3e --- /dev/null +++ b/birdhouse/optional-components/prometheus-longterm-metrics/prometheus.yml @@ -0,0 +1,18 @@ +global: + external_labels: + instance_name: prometheus-longterm-metrics + +scrape_configs: + - job_name: 'federate' + scrape_interval: 15s + + honor_labels: true + metrics_path: '/prometheus/federate' + + params: + 'match[]': + - '{group="longterm-metrics"}' + + static_configs: + - targets: + - 'prometheus:9090' diff --git a/birdhouse/optional-components/thanos/.gitignore b/birdhouse/optional-components/thanos/.gitignore new file mode 100644 index 000000000..97ac1a63e --- /dev/null +++ b/birdhouse/optional-components/thanos/.gitignore @@ -0,0 +1,2 @@ +config/magpie/config.yml +config/proxy/conf.extra-service.d/monitoring.conf diff --git a/birdhouse/optional-components/thanos/config/magpie/config.yml.template b/birdhouse/optional-components/thanos/config/magpie/config.yml.template new file mode 100644 index 000000000..05633dff4 --- /dev/null +++ b/birdhouse/optional-components/thanos/config/magpie/config.yml.template @@ -0,0 +1,28 @@ +providers: + thanos: + # below URL is only used to fill in the required location in Magpie + # actual auth validation is performed with Twitcher 'verify' endpoint without accessing this proxied URL + url: http://proxy:80 + title: Thanos + public: true + c4i: false + type: api + sync_type: api + +permissions: + - 
service: thanos + permission: read + group: administrators + action: create + - service: thanos + permission: write + group: administrators + action: create + - service: thanos + permission: read + group: monitoring + action: create + - service: thanos + permission: write + group: monitoring + action: create diff --git a/birdhouse/optional-components/thanos/config/magpie/docker-compose-extra.yml b/birdhouse/optional-components/thanos/config/magpie/docker-compose-extra.yml new file mode 100644 index 000000000..fd3e207ac --- /dev/null +++ b/birdhouse/optional-components/thanos/config/magpie/docker-compose-extra.yml @@ -0,0 +1,7 @@ +version: "3.4" + +services: + magpie: + volumes: + - ./optional-components/thanos/config/magpie/config.yml:${MAGPIE_PERMISSIONS_CONFIG_PATH}/thanos.yml:ro + - ./optional-components/thanos/config/magpie/config.yml:${MAGPIE_PROVIDERS_CONFIG_PATH}/thanos.yml:ro diff --git a/birdhouse/optional-components/thanos/config/proxy/conf.extra-service.d/monitoring.conf.template b/birdhouse/optional-components/thanos/config/proxy/conf.extra-service.d/monitoring.conf.template new file mode 100644 index 000000000..b2aa98ccd --- /dev/null +++ b/birdhouse/optional-components/thanos/config/proxy/conf.extra-service.d/monitoring.conf.template @@ -0,0 +1,38 @@ + location /thanos-query { + auth_request /secure-thanos-auth; + auth_request_set $auth_status $upstream_status; + proxy_pass http://thanos-query:19192; + proxy_set_header Host $host; + } + + location /thanos-minio/ { + auth_request /secure-thanos-auth; + auth_request_set $auth_status $upstream_status; + + rewrite ^/thanos-minio/(.*) /$1 break; + proxy_pass http://minio:9001; + + proxy_http_version 1.1; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "Upgrade"; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + + # This allows WebSocket connections + 
proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "upgrade"; + } + + location = /secure-thanos-auth { + internal; + proxy_pass https://${BIRDHOUSE_FQDN_PUBLIC}${TWITCHER_VERIFY_PATH}/thanos$request_uri; + proxy_pass_request_body off; + proxy_set_header Host $host; + proxy_set_header Content-Length ""; + proxy_set_header X-Original-URI $request_uri; + proxy_set_header X-Forwarded-Proto $real_scheme; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Host $host:$server_port; + } diff --git a/birdhouse/optional-components/thanos/config/proxy/docker-compose-extra.yml b/birdhouse/optional-components/thanos/config/proxy/docker-compose-extra.yml new file mode 100644 index 000000000..39977c0f4 --- /dev/null +++ b/birdhouse/optional-components/thanos/config/proxy/docker-compose-extra.yml @@ -0,0 +1,6 @@ +version: "3.4" + +services: + proxy: + volumes: + - ./optional-components/thanos/config/proxy/conf.extra-service.d:/etc/nginx/conf.extra-service.d/thanos:ro diff --git a/birdhouse/optional-components/thanos/default.env b/birdhouse/optional-components/thanos/default.env new file mode 100644 index 000000000..ba1ed726c --- /dev/null +++ b/birdhouse/optional-components/thanos/default.env @@ -0,0 +1,50 @@ + +export THANOS_VERSION=v0.35.1 +export THANOS_DOCKER="thanosio/thanos" +export THANOS_IMAGE='${THANOS_DOCKER}:${THANOS_VERSION}' + +export MINIO_VERSION=RELEASE.2024-05-27T19-17-46Z +export MINIO_DOCKER=minio/minio +export MINIO_IMAGE='${MINIO_DOCKER}:${MINIO_VERSION}' + +# Minio uses object storage on disk at this location +export MINIO_DATA_STORE='${BIRDHOUSE_DATA_PERSIST_ROOT}/minio_data/' + +# Note that bucket names must only contain lowercase ascii, digits, - and . 
+export THANOS_MINIO_BUCKET_NAME=thanos-bucket + +# Minio credentials +export __DEFAULT__MINIO_ROOT_USER=minioadmin +export __DEFAULT__MINIO_ROOT_PASSWORD=minioadmin +export MINIO_ROOT_USER="${__DEFAULT__MINIO_ROOT_USER}" +export MINIO_ROOT_PASSWORD="${__DEFAULT__MINIO_ROOT_PASSWORD}" + +# Set a schedule to run the compactor. This should be significantly larger than the PROMETHEUS_LONGTERM_STORE_INTERVAL +export THANOS_COMPACTOR_WAIT_INTERVAL=24h + +# The longterm data retention time can be shortened back to the default since Thanos is now responsible for +# storing longterm data, not the prometheus-longterm-metrics component. +export PROMETHEUS_LONGTERM_RETENTION_TIME=15d + +# The thanos-sidecar component requires that these two values be equal or else it cannot perform its own compaction +# https://thanos.io/tip/components/sidecar.md/#sidecar +export PROMETHEUS_LONGTERM_TSDB_MIN_BLOCK_DURATION=2h +export PROMETHEUS_LONGTERM_TSDB_MAX_BLOCK_DURATION=2h + +VARS=" + $VARS + \$MINIO_ROOT_USER + \$MINIO_ROOT_PASSWORD +" + +export DELAYED_EVAL=" + $DELAYED_EVAL + THANOS_IMAGE + MINIO_IMAGE + MINIO_DATA_STORE +" + +COMPONENT_DEPENDENCIES=" + ./components/monitoring + ./optional-components/prometheus-longterm-metrics +" diff --git a/birdhouse/optional-components/thanos/docker-compose-extra.yml b/birdhouse/optional-components/thanos/docker-compose-extra.yml new file mode 100644 index 000000000..1b3d170ee --- /dev/null +++ b/birdhouse/optional-components/thanos/docker-compose-extra.yml @@ -0,0 +1,85 @@ +version: "3.4" + +x-logging: + &default-logging + driver: "json-file" + options: + max-size: "50m" + max-file: "10" + +x-objstore-config: &objstore-config | + --objstore.config=type: S3 + config: + bucket: ${THANOS_MINIO_BUCKET_NAME} + access_key: ${MINIO_ROOT_USER} + secret_key: ${MINIO_ROOT_PASSWORD} + endpoint: minio:9000 + insecure: true # use http instead of https + +services: + thanos-sidecar: + image: ${THANOS_IMAGE} + container_name: thanos-sidecar + volumes: + - 
prometheus_longterm_persistence:/prometheus + user: nobody # prometheus runs as this user so the sidecar must as well + command: + - 'sidecar' + - '--tsdb.path=/prometheus' + - '--prometheus.url=http://prometheus-longterm-metrics:9090/prometheus-longterm-metrics' + - '--grpc-address=0.0.0.0:19090' + - '--http-address=0.0.0.0:19191' + - *objstore-config + depends_on: + - prometheus-longterm-metrics + - minio + restart: always + logging: *default-logging + + thanos-query: + image: ${THANOS_IMAGE} + container_name: thanos-query + command: + - 'query' + - '--http-address=0.0.0.0:19192' + - '--web.route-prefix=/thanos-query' + - '--web.external-prefix=/thanos-query' + - '--endpoint=thanos-sidecar:19090' + depends_on: + - thanos-sidecar + restart: always + logging: *default-logging + + thanos-compactor: + image: ${THANOS_IMAGE} + container_name: thanos-compactor + command: + - 'compact' + - '--data-dir=/tmp/data' # temporary workspace (doesn't need to be a volume) + - '--wait' + - '--wait-interval=${THANOS_COMPACTOR_WAIT_INTERVAL}' + - *objstore-config + depends_on: + - minio + restart: always + logging: *default-logging + + minio: + image: ${MINIO_IMAGE} + container_name: minio + volumes: + - ${MINIO_DATA_STORE}:/data + - ./optional-components/thanos/minio-entrypoint:/entrypoint + entrypoint: /entrypoint + command: + - 'minio' + - 'server' + - '--console-address' + - ':9001' + - '/data' + environment: + - MINIO_ROOT_USER=${MINIO_ROOT_USER} + - MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD} + - MINIO_PROMETHEUS_AUTH_TYPE=public + - THANOS_MINIO_BUCKET_NAME=${THANOS_MINIO_BUCKET_NAME} + - MINIO_BROWSER_REDIRECT_URL=https://${BIRDHOUSE_FQDN_PUBLIC}/thanos-minio diff --git a/birdhouse/optional-components/thanos/minio-entrypoint b/birdhouse/optional-components/thanos/minio-entrypoint new file mode 100755 index 000000000..a02c3368f --- /dev/null +++ b/birdhouse/optional-components/thanos/minio-entrypoint @@ -0,0 +1,5 @@ +#!/bin/sh + +mkdir -p "/data/${THANOS_MINIO_BUCKET_NAME}" + 
+exec "$@" From 06dc997c09aa8e4926ea8433950e5e14c30cef0f Mon Sep 17 00:00:00 2001 From: mishaschwartz <4380924+mishaschwartz@users.noreply.github.com> Date: Fri, 7 Jun 2024 15:57:27 -0400 Subject: [PATCH 02/10] add documentation --- CHANGES.md | 36 +++++++++++++++++++++++- birdhouse/env.local.example | 9 ++++++ birdhouse/optional-components/README.rst | 35 +++++++++++++++++++++++ 3 files changed, 79 insertions(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index b735e3253..988e835ec 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -15,7 +15,41 @@ [Unreleased](https://github.com/bird-house/birdhouse-deploy/tree/master) (latest) ------------------------------------------------------------------------------------------------------------------ -[//]: # (list changes here, using '-' for each new entry, remove this when items are added) +## Changes + +- Add the `prometheus-longterm-metrics` and `thanos` optional components + + The `prometheus-longterm-metrics` component collects longterm monitoring metrics from the original prometheus instance + (the one created by the ``components/monitoring`` component). + + Longterm metrics are any prometheus rule that have the label ``group: longterm-metrics`` or in other words are + selectable using prometheus's ``'{group="longterm-metrics"}'`` query filter. To see which longterm metric rules are + added by default see the + ``optional-components/prometheus-longterm-metrics/config/monitoring/prometheus.rules.template`` file. + + To configure this component: + + * update the ``PROMETHEUS_LONGTERM_RETENTION_TIME`` variable to set how long the data will be kept by prometheus + * update the ``PROMETHEUS_LONGTERM_STORE_INTERVAL`` variable to set how often the longterm metrics rules will be + calculated. For example, setting it to ``10h`` will calculate these metrics every 10 hours. + + Enabling the `prometheus-longterm-metrics` component creates the additional endpoint ``/prometheus-longterm-metrics``. 
+ + The `thanos` component enables better storage of longterm metrics collected by the + ``optional-components/prometheus-longterm-metrics`` component. Data will be collected from the + ``prometheus-longterm-metrics`` and stored in an S3 object store indefinitely. + + When enabling this component, please change the default values for the ``MINIO_ROOT_USER`` and ``MINIO_ROOT_PASSWORD`` + by updating the ``env.local`` file. These set the login credentials for the root user that runs the + [minio](https://min.io/) object store. + + Enabling the `thanos` component creates the additional endpoints: + + * ``/thanos-query``: a prometheus-like query interface to inspect the data stored by thanos + * ``/thanos-minio``: a minio web console to inspect the data stored by minio. + + This also includes an update to the prometheus version from `v2.19.0` to the current latest `v2.52.0`. This is + required to support the interaction between prometheus and thanos. [2.4.0](https://github.com/bird-house/birdhouse-deploy/tree/2.4.0) (2024-06-04) ------------------------------------------------------------------------------------------------------------------ diff --git a/birdhouse/env.local.example b/birdhouse/env.local.example index 99be5ea12..9753ba94d 100644 --- a/birdhouse/env.local.example +++ b/birdhouse/env.local.example @@ -574,6 +574,15 @@ export THREDDS_ADDITIONAL_CATALOG="" #export ALERTMANAGER_EXTRA_INHIBITION="" #export ALERTMANAGER_EXTRA_RECEIVERS="" +# Below are for the prometheus-longterm-metrics optional component +#export PROMETHEUS_LONGTERM_RETENTION_TIME=1y +#export PROMETHEUS_LONGTERM_STORE_INTERVAL=1h + +# Below are for the thanos optional component +# Change these from the default for added security +#export MINIO_ROOT_USER="${__DEFAULT__MINIO_ROOT_USER}" +#export MINIO_ROOT_PASSWORD="${__DEFAULT__MINIO_ROOT_PASSWORD}" + ############################################################################# # Emu optional vars 
############################################################################# diff --git a/birdhouse/optional-components/README.rst b/birdhouse/optional-components/README.rst index b1c0df792..626ae56f4 100644 --- a/birdhouse/optional-components/README.rst +++ b/birdhouse/optional-components/README.rst @@ -443,3 +443,38 @@ How to enable X-Robots-Tag Header in ``env.local`` (a copy from `env.local.examp .. seealso:: See the `env.local.example`_ file for more details about this ``BIRDHOUSE_PROXY_ROOT_LOCATION`` behaviour. + +Prometheus Long-term Metrics +---------------------------- + +This is a second prometheus instance that collects longterm monitoring metrics from the original prometheus instance +(the one created by the ``components/monitoring`` component). + +Longterm metrics are any prometheus rule that have the label ``group: longterm-metrics`` or in other words are +selectable using prometheus' ``'{group="longterm-metrics"}'`` query filter. To see which longterm metric rules are +added by default see the ``optional-components/prometheus-longterm-metrics/config/monitoring/prometheus.rules.template``. + +To configure this component: + + * update the ``PROMETHEUS_LONGTERM_RETENTION_TIME`` variable to set how long the data will be kept by prometheus + * update the ``PROMETHEUS_LONGTERM_STORE_INTERVAL`` variable to set how often the longterm metrics rules will be + calculated. For example, setting it to ``10h`` will calculate these metrics every 10 hours. + +Enabling this component creates the additional endpoint ``/prometheus-longterm-metrics``. + +Thanos +------ + +This enables better storage of longterm metrics collected by the ``optional-components/prometheus-longterm-metrics`` +component. Data will be collected from the ``prometheus-longterm-metrics`` and stored in an S3 object store +indefinitely. + +When enabling this component, please change the default values for the ``MINIO_ROOT_USER`` and ``MINIO_ROOT_PASSWORD`` +by updating the ``env.local`` file. 
These set the login credentials for the root user that runs the minio_ object +store. + +Enabling this component creates the additional endpoints: + * ``/thanos-query``: a prometheus-like query interface to inspect the data stored by thanos + * ``/thanos-minio``: a minio_ web console to inspect the data stored by minio_. + +.. _minio: https://min.io/ From 0d7178eead3b0bc2b4f0f9aaba2bcdc0494855ca Mon Sep 17 00:00:00 2001 From: mishaschwartz <4380924+mishaschwartz@users.noreply.github.com> Date: Tue, 18 Jun 2024 11:30:18 -0400 Subject: [PATCH 03/10] create default rules --- CHANGES.md | 5 +++-- birdhouse/env.local.example | 1 - birdhouse/optional-components/README.rst | 6 ++++-- .../prometheus-longterm-metrics/.gitignore | 1 - .../monitoring/docker-compose-extra.yml | 2 +- .../config/monitoring/prometheus.null.rules | 4 ++++ .../config/monitoring/prometheus.rules | 19 +++++++++++++++++++ .../monitoring/prometheus.rules.template | 16 ---------------- .../prometheus-longterm-metrics/default.env | 10 ++++++---- .../optional-components/thanos/default.env | 5 +++-- 10 files changed, 40 insertions(+), 29 deletions(-) create mode 100644 birdhouse/optional-components/prometheus-longterm-metrics/config/monitoring/prometheus.null.rules create mode 100644 birdhouse/optional-components/prometheus-longterm-metrics/config/monitoring/prometheus.rules delete mode 100644 birdhouse/optional-components/prometheus-longterm-metrics/config/monitoring/prometheus.rules.template diff --git a/CHANGES.md b/CHANGES.md index 988e835ec..0a21c1eec 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -27,11 +27,12 @@ added by default see the ``optional-components/prometheus-longterm-metrics/config/monitoring/prometheus.rules.template`` file. + If you do not want the default longterm-metric rules included, set the ``PROMETHEUS_LONGTERM_ENABLE_DEFAULT_RULES`` to anything + other than ``True`` in your ``env.local`` file. 
+ To configure this component: * update the ``PROMETHEUS_LONGTERM_RETENTION_TIME`` variable to set how long the data will be kept by prometheus - * update the ``PROMETHEUS_LONGTERM_STORE_INTERVAL`` variable to set how often the longterm metrics rules will be - calculated. For example, setting it to ``10h`` will calculate these metrics every 10 hours. Enabling the `prometheus-longterm-metrics` component creates the additional endpoint ``/prometheus-longterm-metrics``. diff --git a/birdhouse/env.local.example b/birdhouse/env.local.example index 9753ba94d..d3c3eabb5 100644 --- a/birdhouse/env.local.example +++ b/birdhouse/env.local.example @@ -576,7 +576,6 @@ export THREDDS_ADDITIONAL_CATALOG="" # Below are for the prometheus-longterm-metrics optional component #export PROMETHEUS_LONGTERM_RETENTION_TIME=1y -#export PROMETHEUS_LONGTERM_STORE_INTERVAL=1h # Below are for the thanos optional component # Change these from the default for added security diff --git a/birdhouse/optional-components/README.rst b/birdhouse/optional-components/README.rst index 626ae56f4..b59b33855 100644 --- a/birdhouse/optional-components/README.rst +++ b/birdhouse/optional-components/README.rst @@ -454,11 +454,13 @@ Longterm metrics are any prometheus rule that have the label ``group: longterm-m selectable using prometheus' ``'{group="longterm-metrics"}'`` query filter. To see which longterm metric rules are added by default see the ``optional-components/prometheus-longterm-metrics/config/monitoring/prometheus.rules.template``. +If you do not want the default longterm-metric rules included, set the ``PROMETHEUS_LONGTERM_ENABLE_DEFAULT_RULES`` to anything +other than ``True`` in your ``env.local`` file. You may want to do this if you've created your own set of rules in +another component that you would like to use instead of the default ones. 
+ To configure this component: * update the ``PROMETHEUS_LONGTERM_RETENTION_TIME`` variable to set how long the data will be kept by prometheus - * update the ``PROMETHEUS_LONGTERM_STORE_INTERVAL`` variable to set how often the longterm metrics rules will be - calculated. For example, setting it to ``10h`` will calculate these metrics every 10 hours. Enabling this component creates the additional endpoint ``/prometheus-longterm-metrics``. diff --git a/birdhouse/optional-components/prometheus-longterm-metrics/.gitignore b/birdhouse/optional-components/prometheus-longterm-metrics/.gitignore index 352988cd5..97ac1a63e 100644 --- a/birdhouse/optional-components/prometheus-longterm-metrics/.gitignore +++ b/birdhouse/optional-components/prometheus-longterm-metrics/.gitignore @@ -1,3 +1,2 @@ -config/monitoring/prometheus.rules config/magpie/config.yml config/proxy/conf.extra-service.d/monitoring.conf diff --git a/birdhouse/optional-components/prometheus-longterm-metrics/config/monitoring/docker-compose-extra.yml b/birdhouse/optional-components/prometheus-longterm-metrics/config/monitoring/docker-compose-extra.yml index 4eeb181ad..3eaa00e21 100644 --- a/birdhouse/optional-components/prometheus-longterm-metrics/config/monitoring/docker-compose-extra.yml +++ b/birdhouse/optional-components/prometheus-longterm-metrics/config/monitoring/docker-compose-extra.yml @@ -3,4 +3,4 @@ version: "3.4" services: prometheus: volumes: - - ./optional-components/prometheus-longterm-metrics/config/monitoring/prometheus.rules:/etc/prometheus/prometheus-longterm-metrics.rules:ro + - ./optional-components/prometheus-longterm-metrics/config/monitoring/${PROMETHEUS_LONGTERM_RULES_FILE}:/etc/prometheus/prometheus-longterm-metrics.rules:ro diff --git a/birdhouse/optional-components/prometheus-longterm-metrics/config/monitoring/prometheus.null.rules b/birdhouse/optional-components/prometheus-longterm-metrics/config/monitoring/prometheus.null.rules new file mode 100644 index 000000000..a885bf2eb --- 
/dev/null +++ b/birdhouse/optional-components/prometheus-longterm-metrics/config/monitoring/prometheus.null.rules @@ -0,0 +1,4 @@ +# This file is intentionally left blank in order to allow a user to choose whether to enable the default rules that are +# set in the prometheus.rules file. +# By setting the PROMETHEUS_LONGTERM_ENABLE_DEFAULT_RULES environment variable to True, the rules in prometheus.rules +# will be added. By setting that value to anything else, this file will be added instead. diff --git a/birdhouse/optional-components/prometheus-longterm-metrics/config/monitoring/prometheus.rules b/birdhouse/optional-components/prometheus-longterm-metrics/config/monitoring/prometheus.rules new file mode 100644 index 000000000..32ea9b941 --- /dev/null +++ b/birdhouse/optional-components/prometheus-longterm-metrics/config/monitoring/prometheus.rules @@ -0,0 +1,19 @@ +groups: + - name: longterm-metrics-hourly + interval: 1h + rules: + # percentage of the time, over the last hour, that all CPUs were working + # 1 means all CPUs were working all the time, 0 means they were all idle all the time + - record: instance:cpu_load:avg_rate1h + expr: avg by(instance) (rate(node_cpu_seconds_total{mode!="idle"}[1h])) + labels: + group: longterm-metrics + # total number of bytes that were sent or received over the network in the last hour + - record: instance:network_bytes_transmitted:sum_rate1h + expr: sum by(instance) (rate(node_network_transmit_bytes_total[1h]) + rate(node_network_receive_bytes_total[1h])) + labels: + group: longterm-metrics + - name: longterm-metrics-daily + interval: 24h + rules: + diff --git a/birdhouse/optional-components/prometheus-longterm-metrics/config/monitoring/prometheus.rules.template b/birdhouse/optional-components/prometheus-longterm-metrics/config/monitoring/prometheus.rules.template deleted file mode 100644 index 48a1cd26d..000000000 --- a/birdhouse/optional-components/prometheus-longterm-metrics/config/monitoring/prometheus.rules.template +++ 
/dev/null @@ -1,16 +0,0 @@ -groups: - - name: longterm-metrics - interval: ${PROMETHEUS_LONGTERM_STORE_INTERVAL} - rules: - - record: cpu_instance:cpu_load_irate:avg${PROMETHEUS_LONGTERM_STORE_INTERVAL} - expr: avg by(cpu, instance) (irate(node_cpu_seconds_total{mode!="idle"}[${PROMETHEUS_LONGTERM_STORE_INTERVAL}])) - labels: - group: longterm-metrics - - record: instance:network_bytes_received_irate:sum${PROMETHEUS_LONGTERM_STORE_INTERVAL} - expr: sum by (instance) (irate(node_network_receive_bytes_total[${PROMETHEUS_LONGTERM_STORE_INTERVAL}])) - labels: - group: longterm-metrics - - record: instance:network_bytes_sent_irate:sum${PROMETHEUS_LONGTERM_STORE_INTERVAL} - expr: sum by (instance) (irate(node_network_transmit_bytes_total[${PROMETHEUS_LONGTERM_STORE_INTERVAL}])) - labels: - group: longterm-metrics diff --git a/birdhouse/optional-components/prometheus-longterm-metrics/default.env b/birdhouse/optional-components/prometheus-longterm-metrics/default.env index 2f22ce078..8b9ca5140 100644 --- a/birdhouse/optional-components/prometheus-longterm-metrics/default.env +++ b/birdhouse/optional-components/prometheus-longterm-metrics/default.env @@ -1,13 +1,15 @@ export PROMETHEUS_LONGTERM_RETENTION_TIME=1y -export PROMETHEUS_LONGTERM_STORE_INTERVAL=1h +export PROMETHEUS_LONGTERM_ENABLE_DEFAULT_RULES=True # These are the prometheus defaults export PROMETHEUS_LONGTERM_TSDB_MIN_BLOCK_DURATION=2h export PROMETHEUS_LONGTERM_TSDB_MAX_BLOCK_DURATION=1d12h -OPTIONAL_VARS=" - $OPTIONAL_VARS - \$PROMETHEUS_LONGTERM_STORE_INTERVAL +export PROMETHEUS_LONGTERM_RULES_FILE='$([ "${PROMETHEUS_LONGTERM_ENABLE_DEFAULT_RULES}" = "True" ] && echo prometheus.rules || echo prometheus.null.rules)' + +export DELAYED_EVAL=" + $DELAYED_EVAL + PROMETHEUS_LONGTERM_RULES_FILE " COMPONENT_DEPENDENCIES=" diff --git a/birdhouse/optional-components/thanos/default.env b/birdhouse/optional-components/thanos/default.env index ba1ed726c..1defc4cbd 100644 --- 
a/birdhouse/optional-components/thanos/default.env +++ b/birdhouse/optional-components/thanos/default.env @@ -19,8 +19,9 @@ export __DEFAULT__MINIO_ROOT_PASSWORD=minioadmin export MINIO_ROOT_USER="${__DEFAULT__MINIO_ROOT_USER}" export MINIO_ROOT_PASSWORD="${__DEFAULT__MINIO_ROOT_PASSWORD}" -# Set a schedule to run the compactor. This should be significantly larger than the PROMETHEUS_LONGTERM_STORE_INTERVAL -export THANOS_COMPACTOR_WAIT_INTERVAL=24h +# Set a schedule to run the compactor. This should be at least double the largest longterm-metrics interval. +# eg. if thanos is collecting a metric that is calculated every 24h (daily) then this value should be at least 48h +export THANOS_COMPACTOR_WAIT_INTERVAL=48h # The longterm data retention time can be shortened back to the default since Thanos is now responsible for # storing longterm data, not the prometheus-longterm-metrics component. From 42f687dbafb04c9b6867c0a95905d3fce22db6ce Mon Sep 17 00:00:00 2001 From: mishaschwartz <4380924+mishaschwartz@users.noreply.github.com> Date: Tue, 18 Jun 2024 11:54:04 -0400 Subject: [PATCH 04/10] review updates --- CHANGES.md | 2 +- birdhouse/env.local.example | 4 +-- birdhouse/optional-components/README.rst | 6 ++--- .../prometheus-longterm-metrics/.gitignore | 1 + .../prometheus-longterm-metrics/default.env | 1 + ...prometheus.yml => prometheus.yml.template} | 2 +- .../monitoring.conf.template | 2 +- .../optional-components/thanos/default.env | 25 +++++++++---------- .../thanos/docker-compose-extra.yml | 22 ++++++++-------- 9 files changed, 33 insertions(+), 32 deletions(-) rename birdhouse/optional-components/prometheus-longterm-metrics/{prometheus.yml => prometheus.yml.template} (84%) diff --git a/CHANGES.md b/CHANGES.md index 0a21c1eec..69e45127d 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -40,7 +40,7 @@ ``optional-components/prometheus-longterm-metrics`` component. 
Data will be collected from the ``prometheus-longterm-metrics`` and stored in an S3 object store indefinitely. - When enabling this component, please change the default values for the ``MINIO_ROOT_USER`` and ``MINIO_ROOT_PASSWORD`` + When enabling this component, please change the default values for the ``THANOS_MINIO_ROOT_USER`` and ``THANOS_MINIO_ROOT_PASSWORD`` by updating the ``env.local`` file. These set the login credentials for the root user that runs the [minio](https://min.io/) object store. diff --git a/birdhouse/env.local.example b/birdhouse/env.local.example index d3c3eabb5..28a2f7059 100644 --- a/birdhouse/env.local.example +++ b/birdhouse/env.local.example @@ -579,8 +579,8 @@ export THREDDS_ADDITIONAL_CATALOG="" # Below are for the thanos optional component # Change these from the default for added security -#export MINIO_ROOT_USER="${__DEFAULT__MINIO_ROOT_USER}" -#export MINIO_ROOT_PASSWORD="${__DEFAULT__MINIO_ROOT_PASSWORD}" +#export THANOS_MINIO_ROOT_USER="${__DEFAULT__THANOS_MINIO_ROOT_USER}" +#export THANOS_MINIO_ROOT_PASSWORD="${__DEFAULT__THANOS_MINIO_ROOT_PASSWORD}" ############################################################################# # Emu optional vars diff --git a/birdhouse/optional-components/README.rst b/birdhouse/optional-components/README.rst index b59b33855..7a4bf4ec6 100644 --- a/birdhouse/optional-components/README.rst +++ b/birdhouse/optional-components/README.rst @@ -471,9 +471,9 @@ This enables better storage of longterm metrics collected by the ``optional-comp component. Data will be collected from the ``prometheus-longterm-metrics`` and stored in an S3 object store indefinitely. -When enabling this component, please change the default values for the ``MINIO_ROOT_USER`` and ``MINIO_ROOT_PASSWORD`` -by updating the ``env.local`` file. These set the login credentials for the root user that runs the minio_ object -store. 
+When enabling this component, please change the default values for the ``THANOS_MINIO_ROOT_USER`` and +``THANOS_MINIO_ROOT_PASSWORD`` by updating the ``env.local`` file. These set the login credentials for the root user +that runs the minio_ object store. Enabling this component creates the additional endpoints: * ``/thanos-query``: a prometheus-like query interface to inspect the data stored by thanos diff --git a/birdhouse/optional-components/prometheus-longterm-metrics/.gitignore b/birdhouse/optional-components/prometheus-longterm-metrics/.gitignore index 97ac1a63e..b7813ee7b 100644 --- a/birdhouse/optional-components/prometheus-longterm-metrics/.gitignore +++ b/birdhouse/optional-components/prometheus-longterm-metrics/.gitignore @@ -1,2 +1,3 @@ +prometheus.yml config/magpie/config.yml config/proxy/conf.extra-service.d/monitoring.conf diff --git a/birdhouse/optional-components/prometheus-longterm-metrics/default.env b/birdhouse/optional-components/prometheus-longterm-metrics/default.env index 8b9ca5140..0b1d19378 100644 --- a/birdhouse/optional-components/prometheus-longterm-metrics/default.env +++ b/birdhouse/optional-components/prometheus-longterm-metrics/default.env @@ -1,5 +1,6 @@ export PROMETHEUS_LONGTERM_RETENTION_TIME=1y export PROMETHEUS_LONGTERM_ENABLE_DEFAULT_RULES=True +export PROMETHEUS_LONGTERM_SCRAPE_INTERVAL=1h # These are the prometheus defaults export PROMETHEUS_LONGTERM_TSDB_MIN_BLOCK_DURATION=2h diff --git a/birdhouse/optional-components/prometheus-longterm-metrics/prometheus.yml b/birdhouse/optional-components/prometheus-longterm-metrics/prometheus.yml.template similarity index 84% rename from birdhouse/optional-components/prometheus-longterm-metrics/prometheus.yml rename to birdhouse/optional-components/prometheus-longterm-metrics/prometheus.yml.template index ba089ac3e..5ca24f003 100644 --- a/birdhouse/optional-components/prometheus-longterm-metrics/prometheus.yml +++ 
b/birdhouse/optional-components/prometheus-longterm-metrics/prometheus.yml.template @@ -4,7 +4,7 @@ global: scrape_configs: - job_name: 'federate' - scrape_interval: 15s + scrape_interval: ${PROMETHEUS_LONGTERM_SCRAPE_INTERVAL} honor_labels: true metrics_path: '/prometheus/federate' diff --git a/birdhouse/optional-components/thanos/config/proxy/conf.extra-service.d/monitoring.conf.template b/birdhouse/optional-components/thanos/config/proxy/conf.extra-service.d/monitoring.conf.template index b2aa98ccd..e20d2a99b 100644 --- a/birdhouse/optional-components/thanos/config/proxy/conf.extra-service.d/monitoring.conf.template +++ b/birdhouse/optional-components/thanos/config/proxy/conf.extra-service.d/monitoring.conf.template @@ -10,7 +10,7 @@ auth_request_set $auth_status $upstream_status; rewrite ^/thanos-minio/(.*) /$1 break; - proxy_pass http://minio:9001; + proxy_pass http://thanos-minio:9001; proxy_http_version 1.1; proxy_set_header Upgrade $http_upgrade; diff --git a/birdhouse/optional-components/thanos/default.env b/birdhouse/optional-components/thanos/default.env index 1defc4cbd..4aeae0668 100644 --- a/birdhouse/optional-components/thanos/default.env +++ b/birdhouse/optional-components/thanos/default.env @@ -3,21 +3,21 @@ export THANOS_VERSION=v0.35.1 export THANOS_DOCKER="thanosio/thanos" export THANOS_IMAGE='${THANOS_DOCKER}:${THANOS_VERSION}' -export MINIO_VERSION=RELEASE.2024-05-27T19-17-46Z -export MINIO_DOCKER=minio/minio -export MINIO_IMAGE='${MINIO_DOCKER}:${MINIO_VERSION}' +export THANOS_MINIO_VERSION=RELEASE.2024-05-27T19-17-46Z +export THANOS_MINIO_DOCKER=minio/minio +export THANOS_MINIO_IMAGE='${MINIO_DOCKER}:${MINIO_VERSION}' # Minio uses object storage on disk at this location -export MINIO_DATA_STORE='${BIRDHOUSE_DATA_PERSIST_ROOT}/minio_data/' +export THANOS_MINIO_DATA_STORE='${BIRDHOUSE_DATA_PERSIST_ROOT}/thanos_minio_data/' # Note that bucket names must only contain lowercase ascii, digits, - and . 
export THANOS_MINIO_BUCKET_NAME=thanos-bucket # Minio credentials -export __DEFAULT__MINIO_ROOT_USER=minioadmin -export __DEFAULT__MINIO_ROOT_PASSWORD=minioadmin -export MINIO_ROOT_USER="${__DEFAULT__MINIO_ROOT_USER}" -export MINIO_ROOT_PASSWORD="${__DEFAULT__MINIO_ROOT_PASSWORD}" +export __DEFAULT__THANOS_MINIO_ROOT_USER=minioadmin +export __DEFAULT__THANOS_MINIO_ROOT_PASSWORD=minioadmin +export THANOS_MINIO_ROOT_USER="${__DEFAULT__MINIO_ROOT_USER}" +export THANOS_MINIO_ROOT_PASSWORD="${__DEFAULT__MINIO_ROOT_PASSWORD}" # Set a schedule to run the compactor. This should be at least double the largest longterm-metrics interval. # eg. if thanos is collecting a metric that is calculated every 24h (daily) then this value should be at least 48h @@ -34,18 +34,17 @@ export PROMETHEUS_LONGTERM_TSDB_MAX_BLOCK_DURATION=2h VARS=" $VARS - \$MINIO_ROOT_USER - \$MINIO_ROOT_PASSWORD + \$THANOS_MINIO_ROOT_USER + \$THANOS_MINIO_ROOT_PASSWORD " export DELAYED_EVAL=" $DELAYED_EVAL THANOS_IMAGE - MINIO_IMAGE - MINIO_DATA_STORE + THANOS_MINIO_IMAGE + THANOS_MINIO_DATA_STORE " COMPONENT_DEPENDENCIES=" - ./components/monitoring ./optional-components/prometheus-longterm-metrics " diff --git a/birdhouse/optional-components/thanos/docker-compose-extra.yml b/birdhouse/optional-components/thanos/docker-compose-extra.yml index 1b3d170ee..424404f2b 100644 --- a/birdhouse/optional-components/thanos/docker-compose-extra.yml +++ b/birdhouse/optional-components/thanos/docker-compose-extra.yml @@ -11,9 +11,9 @@ x-objstore-config: &objstore-config | --objstore.config=type: S3 config: bucket: ${THANOS_MINIO_BUCKET_NAME} - access_key: ${MINIO_ROOT_USER} - secret_key: ${MINIO_ROOT_PASSWORD} - endpoint: minio:9000 + access_key: ${THANOS_MINIO_ROOT_USER} + secret_key: ${THANOS_MINIO_ROOT_PASSWORD} + endpoint: thanos-minio:9000 insecure: true # use http instead of https services: @@ -32,7 +32,7 @@ services: - *objstore-config depends_on: - prometheus-longterm-metrics - - minio + - thanos-minio restart: 
always logging: *default-logging @@ -60,15 +60,15 @@ services: - '--wait-interval=${THANOS_COMPACTOR_WAIT_INTERVAL}' - *objstore-config depends_on: - - minio + - thanos-minio restart: always logging: *default-logging - minio: - image: ${MINIO_IMAGE} - container_name: minio + thanos-minio: + image: ${THANOS_MINIO_IMAGE} + container_name: thanos-minio volumes: - - ${MINIO_DATA_STORE}:/data + - ${THANOS_MINIO_DATA_STORE}:/data - ./optional-components/thanos/minio-entrypoint:/entrypoint entrypoint: /entrypoint command: @@ -78,8 +78,8 @@ services: - ':9001' - '/data' environment: - - MINIO_ROOT_USER=${MINIO_ROOT_USER} - - MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD} + - MINIO_ROOT_USER=${THANOS_MINIO_ROOT_USER} + - MINIO_ROOT_PASSWORD=${THANOS_MINIO_ROOT_PASSWORD} - MINIO_PROMETHEUS_AUTH_TYPE=public - THANOS_MINIO_BUCKET_NAME=${THANOS_MINIO_BUCKET_NAME} - MINIO_BROWSER_REDIRECT_URL=https://${BIRDHOUSE_FQDN_PUBLIC}/thanos-minio From a921527580a2fb03a4d5b6caae05748ade85cb2f Mon Sep 17 00:00:00 2001 From: mishaschwartz <4380924+mishaschwartz@users.noreply.github.com> Date: Tue, 18 Jun 2024 12:00:40 -0400 Subject: [PATCH 05/10] fix missing default var --- birdhouse/optional-components/thanos/default.env | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/birdhouse/optional-components/thanos/default.env b/birdhouse/optional-components/thanos/default.env index 4aeae0668..f41930b10 100644 --- a/birdhouse/optional-components/thanos/default.env +++ b/birdhouse/optional-components/thanos/default.env @@ -16,8 +16,8 @@ export THANOS_MINIO_BUCKET_NAME=thanos-bucket # Minio credentials export __DEFAULT__THANOS_MINIO_ROOT_USER=minioadmin export __DEFAULT__THANOS_MINIO_ROOT_PASSWORD=minioadmin -export THANOS_MINIO_ROOT_USER="${__DEFAULT__MINIO_ROOT_USER}" -export THANOS_MINIO_ROOT_PASSWORD="${__DEFAULT__MINIO_ROOT_PASSWORD}" +export THANOS_MINIO_ROOT_USER="${__DEFAULT__THANOS_MINIO_ROOT_USER}" +export THANOS_MINIO_ROOT_PASSWORD="${__DEFAULT__THANOS_MINIO_ROOT_PASSWORD}" 
# Set a schedule to run the compactor. This should be at least double the largest longterm-metrics interval. # eg. if thanos is collecting a metric that is calculated every 24h (daily) then this value should be at least 48h From 030799609ffc2ee90bd6b5e4b288428ad87007ea Mon Sep 17 00:00:00 2001 From: mishaschwartz <4380924+mishaschwartz@users.noreply.github.com> Date: Tue, 18 Jun 2024 12:04:57 -0400 Subject: [PATCH 06/10] bug fixes --- .../config/monitoring/prometheus.rules | 4 ---- .../prometheus-longterm-metrics/default.env | 5 +++++ birdhouse/optional-components/thanos/default.env | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/birdhouse/optional-components/prometheus-longterm-metrics/config/monitoring/prometheus.rules b/birdhouse/optional-components/prometheus-longterm-metrics/config/monitoring/prometheus.rules index 32ea9b941..465a8f20e 100644 --- a/birdhouse/optional-components/prometheus-longterm-metrics/config/monitoring/prometheus.rules +++ b/birdhouse/optional-components/prometheus-longterm-metrics/config/monitoring/prometheus.rules @@ -13,7 +13,3 @@ groups: expr: sum by(instance) (rate(node_network_transmit_bytes_total[1h]) + rate(node_network_receive_bytes_total[1h])) labels: group: longterm-metrics - - name: longterm-metrics-daily - interval: 24h - rules: - diff --git a/birdhouse/optional-components/prometheus-longterm-metrics/default.env b/birdhouse/optional-components/prometheus-longterm-metrics/default.env index 0b1d19378..75c6c46d3 100644 --- a/birdhouse/optional-components/prometheus-longterm-metrics/default.env +++ b/birdhouse/optional-components/prometheus-longterm-metrics/default.env @@ -8,6 +8,11 @@ export PROMETHEUS_LONGTERM_TSDB_MAX_BLOCK_DURATION=1d12h export PROMETHEUS_LONGTERM_RULES_FILE='$([ "${PROMETHEUS_LONGTERM_ENABLE_DEFAULT_RULES}" = "True" ] && echo prometheus.rules || echo prometheus.null.rules)' +OPTIONAL_VARS=" + $OPTIONAL_VARS + \$PROMETHEUS_LONGTERM_SCRAPE_INTERVAL +" + export DELAYED_EVAL=" $DELAYED_EVAL 
PROMETHEUS_LONGTERM_RULES_FILE diff --git a/birdhouse/optional-components/thanos/default.env b/birdhouse/optional-components/thanos/default.env index f41930b10..5780b5523 100644 --- a/birdhouse/optional-components/thanos/default.env +++ b/birdhouse/optional-components/thanos/default.env @@ -5,7 +5,7 @@ export THANOS_IMAGE='${THANOS_DOCKER}:${THANOS_VERSION}' export THANOS_MINIO_VERSION=RELEASE.2024-05-27T19-17-46Z export THANOS_MINIO_DOCKER=minio/minio -export THANOS_MINIO_IMAGE='${MINIO_DOCKER}:${MINIO_VERSION}' +export THANOS_MINIO_IMAGE='${THANOS_MINIO_DOCKER}:${THANOS_MINIO_VERSION}' # Minio uses object storage on disk at this location export THANOS_MINIO_DATA_STORE='${BIRDHOUSE_DATA_PERSIST_ROOT}/thanos_minio_data/' From 2eab8b7aa95e94e631813b2e85d419c91a075613 Mon Sep 17 00:00:00 2001 From: mishaschwartz <4380924+mishaschwartz@users.noreply.github.com> Date: Tue, 25 Jun 2024 16:16:11 -0400 Subject: [PATCH 07/10] make longterm monitoring deployable on a different server --- birdhouse/optional-components/README.rst | 18 +++++++++++++----- .../config/monitoring/docker-compose-extra.yml | 6 ------ .../config/monitoring/prometheus.null.rules | 4 ---- .../prometheus-longterm-metrics/default.env | 17 ++++++++++++----- .../docker-compose-extra.yml | 2 +- .../prometheus.yml.template | 3 +-- .../config/monitoring/docker-compose-extra.yml | 6 ++++++ .../config/monitoring/prometheus.rules | 0 8 files changed, 33 insertions(+), 23 deletions(-) delete mode 100644 birdhouse/optional-components/prometheus-longterm-metrics/config/monitoring/docker-compose-extra.yml delete mode 100644 birdhouse/optional-components/prometheus-longterm-metrics/config/monitoring/prometheus.null.rules create mode 100644 birdhouse/optional-components/prometheus-longterm-rules/config/monitoring/docker-compose-extra.yml rename birdhouse/optional-components/{prometheus-longterm-metrics => prometheus-longterm-rules}/config/monitoring/prometheus.rules (100%) diff --git 
a/birdhouse/optional-components/README.rst b/birdhouse/optional-components/README.rst index 7a4bf4ec6..583cbe169 100644 --- a/birdhouse/optional-components/README.rst +++ b/birdhouse/optional-components/README.rst @@ -451,12 +451,11 @@ This is a second prometheus instance that collects longterm monitoring metrics f (the one created by the ``components/monitoring`` component). Longterm metrics are any prometheus rule that have the label ``group: longterm-metrics`` or in other words are -selectable using prometheus' ``'{group="longterm-metrics"}'`` query filter. To see which longterm metric rules are -added by default see the ``optional-components/prometheus-longterm-metrics/config/monitoring/prometheus.rules.template``. +selectable using prometheus' ``'{group="longterm-metrics"}'`` query filter. To add some default longterm metrics rules +also enable the ``prometheus-longterm-rules`` component. -If you do not want the default longterm-metric rules included, set the ``PROMETHEUS_LONGTERM_RULES_FILE`` to anything -other than ``True`` in your ``env.local`` file. You may want to do this if you've created your own set of rules in -another component that you would like to use instead of the default ones. +You may also choose to create your own set of rules in another component that you would like to use instead of the +default ones. To configure this component: @@ -464,6 +463,15 @@ To configure this component: Enabling this component creates the additional endpoint ``/prometheus-longterm-metrics``. +Prometheus Long-term Rules +-------------------------- + +This adds some default longterm metrics rules to the `prometheus` component for use by the `prometheus-longterm-metrics` +component. These rules all have the label ``group: longterm-metrics``. + +To see which rules are added, check out the +`optional-components/prometheus-longterm-rules/config/monitoring/prometheus.rules` file. 
+ Thanos ------ diff --git a/birdhouse/optional-components/prometheus-longterm-metrics/config/monitoring/docker-compose-extra.yml b/birdhouse/optional-components/prometheus-longterm-metrics/config/monitoring/docker-compose-extra.yml deleted file mode 100644 index 3eaa00e21..000000000 --- a/birdhouse/optional-components/prometheus-longterm-metrics/config/monitoring/docker-compose-extra.yml +++ /dev/null @@ -1,6 +0,0 @@ -version: "3.4" - -services: - prometheus: - volumes: - - ./optional-components/prometheus-longterm-metrics/config/monitoring/${PROMETHEUS_LONGTERM_RULES_FILE}:/etc/prometheus/prometheus-longterm-metrics.rules:ro diff --git a/birdhouse/optional-components/prometheus-longterm-metrics/config/monitoring/prometheus.null.rules b/birdhouse/optional-components/prometheus-longterm-metrics/config/monitoring/prometheus.null.rules deleted file mode 100644 index a885bf2eb..000000000 --- a/birdhouse/optional-components/prometheus-longterm-metrics/config/monitoring/prometheus.null.rules +++ /dev/null @@ -1,4 +0,0 @@ -# This file is intentionally left blank in order to allow a user to choose whether to enable the default rules that are -# set in the prometheus.rules file. -# By setting the PROMETHEUS_LONGTERM_ENABLE_DEFAULT_RULES environment variable to True, the rules in prometheus.rules -# will be added. By setting that value to anything else, this file will be added instead. 
diff --git a/birdhouse/optional-components/prometheus-longterm-metrics/default.env b/birdhouse/optional-components/prometheus-longterm-metrics/default.env index 75c6c46d3..3e3b5aa7e 100644 --- a/birdhouse/optional-components/prometheus-longterm-metrics/default.env +++ b/birdhouse/optional-components/prometheus-longterm-metrics/default.env @@ -1,23 +1,30 @@ +export PROMETHEUS_LONGTERM_VERSION='${PROMETHEUS_VERSION:-"v2.52.0"}' +export PROMETHEUS_LONGTERM_DOCKER='${PROMETHEUS_DOCKER:-prom/prometheus}' +export PROMETHEUS_LONGTERM_IMAGE='${PROMETHEUS_LONGTERM_DOCKER}:${PROMETHEUS_LONGTERM_VERSION}' + export PROMETHEUS_LONGTERM_RETENTION_TIME=1y -export PROMETHEUS_LONGTERM_ENABLE_DEFAULT_RULES=True export PROMETHEUS_LONGTERM_SCRAPE_INTERVAL=1h # These are the prometheus defaults export PROMETHEUS_LONGTERM_TSDB_MIN_BLOCK_DURATION=2h export PROMETHEUS_LONGTERM_TSDB_MAX_BLOCK_DURATION=1d12h -export PROMETHEUS_LONGTERM_RULES_FILE='$([ "${PROMETHEUS_LONGTERM_ENABLE_DEFAULT_RULES}" = "True" ] && echo prometheus.rules || echo prometheus.null.rules)' +# These are the targets that the longterm prometheus instance scrapes (federates) metrics from +export PROMETHEUS_LONGTERM_TARGETS='["prometheus:9090"]' # yaml list syntax OPTIONAL_VARS=" $OPTIONAL_VARS \$PROMETHEUS_LONGTERM_SCRAPE_INTERVAL + \$PROMETHEUS_LONGTERM_TARGETS " export DELAYED_EVAL=" $DELAYED_EVAL + PROMETHEUS_LONGTERM_VERSION + PROMETHEUS_LONGTERM_DOCKER + PROMETHEUS_LONGTERM_IMAGE PROMETHEUS_LONGTERM_RULES_FILE " -COMPONENT_DEPENDENCIES=" - ./components/monitoring -" +# Note that this component does not depend explicitly on the `components/monitoring` component so that this can +# theoretically be deployed on a different machine than the `prometheus` service. This is currently untested.
diff --git a/birdhouse/optional-components/prometheus-longterm-metrics/docker-compose-extra.yml b/birdhouse/optional-components/prometheus-longterm-metrics/docker-compose-extra.yml index 49c6f152c..426d4d0ef 100644 --- a/birdhouse/optional-components/prometheus-longterm-metrics/docker-compose-extra.yml +++ b/birdhouse/optional-components/prometheus-longterm-metrics/docker-compose-extra.yml @@ -9,7 +9,7 @@ x-logging: services: prometheus-longterm-metrics: - image: ${PROMETHEUS_IMAGE} + image: ${PROMETHEUS_LONGTERM_IMAGE} container_name: prometheus-longterm-metrics volumes: - ./optional-components/prometheus-longterm-metrics/prometheus.yml:/etc/prometheus/prometheus.yml:ro diff --git a/birdhouse/optional-components/prometheus-longterm-metrics/prometheus.yml.template b/birdhouse/optional-components/prometheus-longterm-metrics/prometheus.yml.template index 5ca24f003..c0ade1ba1 100644 --- a/birdhouse/optional-components/prometheus-longterm-metrics/prometheus.yml.template +++ b/birdhouse/optional-components/prometheus-longterm-metrics/prometheus.yml.template @@ -14,5 +14,4 @@ scrape_configs: - '{group="longterm-metrics"}' static_configs: - - targets: - - 'prometheus:9090' + - targets: ${PROMETHEUS_LONGTERM_TARGETS} diff --git a/birdhouse/optional-components/prometheus-longterm-rules/config/monitoring/docker-compose-extra.yml b/birdhouse/optional-components/prometheus-longterm-rules/config/monitoring/docker-compose-extra.yml new file mode 100644 index 000000000..0f701b30b --- /dev/null +++ b/birdhouse/optional-components/prometheus-longterm-rules/config/monitoring/docker-compose-extra.yml @@ -0,0 +1,6 @@ +version: "3.4" + +services: + prometheus: + volumes: + - ./optional-components/prometheus-longterm-rules/config/monitoring/prometheus.rules:/etc/prometheus/prometheus-longterm-metrics.rules:ro diff --git a/birdhouse/optional-components/prometheus-longterm-metrics/config/monitoring/prometheus.rules 
b/birdhouse/optional-components/prometheus-longterm-rules/config/monitoring/prometheus.rules similarity index 100% rename from birdhouse/optional-components/prometheus-longterm-metrics/config/monitoring/prometheus.rules rename to birdhouse/optional-components/prometheus-longterm-rules/config/monitoring/prometheus.rules From 3f75d496b932ab4d90857206003d0225cd20c435 Mon Sep 17 00:00:00 2001 From: mishaschwartz <4380924+mishaschwartz@users.noreply.github.com> Date: Mon, 16 Sep 2024 14:21:49 -0400 Subject: [PATCH 08/10] documentation updates and remove unused references to variables --- CHANGES.md | 9 +++------ birdhouse/components/README.rst | 18 ++++++++++++++++++ birdhouse/optional-components/README.rst | 10 ++++++++-- .../prometheus-longterm-metrics/default.env | 1 - 4 files changed, 29 insertions(+), 9 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 6d0529c7a..6e4b5261a 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -27,9 +27,6 @@ added by default see the ``optional-components/prometheus-longterm-metrics/config/monitoring/prometheus.rules.template`` file. - If you do not want the default longterm-metric rules included, set the ``PROMETHEUS_LONGTERM_RULES_FILE`` to anything - other than ``True`` in your ``env.local`` file. - To configure this component: * update the ``PROMETHEUS_LONGTERM_RETENTION_TIME`` variable to set how long the data will be kept by prometheus @@ -44,13 +41,13 @@ by updating the ``env.local`` file. These set the login credentials for the root user that runs the [minio](https://min.io/) object store. - Enabling the `thanos` component creates the additional endpoints: +- Enabling the `thanos` component creates the additional endpoints: * ``/thanos-query``: a prometheus-like query interface to inspect the data stored by thanos * ``/thanos-minio``: a minio web console to inspect the data stored by minio. - This also includes an update to the prometheus version from `v2.19.0` to the current latest `v2.52.0`. 
This is to - required to support the interaction between prometheus and thanos. +- Update the prometheus version from `v2.19.0` to the current latest `v2.52.0`. This is required to support the interaction between + prometheus and thanos. [2.5.3](https://github.com/bird-house/birdhouse-deploy/tree/2.5.3) (2024-09-11) ------------------------------------------------------------------------------------------------------------------ diff --git a/birdhouse/components/README.rst b/birdhouse/components/README.rst index f67a9767e..ba6c83de5 100644 --- a/birdhouse/components/README.rst +++ b/birdhouse/components/README.rst @@ -371,6 +371,7 @@ AlertManager for Alert Dashboard and Silencing .. image:: monitoring/images/alertmanager-dashboard.png .. image:: monitoring/images/alertmanager-silence-alert.png +.. _monitoring-customize-the-component: Customizing the Component ------------------------- @@ -389,6 +390,23 @@ Customizing the Component Slack or other services accepting webhooks), ``ALERTMANAGER_EXTRA_RECEIVERS``. +Longterm Storage of Prometheus Metrics +-------------------------------------- + +Prometheus stores metrics for 90 days by default. This may be sufficient for some use cases but you may wish to store +some metrics for longer. In order to store certain metrics for longer than 90 days, you can enable the following +additional components: + +- :ref:`prometheus-longterm-metrics`: a second Prometheus instance used to collect the metrics that you want to store longterm +- :ref:`thanos`: a service that enables more efficient storage of the metrics collected by the :ref:`prometheus-longterm-metrics` + component. + +.. note:: + A separate prometheus instance is necessary since the retention time for prometheus metrics is set at the + instance level.
This means that increasing the retention time must be done for all metrics at once which is undesirable + because you probably don't need to store every metric for a long period of time and you'll end up using a lot more + disk space than needed. + Weaver ====== diff --git a/birdhouse/optional-components/README.rst b/birdhouse/optional-components/README.rst index 583cbe169..8e651befd 100644 --- a/birdhouse/optional-components/README.rst +++ b/birdhouse/optional-components/README.rst @@ -444,6 +444,8 @@ How to enable X-Robots-Tag Header in ``env.local`` (a copy from `env.local.examp .. seealso:: See the `env.local.example`_ file for more details about this ``BIRDHOUSE_PROXY_ROOT_LOCATION`` behaviour. +.. _prometheus-longterm-metrics: + Prometheus Long-term Metrics ---------------------------- @@ -454,8 +456,8 @@ Longterm metrics are any prometheus rule that have the label ``group: longterm-m selectable using prometheus' ``'{group="longterm-metrics"}'`` query filter. To add some default longterm metrics rules also enable the ``prometheus-longterm-rules`` component. -You may also choose to create your own set of rules in another component that you would like to use instead of the -default ones. +You may also choose to create your own set of rules instead of, or as well as, the default ones. See how to +:ref:`add additional rules here <monitoring-customize-the-component>`. To configure this component: @@ -463,6 +465,8 @@ To configure this component: Enabling this component creates the additional endpoint ``/prometheus-longterm-metrics``. +.. _prometheus-longterm-rules: + Prometheus Long-term Rules -------------------------- @@ -472,6 +476,8 @@ component. These rules all have the label ``group: longterm-metrics``. To see which rules are added, check out the `optional-components/prometheus-longterm-rules/config/monitoring/prometheus.rules` file. +..
_thanos + Thanos ------ diff --git a/birdhouse/optional-components/prometheus-longterm-metrics/default.env b/birdhouse/optional-components/prometheus-longterm-metrics/default.env index 3e3b5aa7e..8f6d9638a 100644 --- a/birdhouse/optional-components/prometheus-longterm-metrics/default.env +++ b/birdhouse/optional-components/prometheus-longterm-metrics/default.env @@ -23,7 +23,6 @@ export DELAYED_EVAL=" PROMETHEUS_LONGTERM_VERSION PROMETHEUS_LONGTERM_DOCKER PROMETHEUS_LONGTERM_IMAGE - PROMETHEUS_LONGTERM_RULES_FILE " # Note that this component does not depend explicitly on the `components/monitoring` component so that this can From 79a531d8cd6be1ec6634aa44eb7d745ba2fc4909 Mon Sep 17 00:00:00 2001 From: mishaschwartz <4380924+mishaschwartz@users.noreply.github.com> Date: Wed, 2 Oct 2024 08:59:22 -0400 Subject: [PATCH 09/10] update documentation [ci skip] --- birdhouse/components/README.rst | 34 ++++++++++++++++++++++++ birdhouse/optional-components/README.rst | 25 ++++++++++++++++- 2 files changed, 58 insertions(+), 1 deletion(-) diff --git a/birdhouse/components/README.rst b/birdhouse/components/README.rst index ba6c83de5..b187394a4 100644 --- a/birdhouse/components/README.rst +++ b/birdhouse/components/README.rst @@ -400,6 +400,8 @@ additional components: - :ref:`prometheus-longterm-metrics`: a second Prometheus instance used to collect the metrics that you want to store longterm - :ref:`thanos`: a service that enables more efficient storage of the metrics collected by the :ref:`prometheus-longterm-metrics` component. +- :ref:`prometheus-longterm-rules`: adds some example rules to the monitoring Prometheus instance (the one deployed by this `monitoring` + component) that can be stored longterm by the `prometheus-longterm-metrics` component. .. 
note:: A separate prometheus instance is necessary since the retention time for prometheus metrics is set at the @@ -407,6 +409,38 @@ additional components: because you probably don't need to store every metric for a long period of time and you'll end up using a lot more disk space than needed. +If some or all of these additional components are enabled, they interact in the following way to store certain metrics for +longer than 90 days: + +1. + - `recording rules`_ are added to the monitoring Prometheus instance (the one deployed by this `monitoring` component). These + rules are any that have the `longterm-metrics` label. + - The metrics described by these rules are collected/calculated by the monitoring Prometheus instance. The monitoring Prometheus + instance treats these rules the same as + - To enable some example longterm `recording rules`_, enable the :ref:`prometheus-longterm-rules` component. You can also choose + to create your own rules (see :ref:`prometheus-longterm-metrics` for details on how to create these longterm metrics rules). +2. + - The :ref:`prometheus-longterm-metrics` Prometheus instance collects/copies only the rules with the `longterm-metrics` label from the + monitoring Prometheus instance. + - The :ref:`prometheus-longterm-metrics` Prometheus instance stores only these metrics for a custom duration (can be longer than + 90 days). +3. + - The :ref:`thanos` component can be deployed alongside the :ref:`prometheus-longterm-metrics` Prometheus instance in order to store + the metrics that the :ref:`prometheus-longterm-metrics` Prometheus instance has already collected. + - The :ref:`thanos` component collects the metrics collected by the :ref:`prometheus-longterm-metrics` Prometheus instance and + stores them in an S3 object store. 
+ - The :ref:`thanos` object store stores the metrics more efficiently, meaning that metrics can be stored for even longer and they'll + take up less disk space than if they were just stored by the :ref:`prometheus-longterm-metrics` Prometheus instance. + +.. note:: + + It is possible to deploy the :ref:`prometheus-longterm-metrics` Prometheus instance and the :ref:`thanos` instance on a different + machine than the monitoring Prometheus instance. However, note that both the :ref:`prometheus-longterm-metrics` and :ref:`thanos` + components *must* be deployed on the same machine (if both are in use). Also note that this is untested and may require serious + troubleshooting to work properly. + +.. _recording rules: https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/ + Weaver ====== diff --git a/birdhouse/optional-components/README.rst b/birdhouse/optional-components/README.rst index 8e651befd..fca9c8333 100644 --- a/birdhouse/optional-components/README.rst +++ b/birdhouse/optional-components/README.rst @@ -449,7 +449,7 @@ How to enable X-Robots-Tag Header in ``env.local`` (a copy from `env.local.examp Prometheus Long-term Metrics ---------------------------- -This is a second prometheus instance that collects longterm monitoring metrics from the original prometheus instance +This is a second prometheus instance that collects longterm monitoring metrics from the monitoring Prometheus instance (the one created by the ``components/monitoring`` component). 
Longterm metrics are any prometheus rule that have the label ``group: longterm-metrics`` or in other words are @@ -463,6 +463,24 @@ To configure this component: * update the ``PROMETHEUS_LONGTERM_RETENTION_TIME`` variable to set how long the data will be kept by prometheus +If the monitoring Prometheus instance that this Prometheus instance is tracking is not deployed on the same machine +(or at a non-default network address on the same machine), you may configure the network location of the monitoring +Prometheus instance by setting the ``PROMETHEUS_LONGTERM_TARGETS`` variable. For example, if the monitoring Prometheus +instance's API is available at `https://example.com/prometheus:9090` then you can set the variable: + +.. code:: + + export PROMETHEUS_LONGTERM_TARGETS='["https://example.com/prometheus:9090"]' + +.. note:: + + You may list multiple monitoring Prometheus instances to track in this way by adding more URLs to the list. + +.. warning:: + + Deploying the longterm metrics Prometheus instance on a separate machine from the monitoring Prometheus component + is untested and may require serious troubleshooting to work properly. + Enabling this component creates the additional endpoint ``/prometheus-longterm-metrics``. .. _prometheus-longterm-rules @@ -493,4 +511,9 @@ Enabling this component creates the additional endpoints: * ``/thanos-query``: a prometheus-like query interface to inspect the data stored by thanos * ``/thanos-minio``: a minio_ web console to inspect the data stored by minio_. +.. note:: + + The `thanos` component must be deployed on the same machine as the `prometheus-longterm-metrics` component since + `thanos` needs access to the data stored by prometheus on disk (in docker this is achieved by sharing a named volume). + ..
_minio: https://min.io/ From 59f6c6819ff6529b5ce2c43b8b04a723d5bfd437 Mon Sep 17 00:00:00 2001 From: mishaschwartz <4380924+mishaschwartz@users.noreply.github.com> Date: Mon, 11 Nov 2024 09:33:38 -0500 Subject: [PATCH 10/10] documentation update [ci skip] --- birdhouse/components/README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/birdhouse/components/README.rst b/birdhouse/components/README.rst index b187394a4..b816f2c74 100644 --- a/birdhouse/components/README.rst +++ b/birdhouse/components/README.rst @@ -416,7 +416,7 @@ longer than 90 days: - `recording rules`_ are added to the monitoring Prometheus instance (the one deployed by this `monitoring` component). These rules are any that have the `longterm-metrics` label. - The metrics described by these rules are collected/calculated by the monitoring Prometheus instance. The monitoring Prometheus - instance treats these rules the same as + instance treats these rules the same as any other (ie. only stores them for 90 days by default). - To enable some example longterm `recording rules`_, enable the :ref:`prometheus-longterm-rules` component. You can also choose to create your own rules (see :ref:`prometheus-longterm-metrics` for details on how to create these longterm metrics rules). 2.