merge upstream master

Signed-off-by: Ben Ye <[email protected]>
cortexproject · Nov 7, 2023 · f1b4c1b · f1b4c1b
2 parents 21e8366 + c542d74
commit f1b4c1b
Show file tree

Hide file tree

Showing 2,179 changed files with 239,537 additions and 86,592 deletions.
diff --git a/.github/workflows/test-build-deploy.yml b/.github/workflows/test-build-deploy.yml
@@ -11,7 +11,7 @@ jobs:
   lint:
     runs-on: ubuntu-20.04
     container:
-      image: quay.io/cortexproject/build-image:upgrade-to-go-1.20.4-6025f83e5
+      image: quay.io/cortexproject/build-image:update-go-1.21.3-e38685e50
     steps:
       - name: Checkout Repo
         uses: actions/checkout@v2
@@ -40,7 +40,7 @@ jobs:
   test:
     runs-on: ubuntu-20.04
     container:
-      image: quay.io/cortexproject/build-image:upgrade-to-go-1.20.4-6025f83e5
+      image: quay.io/cortexproject/build-image:update-go-1.21.3-e38685e50
     steps:
       - name: Checkout Repo
         uses: actions/checkout@v2
@@ -59,7 +59,7 @@ jobs:
   build:
     runs-on: ubuntu-20.04
     container:
-      image: quay.io/cortexproject/build-image:upgrade-to-go-1.20.4-6025f83e5
+      image: quay.io/cortexproject/build-image:update-go-1.21.3-e38685e50
     steps:
       - name: Checkout Repo
         uses: actions/checkout@v2
@@ -118,7 +118,7 @@ jobs:
       - name: Upgrade golang
         uses: actions/setup-go@v2
         with:
-          go-version: 1.20.1
+          go-version: 1.21.3
       - name: Checkout Repo
         uses: actions/checkout@v2
       - name: Install Docker Client
@@ -153,7 +153,11 @@ jobs:
             docker pull quay.io/cortexproject/cortex:v1.10.0
             docker pull quay.io/cortexproject/cortex:v1.11.1
             docker pull quay.io/cortexproject/cortex:v1.13.1
+            docker pull quay.io/cortexproject/cortex:v1.13.2
             docker pull quay.io/cortexproject/cortex:v1.14.0
+            docker pull quay.io/cortexproject/cortex:v1.14.1
+            docker pull quay.io/cortexproject/cortex:v1.15.0
+            docker pull quay.io/cortexproject/cortex:v1.15.1
           fi
           docker pull memcached:1.6.1
           docker pull redis:7.0.4-alpine
@@ -189,14 +193,14 @@ jobs:
         run: |
           touch build-image/.uptodate
           MIGRATIONS_DIR=$(pwd)/cmd/cortex/migrations
-          make BUILD_IMAGE=quay.io/cortexproject/build-image:upgrade-to-go-1.20.4-6025f83e5 TTY='' configs-integration-test
+          make BUILD_IMAGE=quay.io/cortexproject/build-image:update-go-1.21.3-e38685e50 TTY='' configs-integration-test
 
   deploy_website:
     needs: [build, test]
     if: (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/tags/')) && github.repository == 'cortexproject/cortex'
     runs-on: ubuntu-20.04
     container:
-      image: quay.io/cortexproject/build-image:upgrade-to-go-1.20.4-6025f83e5
+      image: quay.io/cortexproject/build-image:update-go-1.21.3-e38685e50
     steps:
       - name: Checkout Repo
         uses: actions/checkout@v2
@@ -238,7 +242,7 @@ jobs:
     if: (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/tags/')) && github.repository == 'cortexproject/cortex'
     runs-on: ubuntu-20.04
     container:
-      image: quay.io/cortexproject/build-image:upgrade-to-go-1.20.4-6025f83e5
+      image: quay.io/cortexproject/build-image:update-go-1.21.3-e38685e50
     steps:
       - name: Checkout Repo
         uses: actions/checkout@v2

diff --git a/.gitignore b/.gitignore
@@ -1,7 +1,7 @@
-cmd/test-exporter/test-exporter
-cmd/cortex/cortex
-cmd/query-tee/query-tee
-cmd/thanosconvert/thanosconvert
+cmd/test-exporter/test-exporter-*
+cmd/cortex/cortex-*
+cmd/query-tee/query-tee-*
+cmd/thanosconvert/thanosconvert*
 .uptodate
 .pkg
 .cache

diff --git a/ADOPTERS.md b/ADOPTERS.md
@@ -14,6 +14,7 @@ This is the list of organisations that are using Cortex in **production environm
 * [MayaData](https://mayadata.io/)
 * [Northflank](https://northflank.com/)
 * [Opstrace](https://opstrace.com/)
+* [PITS Globale Datenrettungsdienste](https://www.pitsdatenrettung.de/)
 * [Platform9](https://platform9.com/)
 * [REWE Digital](https://rewe-digital.com/)
 * [SysEleven](https://www.syseleven.de/)

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,102 @@
 
 ## master / unreleased
 
+## 1.16.0 in progress
+
+* [BUGFIX] Querier: Fix querier limiter bug under multiselect. #5627
+* [CHANGE] Ruler: Add `cortex_ruler_rule_group_load_duration_seconds` and `cortex_ruler_rule_group_sync_duration_seconds` metrics. #5609
+* [CHANGE] Ruler: Add contextual info and query statistics to log
+* [FEATURE] Ruler: Add support for disabling rule groups. #5521
+* [FEATURE] Added the flag `-alertmanager.alerts-gc-interval` to configure alert manager alerts Garbage collection interval. #5550
+* [FEATURE] Ruler: Add support for Limit field on RuleGroup. #5528
+* [FEATURE] AlertManager: Add support for Webex, Discord and Telegram Receiver. #5493
+* [FEATURE] Ingester: added `-admin-limit-message` to customize the message contained in limit errors. #5460
+* [FEATURE] AlertManager: Update version to v0.26.0 and bring in Microsoft Teams receiver. #5543
+* [FEATURE] Store Gateway: Support lazy expanded posting optimization. Added new flag `blocks-storage.bucket-store.lazy-expanded-postings-enabled` and new metrics `cortex_bucket_store_lazy_expanded_postings_total`, `cortex_bucket_store_lazy_expanded_posting_size_bytes_total` and `cortex_bucket_store_lazy_expanded_posting_series_overfetched_size_bytes_total`. #5556.
+* [FEATURE] Store Gateway: Added new flag `blocks-storage.bucket-store.series-batch-size` to control how many series to fetch per batch in Store Gateway. #5582.
+* [CHANGE] AlertManager: include reason label in `cortex_alertmanager_notifications_failed_total`. #5409
+* [CHANGE] Query: Set CORS Origin headers for Query API #5388
+* [CHANGE] Updating prometheus/alertmanager from v0.25.0 to v0.25.1-0.20230505130626-263ca5c9438e. This includes the below changes. #5276
+  - Validating new fields on the Webhook AM config, PushOver AM Config and Telegram AM Config.
+  - filtering 5xx Errors in numTotalFailedNotifications metric.
+  - Delete silence respond with 404 when silence is not found.
+  - mark webhook URL as a secret.
+* [CHANGE] Ruler: Added user label to `cortex_ruler_write_requests_total`, `cortex_ruler_write_requests_failed_total`, `cortex_ruler_queries_total`, and `cortex_ruler_queries_failed_total` metrics. #5312
+* [CHANGE] Alertmanager: Validating new fields on the PagerDuty AM config. #5290
+* [CHANGE] Ingester: Creating label `native-histogram-sample` on the `cortex_discarded_samples_total` to keep track of discarded native histogram samples. #5289
+* [CHANGE] Store Gateway: Rename `cortex_bucket_store_cached_postings_compression_time_seconds` to `cortex_bucket_store_cached_postings_compression_time_seconds_total`. #5431
+* [CHANGE] Store Gateway: Rename `cortex_bucket_store_cached_series_fetch_duration_seconds` to `cortex_bucket_store_series_fetch_duration_seconds` and `cortex_bucket_store_cached_postings_fetch_duration_seconds` to `cortex_bucket_store_postings_fetch_duration_seconds`. Add new metric `cortex_bucket_store_chunks_fetch_duration_seconds`. #5448
+* [CHANGE] Store Gateway: Remove `idle_timeout`, `max_conn_age`, `pool_size`, `min_idle_conns` fields for Redis index cache and caching bucket. #5448
+* [CHANGE] Store Gateway: Add flag `-store-gateway.sharding-ring.zone-stable-shuffle-sharding` to enable store gateway to use zone stable shuffle sharding. #5489
+* [CHANGE] Bucket Index: Add `series_max_size` and `chunk_max_size` to bucket index. #5489
+* [CHANGE] StoreGateway: Rename `cortex_bucket_store_chunk_pool_returned_bytes_total` and `cortex_bucket_store_chunk_pool_requested_bytes_total` to `cortex_bucket_store_chunk_pool_operation_bytes_total`. #5552
+* [CHANGE] Query Frontend/Querier: Make build info API disabled by default and add feature flag `api.build-info-enabled` to enable it. #5533
+* [CHANGE] Purger: Do not use S3 tenant kms key when uploading deletion marker. #5575
+* [CHANGE] Ingester: Shipper always uploads compacted blocks. #5625
+* [FEATURE] Store Gateway: Add `max_downloaded_bytes_per_request` to limit max bytes to download per store gateway request.
+* [FEATURE] Added 2 flags `-alertmanager.alertmanager-client.grpc-max-send-msg-size` and `-alertmanager.alertmanager-client.grpc-max-recv-msg-size` to configure alert manager grpc client message size limits. #5338
+* [FEATURE] Query Frontend: Add `cortex_rejected_queries_total` metric for throttled queries. #5356
+* [FEATURE] Querier: Log query stats when querying store gateway. #5376
+* [FEATURE] Querier/StoreGateway: Allow the tenant shard sizes to be a percent of total instances. #5393
+* [FEATURE] Added the flag `-alertmanager.api-concurrency` to configure alert manager api concurrency limit. #5412
+* [FEATURE] Store Gateway: Add `-store-gateway.sharding-ring.keep-instance-in-the-ring-on-shutdown` to skip unregistering instance from the ring in shutdown. #5421
+* [FEATURE] Ruler: Support for filtering rules in the API. #5417
+* [FEATURE] Compactor: Add `-compactor.ring.tokens-file-path` to store generated tokens locally. #5432
+* [FEATURE] Query Frontend: Add `-frontend.retry-on-too-many-outstanding-requests` to re-enqueue 429 requests if there are multiple query-schedulers available. #5496
+* [FEATURE] Store Gateway: Add `-blocks-storage.bucket-store.max-inflight-requests` for store gateways to reject further requests upon reaching the limit. #5553
+* [FEATURE] Store Gateway: Add `cortex_bucket_store_block_load_duration_seconds` histogram to track time to load blocks. #5580
+* [FEATURE] AlertManager: Add `cortex_alertmanager_dispatcher_aggregation_groups` and `cortex_alertmanager_dispatcher_alert_processing_duration_seconds` metrics for dispatcher. #5592
+* [ENHANCEMENT] Distributor/Ingester: Add span on push path #5319
+* [ENHANCEMENT] Support object storage backends for runtime configuration file. #5292
+* [ENHANCEMENT] Query Frontend: Reject subquery with too small step size. #5323
+* [ENHANCEMENT] Compactor: Exposing Thanos accept-malformed-index to Cortex compactor. #5334
+* [ENHANCEMENT] Log: Avoid expensive log.Valuer evaluation for disallowed levels. #5297
+* [ENHANCEMENT] Improving Performance on the API Gzip Handler. #5347
+* [ENHANCEMENT] Dynamodb: Add `puller-sync-time` to allow different pull time for ring. #5357
+* [ENHANCEMENT] Emit querier `max_concurrent` as a metric. #5362
+* [ENHANCEMENT] Avoid sort tokens on lifecycler autoJoin. #5394
+* [ENHANCEMENT] Do not resync blocks in running store gateways during rollout deployment and container restart. #5363
+* [ENHANCEMENT] Store Gateway: Add new metrics `cortex_bucket_store_sent_chunk_size_bytes`, `cortex_bucket_store_postings_size_bytes` and `cortex_bucket_store_empty_postings_total`. #5397
+* [ENHANCEMENT] Add jitter to lifecycler heartbeat. #5404
+* [ENHANCEMENT] Store Gateway: Add config `estimated_max_series_size_bytes` and `estimated_max_chunk_size_bytes` to address data overfetch. #5401
+* [ENHANCEMENT] Distributor/Ingester: Add experimental `-distributor.sign_write_requests` flag to sign the write requests. #5430
+* [ENHANCEMENT] Store Gateway/Querier/Compactor: Handling CMK Access Denied errors. #5420 #5442 #5446
+* [ENHANCEMENT] Store Gateway: Implementing multi level index cache. #5451
+* [ENHANCEMENT] Alertmanager: Add the alert name in error log when it gets throttled. #5456
+* [ENHANCEMENT] Querier: Retry store gateway on different zones when zone awareness is enabled. #5476
+* [ENHANCEMENT] DDBKV: Change metric name from dynamodb_kv_read_capacity_total to dynamodb_kv_consumed_capacity_total and include Delete, Put, Batch dimension. #5481
+* [ENHANCEMENT] Compactor: allow unregisteronshutdown to be configurable. #5503
+* [ENHANCEMENT] Querier: Batch adding series to query limiter to optimize locking. #5505
+* [ENHANCEMENT] Store Gateway: add metric `cortex_bucket_store_chunk_refetches_total` for number of chunk refetches. #5532
+* [ENHANCEMENT] BasicLifeCycler: allow final-sleep during shutdown #5517
+* [ENHANCEMENT] All: Handling CMK Access Denied errors. #5420 #5542
+* [ENHANCEMENT] Querier: Retry store gateway client connection closing gRPC error. #5558
+* [ENHANCEMENT] QueryFrontend: Add generic retry for all APIs. #5561.
+* [ENHANCEMENT] Querier: Check context before notifying scheduler and frontend. #5565
+* [ENHANCEMENT] QueryFrontend: Add metric for number of series requests. #5373
+* [ENHANCEMENT] Store Gateway: Add histogram metrics for total time spent fetching series and chunks per request. #5573
+* [ENHANCEMENT] Store Gateway: Check context in multi level cache. Add `cortex_store_multilevel_index_cache_fetch_duration_seconds` and `cortex_store_multilevel_index_cache_backfill_duration_seconds` to measure fetch and backfill latency. #5596
+* [ENHANCEMENT] Ingester: Added new ingester TSDB metrics `cortex_ingester_tsdb_head_samples_appended_total`, `cortex_ingester_tsdb_head_out_of_order_samples_appended_total`, `cortex_ingester_tsdb_snapshot_replay_error_total`, `cortex_ingester_tsdb_sample_ooo_delta` and `cortex_ingester_tsdb_mmap_chunks_total`. #5624
+* [BUGFIX] Ruler: Validate if rule group can be safely converted back to rule group yaml from protobuf message #5265
+* [BUGFIX] Querier: Convert gRPC `ResourceExhausted` status code from store gateway to 422 limit error. #5286
+* [BUGFIX] Alertmanager: Route web-ui requests to the alertmanager distributor when sharding is enabled. #5293
+* [BUGFIX] Storage: Bucket index updater should ignore meta not found for partial blocks. #5343
+* [BUGFIX] Ring: Add JOINING state to read operation. #5346
+* [BUGFIX] Compactor: Partial block with only visit marker should be deleted even if there is no deletion marker. #5342
+* [BUGFIX] KV: Etcd calls will no longer block indefinitely and will now time out after the DialTimeout period. #5392
+* [BUGFIX] Ring: Allow RF greater than number of zones to select more than one instance per zone #5411
+* [BUGFIX] Store Gateway: Fix bug in store gateway ring comparison logic. #5426
+* [BUGFIX] Ring: Fix bug in consistency of Get func in a scaling zone-aware ring. #5429
+* [BUGFIX] Query Frontend: Fix bug of failing to cancel downstream request context in query frontend v2 mode (query scheduler enabled). #5447
+* [BUGFIX] Alertmanager: Remove the user id from state replication key metric label value. #5453
+* [BUGFIX] Compactor: Avoid cleaner concurrency issues checking global markers before all blocks. #5457
+* [BUGFIX] DDBKV: Disallow instance with older timestamp to update instance with newer timestamp. #5480
+* [BUGFIX] Query Frontend: Handle context error before decoding and merging responses. #5499
+* [BUGFIX] DDBKV: When no change detected in ring, retry the CAS until there is change. #5502
+* [BUGFIX] Fix bug on objstore when configured to use S3 fips endpoints. #5540
+* [BUGFIX] Ruler: Fix bug on ruler where a failure to load a single RuleGroup would prevent rulers from syncing all RuleGroups. #5563
+* [BUGFIX] Store-Gateway and AlertManager: Add a `wait_instance_time_out` to WaitInstanceState context to avoid waiting forever. #5581
+
 ## 1.15.3 2023-06-22
 
 * [BUGFIX] Distributor: Fix potential data corruption in cases of timeout between distributors and ingesters. #5422
@@ -108,6 +204,7 @@
     - `-ingester.chunk-age-jitter`
     - `-ingester.concurrent-flushes`
     - `-ingester.spread-flushes`
+    - `-ingester.chunk-encoding`
     - `-store.*` except `-store.engine` and `-store.max-query-length`
     - `-store.query-chunk-limit` was deprecated and replaced by `-querier.max-fetched-chunks-per-query`
   - `-deletes.*`
@@ -154,6 +251,10 @@
 * [BUGFIX] QueryFrontend/Querier: fixed regression added by #4863 where we stopped compressing the response between querier and query frontend. #4960
 * [BUGFIX] QueryFrontend/Querier: fixed response error to be ungzipped when status code is not 2xx. #4975
 
+### Known issues
+
+- Configsdb: Ruler configs don't work. Remove all configs that use the Prometheus 1.x rule format from the postgres database before upgrading to v1.14.0 (see [5387](https://github.com/cortexproject/cortex/issues/5387))
+
 ## 1.13.0 2022-07-14
 
 * [CHANGE] Changed default for `-ingester.min-ready-duration` from 1 minute to 15 seconds. #4539

diff --git a/Makefile b/Makefile
@@ -122,7 +122,7 @@ build-image/$(UPTODATE): build-image/*
 SUDO := $(shell docker info >/dev/null 2>&1 || echo "sudo -E")
 BUILD_IN_CONTAINER := true
 BUILD_IMAGE ?= $(IMAGE_PREFIX)build-image
-LATEST_BUILD_IMAGE_TAG ?= upgrade-to-go-1.20.4-6025f83e5
+LATEST_BUILD_IMAGE_TAG ?= update-go-1.21.3-e38685e50
 
 # TTY is parameterized to allow Google Cloud Builder to run builds,
 # as it currently disallows TTY devices. This value needs to be overridden

diff --git a/README.md b/README.md
@@ -19,7 +19,8 @@ Cortex provides horizontally scalable, highly available, multi-tenant, long term
 Prometheus sources in a single cluster, allowing untrusted parties to share the same cluster.
 - **Long term storage:** Cortex supports S3, GCS, Swift and Microsoft Azure for long term storage of metric data. This allows you to durably store data for longer than the lifetime of any single machine, and use this data for long term capacity planning.
 
-Cortex is a [CNCF](https://cncf.io) incubation project used in several production systems including [Weave Cloud](https://cloud.weave.works) and [Grafana Cloud](https://grafana.com/cloud).
+Cortex is a [CNCF](https://cncf.io) incubation project used in several production systems including [Amazon Managed Service for Prometheus (AMP)](https://aws.amazon.com/prometheus/).
+
 Cortex is primarily used as a [remote write](https://prometheus.io/docs/operating/configuration/#remote_write) destination for Prometheus, with a Prometheus-compatible query API.
 
 
@@ -38,13 +39,17 @@ should read:
 1. [Getting started with Cortex](https://cortexmetrics.io/docs/getting-started/)
 1. [Information regarding configuring Cortex](https://cortexmetrics.io/docs/configuration/)
 
+There are also individual [guides](https://cortexmetrics.io/docs/guides/) to many tasks.
+Please review the important [security advice](https://cortexmetrics.io/docs/guides/security/) before deploying.
+
 For a guide to contributing to Cortex, see the [contributor guidelines](https://cortexmetrics.io/docs/contributing/).
 
 ## Further reading
 
 To learn more about Cortex, consult the following talks and articles.
 
 ### Talks and articles
+- Apr 2023 KubeCon talk "How to Run a Rock Solid Multi-Tenant Prometheus" ([video](https://youtu.be/Pl5hEoRPLJU), [slides](https://static.sched.com/hosted_files/kccnceu2023/49/Kubecon2023.pptx.pdf))
 - Oct 2022 KubeCon talk "Current State and the Future of Cortex" ([video](https://youtu.be/u1SfBAGWHgQ), [slides](https://static.sched.com/hosted_files/kccncna2022/93/KubeCon%20%2B%20CloudNativeCon%20NA%202022%20PowerPoint%20-%20Cortex.pdf))
 - Oct 2021 KubeCon talk "Cortex: Intro and Production Tips" ([video](https://youtu.be/zNE_kGcUGuI), [slides](https://static.sched.com/hosted_files/kccncna2021/8e/KubeCon%202021%20NA%20Cortex%20Maintainer.pdf))
 - Dec 2020 blog post "[How AWS and Grafana Labs are scaling Cortex for the cloud](https://aws.amazon.com/blogs/opensource/how-aws-and-grafana-labs-are-scaling-cortex-for-the-cloud/)"
@@ -151,16 +156,6 @@ To see meeting calendar:
 There are several commercial services where you can use Cortex
 on-demand:
 
-### Weave Cloud
-
-[Weave Cloud](https://cloud.weave.works) from
-[Weaveworks](https://weave.works) lets you deploy, manage, and monitor
-container-based applications. Sign up at https://cloud.weave.works
-and follow the instructions there. Additional help can also be found
-in the [Weave Cloud documentation](https://www.weave.works/docs/cloud/latest/overview/).
-
-[Instrumenting Your App: Best Practices](https://www.weave.works/docs/cloud/latest/tasks/monitor/best-instrumenting/)
-
 ### Amazon Managed Service for Prometheus (AMP)
 
 [Amazon Managed Service for Prometheus (AMP)](https://aws.amazon.com/prometheus/) is a Prometheus-compatible monitoring service that makes it easy to monitor containerized applications at scale. It is a highly available, secure, and managed monitoring service for your containers. Get started [here](https://console.aws.amazon.com/prometheus/home). To learn more about AMP, reference our [documentation](https://docs.aws.amazon.com/prometheus/latest/userguide/what-is-Amazon-Managed-Service-Prometheus.html) and [Getting Started with AMP blog](https://aws.amazon.com/blogs/mt/getting-started-amazon-managed-service-for-prometheus/).