diff --git a/pkg/query/querier.go b/pkg/query/querier.go index fa8f0850c0..0789c3875a 100644 --- a/pkg/query/querier.go +++ b/pkg/query/querier.go @@ -210,7 +210,7 @@ func newQuerierWithOpts( partialResponseStrategy := storepb.PartialResponseStrategy_ABORT if opts.GroupReplicaPartialResponseStrategy { - level.Info(logger).Log("msg", "Enabled group-replica partial response strategy in newQuerierInternal") + level.Debug(logger).Log("msg", "Enabled group-replica partial response strategy in newQuerierInternal") partialResponseStrategy = storepb.PartialResponseStrategy_GROUP_REPLICA } else if partialResponse { partialResponseStrategy = storepb.PartialResponseStrategy_WARN diff --git a/pkg/store/proxy.go b/pkg/store/proxy.go index 28db933d33..335592bd90 100644 --- a/pkg/store/proxy.go +++ b/pkg/store/proxy.go @@ -102,8 +102,9 @@ type ProxyStore struct { } type proxyStoreMetrics struct { - emptyStreamResponses prometheus.Counter - storeFailureCount *prometheus.CounterVec + emptyStreamResponses prometheus.Counter + storeFailureCount *prometheus.CounterVec + missingBlockFileErrorCount prometheus.Counter } func newProxyStoreMetrics(reg prometheus.Registerer) *proxyStoreMetrics { @@ -117,6 +118,10 @@ func newProxyStoreMetrics(reg prometheus.Registerer) *proxyStoreMetrics { Name: "thanos_proxy_store_failure_total", Help: "Total number of store failures.", }, []string{"group", "replica"}) + m.missingBlockFileErrorCount = promauto.With(reg).NewCounter(prometheus.CounterOpts{ + Name: "thanos_proxy_querier_missing_block_file_error_total", + Help: "Total number of missing block file errors.", + }) return &m } @@ -419,24 +424,31 @@ func (s *ProxyStore) Series(originalRequest *storepb.SeriesRequest, srv storepb. resp := respHeap.At() if resp.GetWarning() != "" { - totalFailedStores++ maxWarningBytes := 2000 warning := resp.GetWarning()[:min(maxWarningBytes, len(resp.GetWarning()))] level.Error(s.logger).Log("msg", "Store failure with warning", "warning", warning) // Don't have group/replica keys here, so we can't attribute the warning to a specific store. s.metrics.storeFailureCount.WithLabelValues("", "").Inc() if r.PartialResponseStrategy == storepb.PartialResponseStrategy_GROUP_REPLICA { - // TODO: attribute the warning to the store(group key and replica key) that produced it. - // Each client streams a sequence of time series, so it's not trivial to attribute the warning to a specific client. - if totalFailedStores > 1 { - level.Error(reqLogger).Log("msg", "more than one stores had warnings") - // If we don't know which store has failed, we can tolerate at most one failed store. - if firstWarning != nil { - warning += "; " + *firstWarning + if strings.Contains(resp.GetWarning(), "The specified key does not exist") { + level.Warn(s.logger).Log("msg", "Ignore 'the specified key does not exist' error from Store") + // Ignore this error for now because we know the missing block file is already deleted by compactor. + // There is no other reason for this error to occur. + s.metrics.missingBlockFileErrorCount.Inc() + } else { + totalFailedStores++ + // TODO: attribute the warning to the store(group key and replica key) that produced it. + // Each client streams a sequence of time series, so it's not trivial to attribute the warning to a specific client. + if totalFailedStores > 1 { + level.Error(reqLogger).Log("msg", "more than one stores had warnings") + // If we don't know which store has failed, we can tolerate at most one failed store. + if firstWarning != nil { + warning += "; " + *firstWarning + } + return status.Error(codes.Aborted, warning) } - return status.Error(codes.Aborted, warning) + firstWarning = &warning } - firstWarning = &warning } else if r.PartialResponseDisabled || r.PartialResponseStrategy == storepb.PartialResponseStrategy_ABORT { return status.Error(codes.Aborted, resp.GetWarning()) }