Skip to content

Commit

Permalink
Querier ignore 'The specified key does not exist' error (#97)
Browse files Browse the repository at this point in the history
  • Loading branch information
hczhu-db authored Nov 8, 2024
2 parents 56cfce8 + 0f908d3 commit 0bf9700
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 13 deletions.
2 changes: 1 addition & 1 deletion pkg/query/querier.go
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ func newQuerierWithOpts(

partialResponseStrategy := storepb.PartialResponseStrategy_ABORT
if opts.GroupReplicaPartialResponseStrategy {
level.Info(logger).Log("msg", "Enabled group-replica partial response strategy in newQuerierInternal")
level.Debug(logger).Log("msg", "Enabled group-replica partial response strategy in newQuerierInternal")
partialResponseStrategy = storepb.PartialResponseStrategy_GROUP_REPLICA
} else if partialResponse {
partialResponseStrategy = storepb.PartialResponseStrategy_WARN
Expand Down
36 changes: 24 additions & 12 deletions pkg/store/proxy.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,9 @@ type ProxyStore struct {
}

type proxyStoreMetrics struct {
emptyStreamResponses prometheus.Counter
storeFailureCount *prometheus.CounterVec
emptyStreamResponses prometheus.Counter
storeFailureCount *prometheus.CounterVec
missingBlockFileErrorCount prometheus.Counter
}

func newProxyStoreMetrics(reg prometheus.Registerer) *proxyStoreMetrics {
Expand All @@ -117,6 +118,10 @@ func newProxyStoreMetrics(reg prometheus.Registerer) *proxyStoreMetrics {
Name: "thanos_proxy_store_failure_total",
Help: "Total number of store failures.",
}, []string{"group", "replica"})
m.missingBlockFileErrorCount = promauto.With(reg).NewCounter(prometheus.CounterOpts{
Name: "thanos_proxy_querier_missing_block_file_error_total",
Help: "Total number of missing block file errors.",
})

return &m
}
Expand Down Expand Up @@ -419,24 +424,31 @@ func (s *ProxyStore) Series(originalRequest *storepb.SeriesRequest, srv storepb.
resp := respHeap.At()

if resp.GetWarning() != "" {
totalFailedStores++
maxWarningBytes := 2000
warning := resp.GetWarning()[:min(maxWarningBytes, len(resp.GetWarning()))]
level.Error(s.logger).Log("msg", "Store failure with warning", "warning", warning)
// Don't have group/replica keys here, so we can't attribute the warning to a specific store.
s.metrics.storeFailureCount.WithLabelValues("", "").Inc()
if r.PartialResponseStrategy == storepb.PartialResponseStrategy_GROUP_REPLICA {
// TODO: attribute the warning to the store(group key and replica key) that produced it.
// Each client streams a sequence of time series, so it's not trivial to attribute the warning to a specific client.
if totalFailedStores > 1 {
level.Error(reqLogger).Log("msg", "more than one stores had warnings")
// If we don't know which store has failed, we can tolerate at most one failed store.
if firstWarning != nil {
warning += "; " + *firstWarning
if strings.Contains(resp.GetWarning(), "The specified key does not exist") {
level.Warn(s.logger).Log("msg", "Ignore 'the specified key does not exist' error from Store")
// Ignore this error for now because we know the missing block file is already deleted by compactor.
// There is no other reason for this error to occur.
s.metrics.missingBlockFileErrorCount.Inc()
} else {
totalFailedStores++
// TODO: attribute the warning to the store(group key and replica key) that produced it.
// Each client streams a sequence of time series, so it's not trivial to attribute the warning to a specific client.
if totalFailedStores > 1 {
level.Error(reqLogger).Log("msg", "more than one stores had warnings")
// If we don't know which store has failed, we can tolerate at most one failed store.
if firstWarning != nil {
warning += "; " + *firstWarning
}
return status.Error(codes.Aborted, warning)
}
return status.Error(codes.Aborted, warning)
firstWarning = &warning
}
firstWarning = &warning
} else if r.PartialResponseDisabled || r.PartialResponseStrategy == storepb.PartialResponseStrategy_ABORT {
return status.Error(codes.Aborted, resp.GetWarning())
}
Expand Down

0 comments on commit 0bf9700

Please sign in to comment.