forked from thanos-io/thanos
-
Notifications
You must be signed in to change notification settings - Fork 7
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[PLAT-104290] auto detect/remove overlapping blocks to be compacted to avoid compactor panics #23
Merged
Merged
Changes from all commits
Commits
Show all changes
4 commits
Select commit
Hold shift + click to select a range
5fc3435
[PLAT-104290] remove overlapping blocks into its own file
jnyi 843f712
[PLAT-104290] fixed some bugs and added unit tests
jnyi f86995a
test rollback to previous callbacks
jnyi cfc344f
[PLAT-102919] add a flag to control overlapping block removal behavior
jnyi File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -236,7 +236,6 @@ type DefaultGrouper struct { | |
compactionRunsCompleted *prometheus.CounterVec | ||
compactionFailures *prometheus.CounterVec | ||
verticalCompactions *prometheus.CounterVec | ||
overlappingBlocks *prometheus.CounterVec | ||
garbageCollectedBlocks prometheus.Counter | ||
blocksMarkedForDeletion prometheus.Counter | ||
blocksMarkedForNoCompact prometheus.Counter | ||
|
@@ -284,10 +283,6 @@ func NewDefaultGrouper( | |
Name: "thanos_compact_group_vertical_compactions_total", | ||
Help: "Total number of group compaction attempts that resulted in a new block based on overlapping blocks.", | ||
}, []string{"resolution"}), | ||
overlappingBlocks: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. move this metric to its own source file, this will avoid conflicts when merge from upstream. |
||
Name: "thanos_compact_group_overlapping_blocks_total", | ||
Help: "Total number of blocks that are overlapping and to be deleted.", | ||
}, []string{"resolution", "level"}), | ||
blocksMarkedForNoCompact: blocksMarkedForNoCompact, | ||
garbageCollectedBlocks: garbageCollectedBlocks, | ||
blocksMarkedForDeletion: blocksMarkedForDeletion, | ||
|
@@ -308,7 +303,6 @@ func NewDefaultGrouperWithMetrics( | |
compactionRunsCompleted *prometheus.CounterVec, | ||
compactionFailures *prometheus.CounterVec, | ||
verticalCompactions *prometheus.CounterVec, | ||
overrlappingBlocks *prometheus.CounterVec, | ||
blocksMarkedForDeletion prometheus.Counter, | ||
garbageCollectedBlocks prometheus.Counter, | ||
blocksMarkedForNoCompact prometheus.Counter, | ||
|
@@ -326,7 +320,6 @@ func NewDefaultGrouperWithMetrics( | |
compactionRunsCompleted: compactionRunsCompleted, | ||
compactionFailures: compactionFailures, | ||
verticalCompactions: verticalCompactions, | ||
overlappingBlocks: overrlappingBlocks, | ||
blocksMarkedForNoCompact: blocksMarkedForNoCompact, | ||
garbageCollectedBlocks: garbageCollectedBlocks, | ||
blocksMarkedForDeletion: blocksMarkedForDeletion, | ||
|
@@ -359,7 +352,6 @@ func (g *DefaultGrouper) Groups(blocks map[ulid.ULID]*metadata.Meta) (res []*Gro | |
g.compactionRunsCompleted.WithLabelValues(resolutionLabel), | ||
g.compactionFailures.WithLabelValues(resolutionLabel), | ||
g.verticalCompactions.WithLabelValues(resolutionLabel), | ||
g.overlappingBlocks.WithLabelValues(resolutionLabel, fmt.Sprintf("%d", m.Compaction.Level)), | ||
g.garbageCollectedBlocks, | ||
g.blocksMarkedForDeletion, | ||
g.blocksMarkedForNoCompact, | ||
|
@@ -400,7 +392,6 @@ type Group struct { | |
compactionRunsCompleted prometheus.Counter | ||
compactionFailures prometheus.Counter | ||
verticalCompactions prometheus.Counter | ||
overlappingBlocks prometheus.Counter | ||
groupGarbageCollectedBlocks prometheus.Counter | ||
blocksMarkedForDeletion prometheus.Counter | ||
blocksMarkedForNoCompact prometheus.Counter | ||
|
@@ -424,7 +415,6 @@ func NewGroup( | |
compactionRunsCompleted prometheus.Counter, | ||
compactionFailures prometheus.Counter, | ||
verticalCompactions prometheus.Counter, | ||
overlappingBlocks prometheus.Counter, | ||
groupGarbageCollectedBlocks prometheus.Counter, | ||
blocksMarkedForDeletion prometheus.Counter, | ||
blocksMarkedForNoCompact prometheus.Counter, | ||
|
@@ -453,7 +443,6 @@ func NewGroup( | |
compactionRunsCompleted: compactionRunsCompleted, | ||
compactionFailures: compactionFailures, | ||
verticalCompactions: verticalCompactions, | ||
overlappingBlocks: overlappingBlocks, | ||
groupGarbageCollectedBlocks: groupGarbageCollectedBlocks, | ||
blocksMarkedForDeletion: blocksMarkedForDeletion, | ||
blocksMarkedForNoCompact: blocksMarkedForNoCompact, | ||
|
@@ -1031,53 +1020,6 @@ func (cg *Group) areBlocksOverlapping(include *metadata.Meta, exclude ...*metada | |
return nil | ||
} | ||
|
||
func (cg *Group) removeOverlappingBlocks(ctx context.Context, toCompact []*metadata.Meta, dir string) error { | ||
if len(toCompact) == 0 { | ||
return nil | ||
} | ||
kept := toCompact[0] | ||
for _, m := range toCompact { | ||
if m.MinTime < kept.MinTime && m.MaxTime > kept.MaxTime { | ||
level.Warn(cg.logger).Log("msg", "found overlapping block in plan that are not the first", | ||
"first", kept.String(), "block", m.String()) | ||
kept = m | ||
} else if (m.MinTime < kept.MinTime && kept.MinTime < m.MaxTime) || | ||
(m.MinTime < kept.MaxTime && kept.MaxTime < m.MaxTime) { | ||
err := errors.Errorf("found partially overlapping block: %s vs %s", m.String(), kept.String()) | ||
if cg.enableVerticalCompaction { | ||
level.Error(cg.logger).Log("msg", "best effort to vertical compact", "err", err) | ||
return nil | ||
} else { | ||
return halt(err) | ||
} | ||
} | ||
} | ||
for _, m := range toCompact { | ||
if m.ULID.Compare(kept.ULID) == 0 || m.Thanos.Source == metadata.ReceiveSource { | ||
level.Info(cg.logger).Log("msg", "keep this overlapping block", "block", m.String(), | ||
"level", m.Compaction.Level, "source", m.Thanos.Source, "labels", m.Thanos.GetLabels()) | ||
continue | ||
} | ||
cg.overlappingBlocks.Inc() | ||
if err := DeleteBlockNow(ctx, cg.logger, cg.bkt, m, dir); err != nil { | ||
return retry(err) | ||
} | ||
} | ||
return retry(errors.Errorf("found overlapping blocks in plan. Only kept %s", kept.String())) | ||
} | ||
|
||
func DeleteBlockNow(ctx context.Context, logger log.Logger, bkt objstore.Bucket, m *metadata.Meta, dir string) error { | ||
level.Warn(logger).Log("msg", "delete polluted block immediately", "block", m.String(), | ||
"level", m.Compaction.Level, "source", m.Thanos.Source, "labels", m.Thanos.GetLabels()) | ||
if err := block.Delete(ctx, logger, bkt, m.ULID); err != nil { | ||
return errors.Wrapf(err, "delete overlapping block %s", m.String()) | ||
} | ||
if err := os.RemoveAll(filepath.Join(dir, m.ULID.String())); err != nil { | ||
return errors.Wrapf(err, "remove old block dir %s", m.String()) | ||
} | ||
return nil | ||
} | ||
|
||
// RepairIssue347 repairs the https://github.com/prometheus/tsdb/issues/347 issue when having issue347Error. | ||
func RepairIssue347(ctx context.Context, logger log.Logger, bkt objstore.Bucket, blocksMarkedForDeletion prometheus.Counter, issue347Err error) error { | ||
ie, ok := errors.Cause(issue347Err).(Issue347Error) | ||
|
@@ -1171,12 +1113,9 @@ func (cg *Group) compact(ctx context.Context, dir string, planner Planner, comp | |
begin := groupCompactionBegin | ||
|
||
if err := compactionLifecycleCallback.PreCompactionCallback(ctx, cg.logger, cg, toCompact); err != nil { | ||
level.Error(cg.logger).Log("msg", fmt.Sprintf("failed to run pre compaction callback for plan: %v", toCompact), "err", err) | ||
// instead of halting, we attempt to remove overlapped blocks and only keep the longest one. | ||
if newErr := cg.removeOverlappingBlocks(ctx, toCompact, dir); newErr != nil { | ||
return false, ulid.ULID{}, newErr | ||
} | ||
return false, ulid.ULID{}, errors.Wrapf(err, "failed to run pre compaction callback for plan: %s", fmt.Sprintf("%v", toCompact)) | ||
} | ||
toCompact = FilterRemovedBlocks(toCompact) | ||
level.Info(cg.logger).Log("msg", "finished running pre compaction callback; downloading blocks", "duration", time.Since(begin), "duration_ms", time.Since(begin).Milliseconds(), "plan", fmt.Sprintf("%v", toCompact)) | ||
|
||
begin = time.Now() | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,120 @@ | ||
// Copyright (c) The Thanos Authors. | ||
// Licensed under the Apache License 2.0. | ||
|
||
package compact | ||
|
||
import ( | ||
"context" | ||
"fmt" | ||
|
||
"github.com/go-kit/log" | ||
"github.com/go-kit/log/level" | ||
"github.com/oklog/ulid" | ||
"github.com/pkg/errors" | ||
"github.com/prometheus/client_golang/prometheus" | ||
"github.com/prometheus/client_golang/prometheus/promauto" | ||
"github.com/prometheus/prometheus/tsdb" | ||
"github.com/thanos-io/objstore" | ||
"github.com/thanos-io/thanos/pkg/block" | ||
"github.com/thanos-io/thanos/pkg/block/metadata" | ||
) | ||
|
||
type OverlappingCompactionLifecycleCallback struct { | ||
overlappingBlocks prometheus.Counter | ||
} | ||
|
||
func NewOverlappingCompactionLifecycleCallback(reg *prometheus.Registry, enabled bool) CompactionLifecycleCallback { | ||
if enabled { | ||
return OverlappingCompactionLifecycleCallback{ | ||
overlappingBlocks: promauto.With(reg).NewCounter(prometheus.CounterOpts{ | ||
Name: "thanos_compact_group_overlapping_blocks_total", | ||
Help: "Total number of blocks that are overlapping and to be deleted.", | ||
}), | ||
} | ||
} | ||
return DefaultCompactionLifecycleCallback{} | ||
} | ||
|
||
// PreCompactionCallback given the assumption that toCompact is sorted by MinTime in ascending order from Planner | ||
// (not guaranteed on MaxTime order), we will detect overlapping blocks and delete them while retaining all others. | ||
func (o OverlappingCompactionLifecycleCallback) PreCompactionCallback(ctx context.Context, logger log.Logger, cg *Group, toCompact []*metadata.Meta) error { | ||
if len(toCompact) == 0 { | ||
return nil | ||
} | ||
prev := 0 | ||
for curr, currB := range toCompact { | ||
prevB := toCompact[prev] | ||
if curr == 0 || currB.Thanos.Source == metadata.ReceiveSource || prevB.MaxTime <= currB.MinTime { | ||
// no overlapping with previous blocks, skip it | ||
prev = curr | ||
continue | ||
} else if currB.MinTime < prevB.MinTime { | ||
// halt when the assumption is broken, the input toCompact isn't sorted by minTime, need manual investigation | ||
return halt(errors.Errorf("later blocks has smaller minTime than previous block: %s -- %s", prevB.String(), currB.String())) | ||
} else if prevB.MaxTime < currB.MaxTime && prevB.MinTime != currB.MinTime { | ||
err := errors.Errorf("found partially overlapping block: %s -- %s", prevB.String(), currB.String()) | ||
if cg.enableVerticalCompaction { | ||
level.Error(logger).Log("msg", "best effort to vertical compact", "err", err) | ||
prev = curr | ||
continue | ||
} else { | ||
return halt(err) | ||
} | ||
} else if prevB.MinTime == currB.MinTime && prevB.MaxTime == currB.MaxTime { | ||
if prevB.Stats.NumSeries != currB.Stats.NumSeries || prevB.Stats.NumSamples != currB.Stats.NumSamples { | ||
level.Warn(logger).Log("msg", "found same time range but different stats, keep both blocks", | ||
"prev", prevB.String(), "prevSeries", prevB.Stats.NumSeries, "prevSamples", prevB.Stats.NumSamples, | ||
"curr", currB.String(), "currSeries", currB.Stats.NumSeries, "currSamples", currB.Stats.NumSamples, | ||
) | ||
prev = curr | ||
continue | ||
} | ||
} | ||
// prev min <= curr min < prev max | ||
toDelete := -1 | ||
if prevB.MaxTime >= currB.MaxTime { | ||
toDelete = curr | ||
level.Warn(logger).Log("msg", "found overlapping block in plan, keep previous block", | ||
"toKeep", prevB.String(), "toDelete", currB.String()) | ||
} else if prevB.MaxTime < currB.MaxTime { | ||
toDelete = prev | ||
prev = curr | ||
level.Warn(logger).Log("msg", "found overlapping block in plan, keep current block", | ||
"toKeep", currB.String(), "toDelete", prevB.String()) | ||
} | ||
o.overlappingBlocks.Inc() | ||
if err := DeleteBlockNow(ctx, logger, cg.bkt, toCompact[toDelete]); err != nil { | ||
return retry(err) | ||
} | ||
toCompact[toDelete] = nil | ||
} | ||
return nil | ||
} | ||
|
||
func (o OverlappingCompactionLifecycleCallback) PostCompactionCallback(_ context.Context, _ log.Logger, _ *Group, _ ulid.ULID) error { | ||
return nil | ||
} | ||
|
||
func (o OverlappingCompactionLifecycleCallback) GetBlockPopulator(_ context.Context, _ log.Logger, _ *Group) (tsdb.BlockPopulator, error) { | ||
return tsdb.DefaultBlockPopulator{}, nil | ||
} | ||
|
||
func FilterRemovedBlocks(blocks []*metadata.Meta) (res []*metadata.Meta) { | ||
for _, b := range blocks { | ||
if b != nil { | ||
res = append(res, b) | ||
} | ||
} | ||
return res | ||
} | ||
|
||
func DeleteBlockNow(ctx context.Context, logger log.Logger, bkt objstore.Bucket, m *metadata.Meta) error { | ||
level.Warn(logger).Log("msg", "delete polluted block immediately", "block", m.String(), | ||
"level", m.Compaction.Level, "parents", fmt.Sprintf("%v", m.Compaction.Parents), | ||
"resolution", m.Thanos.Downsample.Resolution, "source", m.Thanos.Source, "labels", m.Thanos.GetLabels(), | ||
"series", m.Stats.NumSeries, "samples", m.Stats.NumSamples, "chunks", m.Stats.NumChunks) | ||
if err := block.Delete(ctx, logger, bkt, m.ULID); err != nil { | ||
return errors.Wrapf(err, "delete overlapping block %s", m.String()) | ||
} | ||
return nil | ||
} |
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this is verbose and not useful in testing