Skip to content

Commit

Permalink
Add cli for checking CI test flakiness
Browse files Browse the repository at this point in the history
  • Loading branch information
mtodor committed Dec 3, 2024
1 parent fbc3ca5 commit 3fd2498
Show file tree
Hide file tree
Showing 6 changed files with 572 additions and 6 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
/flakechecker
/junit2jira
.idea
# Binaries for programs and plugins
Expand Down
75 changes: 75 additions & 0 deletions cmd/flakechecker/bq_client.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
package main

import (
"cloud.google.com/go/bigquery"
"context"
"github.com/pkg/errors"
log "github.com/sirupsen/logrus"
"google.golang.org/api/iterator"
"time"
)

// Settings for talking to BigQuery and the query used to fetch flakiness data.
const (
	// projectID is the GCP project handed to bigquery.NewClient.
	projectID = "acs-san-stackroxci"

	// queryTimeout is the deadline applied to the context of a single query read.
	queryTimeout = time.Minute

	// queryStrGetFailureRatio selects the run count and failure ratio for one
	// job / test name / classname combination. The three values are supplied
	// as named query parameters (@jobName, @filteredName, @classname).
	queryStrGetFailureRatio = `
SELECT
JobName,
FilteredName,
Classname,
TotalAll,
FailRatio
FROM
` + "`acs-san-stackroxci.ci_metrics.stackrox_tests_recent_flaky_tests`" + `
WHERE
JobName = @jobName
AND FilteredName = @filteredName
AND Classname = @classname
`
)

// biqQueryClient is the lookup of flakiness statistics used by the flake
// checker. checkFailedTests receives it as a parameter, so any implementation
// can be supplied in place of the BigQuery-backed one.
type biqQueryClient interface {
// GetRatioForTest returns, for the given policy and test name, the total
// number of recorded runs followed by the failure ratio. A non-nil error
// means the two numbers must not be used.
GetRatioForTest(flakeTestConfig *flakeDetectionPolicy, testName string) (int, int, error)
}

// biqQueryClientImpl is the biqQueryClient implementation that talks to
// BigQuery through the official Go client.
type biqQueryClientImpl struct {
// client is the BigQuery client created by getNewBigQueryClient.
client *bigquery.Client
}

// getNewBigQueryClient creates a BigQuery client for projectID and returns it
// wrapped in a biqQueryClient. A failure to create the underlying client is
// returned wrapped with additional context.
func getNewBigQueryClient() (biqQueryClient, error) {
	bqClient, err := bigquery.NewClient(context.Background(), projectID)
	if err != nil {
		return nil, errors.Wrap(err, "creating BigQuery client")
	}

	impl := &biqQueryClientImpl{client: bqClient}
	return impl, nil
}

// GetRatioForTest looks up the recent flakiness statistics of a single test.
//
// The query is parameterized with the policy's RatioJobName and Classname and
// with testName as the filtered test name. The read uses a context that
// expires after queryTimeout.
//
// It returns the TotalAll and FailRatio columns of the first result row. An
// error is returned when the query cannot be read or when the first row cannot
// be fetched (including the case where the query yields no row at all).
func (c *biqQueryClientImpl) GetRatioForTest(flakeTestRec *flakeDetectionPolicy, testName string) (int, int, error) {
	query := c.client.Query(queryStrGetFailureRatio)
	query.Parameters = []bigquery.QueryParameter{
		{Name: "jobName", Value: flakeTestRec.config.RatioJobName},
		{Name: "filteredName", Value: testName},
		{Name: "classname", Value: flakeTestRec.config.Classname},
	}

	ctx, cancelBigQueryRequest := context.WithTimeout(context.Background(), queryTimeout)
	defer cancelBigQueryRequest()

	resIter, err := query.Read(ctx)
	if err != nil {
		return 0, 0, errors.Wrap(err, "query data from BigQuery")
	}

	// Only the first row is used as the result.
	var flakyTestInfo recentFlakyTestInfo
	if errNext := resIter.Next(&flakyTestInfo); errNext != nil {
		return 0, 0, errors.Wrapf(errNext, "read BigQuery result for flaky test for query params: %v - query: %s", query.Parameters, queryStrGetFailureRatio)
	}

	// Probe for an unexpected second row. It is read into a scratch value so
	// that it cannot overwrite the first row that is returned below.
	var extraRow recentFlakyTestInfo
	switch errNext := resIter.Next(&extraRow); {
	case errNext == nil:
		log.Warnf("Expected to find one row in DB, but got more for query params: %v - query: %s", query.Parameters, queryStrGetFailureRatio)
	case !errors.Is(errNext, iterator.Done):
		// The first row was read successfully, so a failing probe is only
		// reported; it does not invalidate the result.
		log.Warnf("Could not verify that BigQuery returned a single row for query params: %v: %v", query.Parameters, errNext)
	}

	return flakyTestInfo.TotalAll, flakyTestInfo.FailRatio, nil
}
103 changes: 103 additions & 0 deletions cmd/flakechecker/flake_config.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
package main

import (
"fmt"
"github.com/pkg/errors"
"gopkg.in/yaml.v3"
"io"
"os"
"regexp"
)

// flakeDetectionPolicyConfig represents one entry of the configuration used by flakechecker to evaluate failed tests.
//
// It contains the following YAML keys:
// matchJobName - regular expression selecting the jobs that should be evaluated by flakechecker (e.g. a branch job should be evaluated, but main not). It is anchored with ^...$ by newFlakeDetectionPolicy.
// ratioJobName - job name that is used for the ratio lookup, e.g. main branch test runs are taken as the base for evaluating the flake ratio.
// testNameRegex - regular expression used to match test names, anchored the same way as matchJobName. Some test names contain detailed information (e.g. version 4.4.4), but the ratio should apply to all tests in that group (e.g. 4.4.z). Using a regex allows tests to be grouped differently.
// classname - class name of the test that should be isolated; it is compared for exact equality. With this option a single flaky test can be isolated from the rest of its suite.
// ratioThreshold - failure percentage that is allowed for this test. This information is usually fetched from historical executions and data collected in DB.
type flakeDetectionPolicyConfig struct {
MatchJobName string `yaml:"matchJobName"`
RatioJobName string `yaml:"ratioJobName"`
TestNameRegex string `yaml:"testNameRegex"`
Classname string `yaml:"classname"`
RatioThreshold int `yaml:"ratioThreshold"`
}

// flakeDetectionPolicy pairs a policy configuration with the regular
// expressions compiled from it by newFlakeDetectionPolicy.
type flakeDetectionPolicy struct {
// config is the configuration entry this policy was built from.
config *flakeDetectionPolicyConfig
// regexMatchJobName is compiled from config.MatchJobName.
regexMatchJobName *regexp.Regexp
// regexTestNameRegex is compiled from config.TestNameRegex.
regexTestNameRegex *regexp.Regexp
}

// newFlakeDetectionPolicy compiles the regular expressions of config and
// returns the resulting policy.
//
// Both MatchJobName and TestNameRegex must match the whole input. They are
// wrapped in a non-capturing group before being anchored, so a top-level
// alternation such as "a|b" is anchored as a whole instead of becoming
// "^a" or "b$".
//
// An error is returned when config is nil or when either pattern is not a
// valid regular expression.
func newFlakeDetectionPolicy(config *flakeDetectionPolicyConfig) (*flakeDetectionPolicy, error) {
	if config == nil {
		return nil, errors.New("flake detection policy config is nil")
	}

	// compileFullMatch builds a regexp that matches only when pattern matches
	// the entire input.
	compileFullMatch := func(pattern string) (*regexp.Regexp, error) {
		// Validate the bare pattern first so that an unbalanced parenthesis
		// cannot pair up with the wrapping group and slip through.
		if _, err := regexp.Compile(pattern); err != nil {
			return nil, err
		}
		return regexp.Compile(fmt.Sprintf("^(?:%s)$", pattern))
	}

	regexMatchJobName, err := compileFullMatch(config.MatchJobName)
	if err != nil {
		return nil, errors.Wrapf(err, "invalid flake config match job regex: %v", config.MatchJobName)
	}

	regexTestNameRegex, err := compileFullMatch(config.TestNameRegex)
	if err != nil {
		return nil, errors.Wrapf(err, "invalid flake config test name regex: %v", config.TestNameRegex)
	}

	return &flakeDetectionPolicy{
		config:             config,
		regexMatchJobName:  regexMatchJobName,
		regexTestNameRegex: regexTestNameRegex,
	}, nil
}

// newFlakeDetectionPolicyMust is like newFlakeDetectionPolicy but panics
// instead of returning an error. It is primarily used in tests.
func newFlakeDetectionPolicyMust(config *flakeDetectionPolicyConfig) *flakeDetectionPolicy {
	detectionPolicy, buildErr := newFlakeDetectionPolicy(config)
	if buildErr == nil {
		return detectionPolicy
	}

	panic(buildErr)
}

// matchJobName reports whether jobName is matched by the policy's compiled
// job-name expression. The returned error is always nil.
func (r *flakeDetectionPolicy) matchJobName(jobName string) (bool, error) {
	matched := r.regexMatchJobName.MatchString(jobName)
	return matched, nil
}

// matchTestName reports whether testName is matched by the policy's compiled
// test-name expression. The returned error is always nil.
func (r *flakeDetectionPolicy) matchTestName(testName string) (bool, error) {
	matched := r.regexTestNameRegex.MatchString(testName)
	return matched, nil
}

// matchClassname reports whether classname is exactly equal to the class name
// configured for the policy. The returned error is always nil.
func (r *flakeDetectionPolicy) matchClassname(classname string) (bool, error) {
	sameClass := r.config.Classname == classname
	return sameClass, nil
}

// loadFlakeConfigFile reads the YAML file fileName, which must contain a list
// of flakeDetectionPolicyConfig entries, and turns every entry into a compiled
// flakeDetectionPolicy.
//
// An error is returned when the file cannot be opened, read or parsed, when
// the list contains an empty (null) entry, or when a policy cannot be created
// from an entry. On error the returned slice is nil.
func loadFlakeConfigFile(fileName string) ([]*flakeDetectionPolicy, error) {
	ymlConfigFile, err := os.Open(fileName)
	if err != nil {
		return nil, errors.Wrapf(err, "open flake config file: %s", fileName)
	}
	// The file is only read, so a failure to close it carries no data-loss risk.
	defer ymlConfigFile.Close()

	ymlConfigFileData, err := io.ReadAll(ymlConfigFile)
	if err != nil {
		return nil, errors.Wrapf(err, "read flake config file: %s", fileName)
	}

	var flakeConfigs []*flakeDetectionPolicyConfig
	if err = yaml.Unmarshal(ymlConfigFileData, &flakeConfigs); err != nil {
		return nil, errors.Wrapf(err, "parse flake config file: %s", fileName)
	}

	detectionPolicies := make([]*flakeDetectionPolicy, 0, len(flakeConfigs))
	for i, flakeConfig := range flakeConfigs {
		// A null list item decodes to a nil pointer; reject it explicitly
		// instead of dereferencing it while building the policy.
		if flakeConfig == nil {
			return nil, errors.Errorf("flake config file %s: entry %d is empty", fileName, i)
		}

		detectionPolicy, errNewPolicy := newFlakeDetectionPolicy(flakeConfig)
		if errNewPolicy != nil {
			// Wrap the error of this iteration. Wrapping the outer err (nil
			// at this point) would yield a nil error and hide the failure.
			return nil, errors.Wrapf(errNewPolicy, "create flake detection policy from config: %+v", *flakeConfig)
		}

		detectionPolicies = append(detectionPolicies, detectionPolicy)
	}

	return detectionPolicies, nil
}
157 changes: 157 additions & 0 deletions cmd/flakechecker/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
package main

import (
_ "embed"
"flag"
"fmt"
"github.com/carlmjohnson/versioninfo"
junit "github.com/joshdk/go-junit"
"github.com/pkg/errors"
log "github.com/sirupsen/logrus"
"github.com/stackrox/junit2jira/pkg/testcase"
"os"
)

// totalRunsLimit is the minimum number of recorded runs a test must have
// before its failure ratio is evaluated.
const totalRunsLimit = 30

// Descriptions attached to the errors returned by checkFailedTests.
const (
	errDescAboveThreshold = "allowed flake ratio for test is above threshold"
	errDescGetRatio       = "get ratio for test failed"
	errDescNoFailedTests  = "no failed tests to process"
	errDescNoMatch        = "there is no match in allowed flake tests"
	errDescShortHistory   = "total runs for test is under history count threshold"
)

// flakeCheckerParams holds the command-line parameters of flakechecker. The
// fields are bound to flags in main.
type flakeCheckerParams struct {
// junitReportsDir is the directory with jUnit XML reports (-junit-reports-dir,
// defaulting to the ARTIFACT_DIR environment variable).
junitReportsDir string
// configFile is the path of the YAML policy file (-config-file).
configFile string

// jobName is the name of the CI job (-job-name); it is matched against the
// job-name expression of every policy.
jobName string
// orchestrator is set from -orchestrator.
// NOTE(review): not read by any code visible here - confirm whether it is still needed.
orchestrator string

// dryRun is set from -dry-run.
// NOTE(review): not read by any code visible here - confirm whether it is still needed.
dryRun bool
}

// main registers the command-line flags, parses them, optionally raises the
// log level to debug and runs the flake check. Any error returned by the
// check is reported through log.Fatal.
func main() {
	var (
		params    flakeCheckerParams
		debugLogs bool
	)

	// Input locations.
	flag.StringVar(&params.junitReportsDir, "junit-reports-dir", os.Getenv("ARTIFACT_DIR"), "Dir that contains jUnit reports XML files")
	flag.StringVar(&params.configFile, "config-file", "", "Config file with defined failure ratios")

	// Description of the CI run being checked.
	flag.StringVar(&params.jobName, "job-name", "", "Name of CI job.")
	flag.StringVar(&params.orchestrator, "orchestrator", "", "orchestrator name (such as GKE or OpenShift), if any.")

	// Behavior switches.
	flag.BoolVar(&params.dryRun, "dry-run", false, "When set to true issues will NOT be created.")
	flag.BoolVar(&debugLogs, "debug", false, "Enable debug log level")
	versioninfo.AddFlag(flag.CommandLine)
	flag.Parse()

	if debugLogs {
		log.SetLevel(log.DebugLevel)
	}

	if err := params.run(); err != nil {
		log.Fatal(err)
	}
}

// recentFlakyTestInfo is the row type that GetRatioForTest reads BigQuery
// results into. Its field names mirror the columns selected by
// queryStrGetFailureRatio.
type recentFlakyTestInfo struct {
JobName string
FilteredName string
Classname string
// TotalAll is returned by GetRatioForTest as the total number of runs.
TotalAll int
// FailRatio is returned by GetRatioForTest as the failure ratio and is
// compared against the policy's RatioThreshold.
FailRatio int
}

// checkFailedTests verifies that every failed test is covered by a flake
// detection policy and stays within that policy's limits.
//
// For each failed test, every policy whose job name, test name and classname
// matchers all accept the test is evaluated: the run statistics are fetched
// through bqClient, the number of runs must be at least totalRunsLimit and the
// failure ratio must not exceed the policy's RatioThreshold.
//
// It returns an error when failedTests is empty, when a matcher or the
// statistics lookup fails, when a limit is violated, or when a failed test is
// matched by no policy. It returns nil when all failed tests pass the checks.
func (p *flakeCheckerParams) checkFailedTests(bqClient biqQueryClient, failedTests []testcase.TestCase, flakeCheckerRecs []*flakeDetectionPolicy) error {
	if len(failedTests) == 0 {
		return errors.New(errDescNoFailedTests)
	}

	// policyApplies runs the three matchers in their fixed order (job name,
	// test name, classname) and stops at the first error or mismatch.
	policyApplies := func(policy *flakeDetectionPolicy, test testcase.TestCase) (bool, error) {
		matchers := [...]func() (bool, error){
			func() (bool, error) { return policy.matchJobName(p.jobName) },
			func() (bool, error) { return policy.matchTestName(test.Name) },
			func() (bool, error) { return policy.matchClassname(test.Classname) },
		}
		for _, matcher := range matchers {
			if ok, err := matcher(); err != nil || !ok {
				return false, err
			}
		}
		return true, nil
	}

	for _, test := range failedTests {
		log.Infof("Checking failed test: %q / %q / %q", p.jobName, test.Name, test.Classname)

		matchedPolicies := 0
		for _, policy := range flakeCheckerRecs {
			applies, err := policyApplies(policy, test)
			if err != nil {
				return err
			}
			if !applies {
				continue
			}

			matchedPolicies++
			cfg := policy.config
			log.Infof("Match found: %q / %q / %q", cfg.MatchJobName, cfg.TestNameRegex, cfg.Classname)

			totalRuns, failRatio, err := bqClient.GetRatioForTest(policy, test.Name)
			switch {
			case err != nil:
				return errors.Wrap(err, errDescGetRatio)
			case totalRuns < totalRunsLimit:
				return errors.Wrap(fmt.Errorf("%d", totalRuns), errDescShortHistory)
			case failRatio > cfg.RatioThreshold:
				return errors.Wrap(fmt.Errorf("(%d > %d)", failRatio, cfg.RatioThreshold), errDescAboveThreshold)
			}

			log.Infof("Ratio is below threshold: (%d <= %d)", failRatio, cfg.RatioThreshold)
		}

		// A failed test that no policy covers is not an allowed flake.
		if matchedPolicies == 0 {
			return errors.Wrap(errors.New(test.Name), errDescNoMatch)
		}
	}

	return nil
}

// run executes the flake check end to end: it ingests the jUnit reports from
// junitReportsDir, extracts the failed tests, loads the policies from
// configFile, creates the BigQuery client and checks every failed test against
// the policies. The first failing step is returned as a wrapped error.
func (p *flakeCheckerParams) run() error {
	suites, err := junit.IngestDir(p.junitReportsDir)
	if err != nil {
		return errors.Wrap(err, "could not read files")
	}

	failures, err := testcase.GetFailedTests(suites)
	if err != nil {
		return errors.Wrap(err, "could not find failed tests")
	}
	log.Infof("Found %d failed tests", len(failures))

	policies, err := loadFlakeConfigFile(p.configFile)
	if err != nil {
		return errors.Wrapf(err, "unable to load config file (%s)", p.configFile)
	}

	client, err := getNewBigQueryClient()
	if err != nil {
		return errors.Wrap(err, "unable to create BigQuery client")
	}

	checkErr := p.checkFailedTests(client, failures, policies)
	if checkErr != nil {
		return errors.Wrap(checkErr, "check failed tests")
	}

	log.Info("All failed tests are within allowed flake thresholds")
	return nil
}
Loading

0 comments on commit 3fd2498

Please sign in to comment.