openshift · openshift-cherrypick-robot · Oct 21, 2022 · Oct 25, 2022 · Oct 26, 2022 · Oct 31, 2022
diff --git a/cmd/openshift-tests/openshift-tests.go b/cmd/openshift-tests/openshift-tests.go
@@ -13,29 +13,30 @@ import (
 	"syscall"
 	"time"
 
+	"github.com/openshift/origin/pkg/monitor"
 	"github.com/openshift/origin/pkg/monitor/monitor_cmd"
+	"github.com/openshift/origin/pkg/monitor/resourcewatch/cmd"
+	"github.com/openshift/origin/pkg/riskanalysis"
+	testginkgo "github.com/openshift/origin/pkg/test/ginkgo"
+	"github.com/openshift/origin/pkg/version"
+	exutil "github.com/openshift/origin/test/extended/util"
+	"github.com/openshift/origin/test/extended/util/cluster"
+	"github.com/openshift/origin/test/extended/util/disruption/controlplane"
 	"github.com/openshift/origin/test/extended/util/disruption/externalservice"
+	"github.com/openshift/origin/test/extended/util/disruption/frontends"
 
-	"k8s.io/cli-runtime/pkg/genericclioptions"
-
-	"github.com/onsi/ginkgo"
 	"github.com/openshift/library-go/pkg/image/reference"
 	"github.com/openshift/library-go/pkg/serviceability"
-	"github.com/spf13/cobra"
-	"github.com/spf13/pflag"
+
+	"k8s.io/cli-runtime/pkg/genericclioptions"
 	utilflag "k8s.io/component-base/cli/flag"
 	"k8s.io/component-base/logs"
 	"k8s.io/klog/v2"
 	"k8s.io/kubectl/pkg/util/templates"
 
-	"github.com/openshift/origin/pkg/monitor"
-	"github.com/openshift/origin/pkg/monitor/resourcewatch/cmd"
-	testginkgo "github.com/openshift/origin/pkg/test/ginkgo"
-	"github.com/openshift/origin/pkg/version"
-	exutil "github.com/openshift/origin/test/extended/util"
-	"github.com/openshift/origin/test/extended/util/cluster"
-	"github.com/openshift/origin/test/extended/util/disruption/controlplane"
-	"github.com/openshift/origin/test/extended/util/disruption/frontends"
+	"github.com/onsi/ginkgo"
+	"github.com/spf13/cobra"
+	"github.com/spf13/pflag"
 )
 
 func main() {
@@ -76,6 +77,7 @@ func main() {
 		newImagesCommand(),
 		newRunTestCommand(),
 		newRunMonitorCommand(),
+		newTestFailureRiskAnalysisCommand(),
 		cmd.NewRunResourceWatchCommand(),
 		monitor_cmd.NewTimelineCommand(genericclioptions.IOStreams{
 			In:     os.Stdin,
@@ -130,6 +132,40 @@ func newRunMonitorCommand() *cobra.Command {
 	return cmd
 }
 
+// sippyDefaultURL is the risk analysis endpoint used when --sippy-url is not
+// given on the command line.
+const sippyDefaultURL = "https://sippy.dptools.openshift.org/api/jobs/runs/risk_analysis"
+
+// newTestFailureRiskAnalysisCommand builds the "risk-analysis" subcommand, which
+// hands the parsed flags to riskanalysis.Options.Run.
+func newTestFailureRiskAnalysisCommand() *cobra.Command {
+	opts := &riskanalysis.Options{Out: os.Stdout, ErrOut: os.Stderr}
+
+	riskCmd := &cobra.Command{
+		Use:   "risk-analysis",
+		Short: "Performs risk analysis on test failures",
+		Long: templates.LongDesc(`
+Uses the test failure summary json files written along-side our junit xml
+files after an invocation of openshift-tests. If multiple files are present
+(multiple invocations of openshift-tests) we will merge them into one.
+Results are then submitted to sippy which will return an analysis of per-test
+and overall risk level given historical pass rates on the failed tests.
+The resulting analysis is then also written to the junit artifacts directory.
+`),
+
+		RunE: func(_ *cobra.Command, _ []string) error {
+			return opts.Run()
+		},
+	}
+
+	flags := riskCmd.Flags()
+	flags.StringVar(&opts.JUnitDir, "junit-dir", opts.JUnitDir,
+		"The directory where test reports were written, and analysis file will be stored.")
+	riskCmd.MarkFlagRequired("junit-dir")
+	flags.StringVar(&opts.SippyURL, "sippy-url", sippyDefaultURL, "Sippy URL API endpoint")
+	return riskCmd
+}
+
 type imagesOptions struct {
 	Repository string
 	Upstream   bool

diff --git a/pkg/riskanalysis/cmd.go b/pkg/riskanalysis/cmd.go
@@ -0,0 +1,118 @@
+package riskanalysis
+
+import (
+	"bytes"
+	"encoding/json"
+	"fmt"
+	"io"
+	"io/ioutil"
+	"net/http"
+	"os"
+	"path/filepath"
+	"time"
+
+	"github.com/pkg/errors"
+)
+
+// Options is used to run a risk analysis to determine how severe or unusual
+// the test failures in an openshift-tests run were.
+type Options struct {
+	Out, ErrOut io.Writer
+	JUnitDir    string
+	SippyURL    string
+}
+
+// testFailureSummaryFilePrefix is the file name prefix shared by the summary
+// files written after a test run and the scan performed by Run.
+const testFailureSummaryFilePrefix = "test-failures-summary"
+
+// sippyRequestTimeout bounds the whole sippy round trip so an unresponsive
+// endpoint cannot hang the command forever.
+const sippyRequestTimeout = 2 * time.Minute
+
+// Run performs the test risk analysis by reading the output files from the test run, submitting them to sippy,
+// and writing out the analysis result as a new artifact.
+//
+// It returns nil without contacting sippy when no summary files are found. It
+// returns an error when a summary cannot be read or parsed, when summaries name
+// different jobs, when the request fails or gets a non-2xx answer, or when the
+// artifact cannot be written.
+func (opt *Options) Run() error {
+	fmt.Fprintf(opt.Out, "Scanning for %s files in: %s\n", testFailureSummaryFilePrefix, opt.JUnitDir)
+
+	resultFiles, err := filepath.Glob(filepath.Join(opt.JUnitDir, testFailureSummaryFilePrefix+"*.json"))
+	if err != nil {
+		return err
+	}
+	fmt.Fprintf(opt.Out, "Found files: %v\n", resultFiles)
+	if len(resultFiles) == 0 {
+		// Without any summary there is nothing to merge; submitting anyway would
+		// send a JSON "null" body to sippy.
+		fmt.Fprintf(opt.Out, "No %s files found, skipping risk analysis\n", testFailureSummaryFilePrefix)
+		return nil
+	}
+
+	prowJobRuns := make([]*ProwJobRun, 0, len(resultFiles))
+	// Read each result file into a ProwJobRun struct:
+	for _, rf := range resultFiles {
+		data, err := os.ReadFile(rf)
+		if err != nil {
+			return err
+		}
+		jobRun := &ProwJobRun{}
+		if err := json.Unmarshal(data, jobRun); err != nil {
+			return errors.Wrapf(err, "error unmarshalling ProwJob json from %s", rf)
+		}
+		prowJobRuns = append(prowJobRuns, jobRun)
+	}
+
+	// We will often have more than one output file for this job run because openshift-tests is often
+	// invoked multiple times (pre/post upgrade). We need to merge the data together in this case.
+	finalProwJobRun := prowJobRuns[0]
+	for _, pjr := range prowJobRuns[1:] {
+		if pjr.ProwJob.Name != finalProwJobRun.ProwJob.Name {
+			return fmt.Errorf("mismatched job names found in %s files, %s != %s",
+				testFailureSummaryFilePrefix, finalProwJobRun.ProwJob.Name, pjr.ProwJob.Name)
+		}
+		finalProwJobRun.Tests = append(finalProwJobRun.Tests, pjr.Tests...)
+	}
+
+	inputBytes, err := json.Marshal(finalProwJobRun)
+	if err != nil {
+		return errors.Wrap(err, "error marshalling results")
+	}
+
+	// NOTE(review): the summary travels as the body of a GET; kept as-is on the
+	// assumption that the sippy endpoint expects it — confirm.
+	req, err := http.NewRequest("GET", opt.SippyURL, bytes.NewBuffer(inputBytes))
+	if err != nil {
+		// A malformed URL yields a nil request, so bail out before touching it.
+		return errors.Wrap(err, "error creating risk analysis request")
+	}
+	req.Header.Set("Content-Type", "application/json")
+	client := &http.Client{Timeout: sippyRequestTimeout}
+	resp, err := client.Do(req)
+	if err != nil {
+		return errors.Wrap(err, "error requesting risk analysis from sippy")
+	}
+	defer resp.Body.Close()
+
+	riskAnalysisBytes, err := ioutil.ReadAll(resp.Body)
+	if err != nil {
+		return errors.Wrap(err, "error reading risk analysis request body from sippy")
+	}
+	fmt.Fprintln(opt.Out, "response Body:", string(riskAnalysisBytes))
+	if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
+		// Do not persist an error page as if it were an analysis.
+		return fmt.Errorf("unexpected response status from sippy: %s", resp.Status)
+	}
+
+	outputFile := filepath.Join(opt.JUnitDir, "risk-analysis.json")
+	err = ioutil.WriteFile(outputFile, riskAnalysisBytes, 0644)
+	if err != nil {
+		return errors.Wrap(err, "error writing risk analysis json artifact")
+	}
+	fmt.Fprintf(opt.Out, "Successfully wrote: %s\n", outputFile)
+
+	return nil
+}
diff --git a/pkg/riskanalysis/types.go b/pkg/riskanalysis/types.go
@@ -0,0 +1,30 @@
+package riskanalysis
+
+// The types below are subsets of the sippy API types of the same name, copied here so that only the fields
+// needed for a risk analysis request are carried along, without the rest of the cruft. ProwJobRunTest defines
+// a join table linking tests to the job runs they execute in, along with the status for that execution.
+// NOTE: we're getting dangerously close to being able to live push results to sippy after a job run.
+
+type ProwJobRun struct {
+	ID      int              // job run ID; the summary writer fills it from the BUILD_ID environment variable (0 when unparsable)
+	ProwJob ProwJob          // job this run belongs to
+	Tests   []ProwJobRunTest // per-test results carried by this run
+}
+
+type ProwJob struct {
+	Name string // job name; the summary writer fills it from the JOB_NAME environment variable
+}
+
+type Test struct {
+	Name string // test case name as it appears in the junit results
+}
+
+type Suite struct {
+	Name string // name of the junit suite the test was reported under
+}
+
+type ProwJobRunTest struct {
+	Test   Test  // test that was executed
+	Suite  Suite // suite the test was reported under
+	Status int   // sippy status code (see getSippyStatusCode: 12 fail, 13 flake); int mirrors the sippy type, which avoids smallint because gorm auto-migrate breaks on it
+}
diff --git a/pkg/riskanalysis/write_test_failure_summary.go b/pkg/riskanalysis/write_test_failure_summary.go
@@ -0,0 +1,98 @@
+package riskanalysis
+
+import (
+	"encoding/json"
+	"fmt"
+	"io/ioutil"
+	"os"
+	"path/filepath"
+	"sort"
+	"strconv"
+
+	"github.com/openshift/origin/pkg/test/ginkgo/junitapi"
+)
+
+// WriteJobRunTestFailureSummary writes a more minimal json file summarizing a little info about the
+// job run, and what tests failed. (successful tests and, for now, flakes are omitted)
+// This is intended to be later submitted to sippy for a risk analysis of how unusual the
+// test failures were, but that final step is handled elsewhere.
+//
+// The file is written to artifactDir as test-failures-summary<timeSuffix>.json. The job run ID
+// and job name come from the BUILD_ID and JOB_NAME environment variables. Tests are listed in
+// name order so that the same results always produce the same file.
+func WriteJobRunTestFailureSummary(artifactDir, timeSuffix string, finalSuiteResults *junitapi.JUnitTestSuite) error {
+	tests := map[string]*passFail{}
+	for _, testCase := range finalSuiteResults.TestCases {
+		result, ok := tests[testCase.Name]
+		if !ok {
+			result = &passFail{}
+			tests[testCase.Name] = result
+		}
+		if testCase.SkipMessage != nil {
+			continue
+		}
+		if testCase.FailureOutput != nil {
+			result.Failed = true
+		} else {
+			result.Passed = true
+		}
+	}
+
+	// Map iteration order is random; sort the names so repeated runs over the
+	// same results produce byte-identical files.
+	names := make([]string, 0, len(tests))
+	for name := range tests {
+		names = append(names, name)
+	}
+	sort.Strings(names)
+
+	// If we can't parse this, we submit without it, it is not required.
+	jobRunID, _ := strconv.Atoi(os.Getenv("BUILD_ID"))
+
+	jr := ProwJobRun{
+		ID:      jobRunID,
+		ProwJob: ProwJob{Name: os.Getenv("JOB_NAME")},
+		Tests:   []ProwJobRunTest{},
+	}
+
+	for _, name := range names {
+		result := tests[name]
+		if !result.Failed || result.Passed {
+			// Without a failure it is neither a fail nor a flake, and flakes
+			// (failed and passed) are skipped for now: we're not ready to process them yet.
+			continue
+		}
+		jr.Tests = append(jr.Tests, ProwJobRunTest{
+			Test:   Test{Name: name},
+			Suite:  Suite{Name: finalSuiteResults.Name},
+			Status: getSippyStatusCode(result),
+		})
+	}
+
+	jsonContent, err := json.MarshalIndent(jr, "", "    ")
+	if err != nil {
+		return err
+	}
+	outputFile := filepath.Join(artifactDir, fmt.Sprintf("%s%s.json",
+		testFailureSummaryFilePrefix, timeSuffix))
+	return ioutil.WriteFile(outputFile, jsonContent, 0644)
+}
+
+// passFail is a simple struct to track test names which can appear more than once.
+// If both passed and failed are true, it was a flake.
+type passFail struct {
+	Passed bool
+	Failed bool
+}
+
+// getSippyStatusCode returns the code sippy uses internally for each type of failure.
+func getSippyStatusCode(pf *passFail) int {
+	switch {
+	case pf.Failed && pf.Passed:
+		return 13 // flake
+	case pf.Failed && !pf.Passed:
+		return 12 // fail
+	}
+	// we should not hit this given the above filtering
+	return 0
+}
diff --git a/pkg/test/ginkgo/cmd_runsuite.go b/pkg/test/ginkgo/cmd_runsuite.go
@@ -16,6 +16,7 @@ import (
 
 	"github.com/onsi/ginkgo/config"
 	"github.com/openshift/origin/pkg/monitor"
+	"github.com/openshift/origin/pkg/riskanalysis"
 	"github.com/openshift/origin/pkg/test/ginkgo/junitapi"
 	"k8s.io/apimachinery/pkg/util/sets"
 )
@@ -444,11 +445,14 @@ func (opt *Options) Run(suite *TestSuite, junitSuiteName string) error {
 	var syntheticTestResults []*junitapi.JUnitTestCase
 	var syntheticFailure bool
 
+	timeSuffix := fmt.Sprintf("_%s", opt.MonitorEventsOptions.GetStartTime().
+		UTC().Format("20060102-150405"))
+
 	if err := opt.MonitorEventsOptions.End(ctx, restConfig, opt.JUnitDir); err != nil {
 		return err
 	}
 	if len(opt.JUnitDir) > 0 {
-		if err := opt.MonitorEventsOptions.WriteRunDataToArtifactsDir(opt.JUnitDir); err != nil {
+		if err := opt.MonitorEventsOptions.WriteRunDataToArtifactsDir(opt.JUnitDir, timeSuffix); err != nil {
 			fmt.Fprintf(opt.ErrOut, "error: Failed to write run-data: %v\n", err)
 		}
 	}
@@ -497,8 +501,13 @@ func (opt *Options) Run(suite *TestSuite, junitSuiteName string) error {
 	}
 
 	if len(opt.JUnitDir) > 0 {
-		if err := writeJUnitReport("junit_e2e", junitSuiteName, tests, opt.JUnitDir, duration, opt.ErrOut, syntheticTestResults...); err != nil {
-			fmt.Fprintf(opt.Out, "error: Unable to write e2e JUnit results: %v", err)
+		finalSuiteResults := generateJUnitTestSuiteResults(junitSuiteName, duration, tests, syntheticTestResults...)
+		if err := writeJUnitReport(finalSuiteResults, "junit_e2e", timeSuffix, opt.JUnitDir, opt.ErrOut); err != nil {
+			fmt.Fprintf(opt.Out, "error: Unable to write e2e JUnit xml results: %v", err)
+		}
+
+		if err := riskanalysis.WriteJobRunTestFailureSummary(opt.JUnitDir, timeSuffix, finalSuiteResults); err != nil {
+			fmt.Fprintf(opt.Out, "error: Unable to write e2e job run failures summary: %v", err)
 		}
 	}
 

diff --git a/pkg/test/ginkgo/cmd_runtest.go b/pkg/test/ginkgo/cmd_runtest.go
@@ -100,7 +100,9 @@ func (opt *TestOptions) Run(args []string) error {
 		if err := opt.MonitorEventsOptions.End(ctx, restConfig, ""); err != nil {
 			return err
 		}
-		if err := opt.MonitorEventsOptions.WriteRunDataToArtifactsDir(""); err != nil {
+		timeSuffix := fmt.Sprintf("_%s", opt.MonitorEventsOptions.GetStartTime().
+			UTC().Format("20060102-150405"))
+		if err := opt.MonitorEventsOptions.WriteRunDataToArtifactsDir("", timeSuffix); err != nil {
 			fmt.Fprintf(opt.ErrOut, "error: Failed to write run-data: %v\n", err)
 		}
 	}

diff --git a/pkg/test/ginkgo/junit.go b/pkg/test/ginkgo/junit.go
@@ -14,7 +14,12 @@ import (
 	"github.com/openshift/origin/pkg/version"
 )
 
-func writeJUnitReport(filePrefix, name string, tests []*testCase, dir string, duration time.Duration, errOut io.Writer, additionalResults ...*junitapi.JUnitTestCase) error {
+func generateJUnitTestSuiteResults(
+	name string,
+	duration time.Duration,
+	tests []*testCase,
+	syntheticTestResults ...*junitapi.JUnitTestCase) *junitapi.JUnitTestSuite {
+
 	s := &junitapi.JUnitTestSuite{
 		Name:     name,
 		Duration: duration.Seconds(),
@@ -69,7 +74,7 @@ func writeJUnitReport(filePrefix, name string, tests []*testCase, dir string, du
 			})
 		}
 	}
-	for _, result := range additionalResults {
+	for _, result := range syntheticTestResults {
 		switch {
 		case result.SkipMessage != nil:
 			s.NumSkipped++
@@ -79,11 +84,18 @@ func writeJUnitReport(filePrefix, name string, tests []*testCase, dir string, du
 		s.NumTests++
 		s.TestCases = append(s.TestCases, result)
 	}
+	return s
+}
+
+// writeJUnitReport marshals s to XML and writes it to
+// dir/<filePrefix><fileSuffix>.xml, announcing the path on errOut. fileSuffix is
+// used verbatim, so it must carry its own separator (callers pass "_<timestamp>").
+func writeJUnitReport(s *junitapi.JUnitTestSuite, filePrefix, fileSuffix, dir string, errOut io.Writer) error {
 	out, err := xml.Marshal(s)
 	if err != nil {
 		return err
 	}
-	path := filepath.Join(dir, fmt.Sprintf("%s_%s.xml", filePrefix, time.Now().UTC().Format("20060102-150405")))
+	path := filepath.Join(dir, fmt.Sprintf("%s%s.xml", filePrefix, fileSuffix))
 	fmt.Fprintf(errOut, "Writing JUnit report to %s\n\n", path)
 	return ioutil.WriteFile(path, out, 0640)
 }