diff --git a/cmd/openshift-tests/openshift-tests.go b/cmd/openshift-tests/openshift-tests.go index a9ac822e92e9..fb2174128c06 100644 --- a/cmd/openshift-tests/openshift-tests.go +++ b/cmd/openshift-tests/openshift-tests.go @@ -13,29 +13,30 @@ import ( "syscall" "time" + "github.com/openshift/origin/pkg/monitor" "github.com/openshift/origin/pkg/monitor/monitor_cmd" + "github.com/openshift/origin/pkg/monitor/resourcewatch/cmd" + "github.com/openshift/origin/pkg/riskanalysis" + testginkgo "github.com/openshift/origin/pkg/test/ginkgo" + "github.com/openshift/origin/pkg/version" + exutil "github.com/openshift/origin/test/extended/util" + "github.com/openshift/origin/test/extended/util/cluster" + "github.com/openshift/origin/test/extended/util/disruption/controlplane" "github.com/openshift/origin/test/extended/util/disruption/externalservice" + "github.com/openshift/origin/test/extended/util/disruption/frontends" - "k8s.io/cli-runtime/pkg/genericclioptions" - - "github.com/onsi/ginkgo" "github.com/openshift/library-go/pkg/image/reference" "github.com/openshift/library-go/pkg/serviceability" - "github.com/spf13/cobra" - "github.com/spf13/pflag" + + "k8s.io/cli-runtime/pkg/genericclioptions" utilflag "k8s.io/component-base/cli/flag" "k8s.io/component-base/logs" "k8s.io/klog/v2" "k8s.io/kubectl/pkg/util/templates" - "github.com/openshift/origin/pkg/monitor" - "github.com/openshift/origin/pkg/monitor/resourcewatch/cmd" - testginkgo "github.com/openshift/origin/pkg/test/ginkgo" - "github.com/openshift/origin/pkg/version" - exutil "github.com/openshift/origin/test/extended/util" - "github.com/openshift/origin/test/extended/util/cluster" - "github.com/openshift/origin/test/extended/util/disruption/controlplane" - "github.com/openshift/origin/test/extended/util/disruption/frontends" + "github.com/onsi/ginkgo" + "github.com/spf13/cobra" + "github.com/spf13/pflag" ) func main() { @@ -76,6 +77,7 @@ func main() { newImagesCommand(), newRunTestCommand(), newRunMonitorCommand(), + 
newTestFailureRiskAnalysisCommand(),
 		cmd.NewRunResourceWatchCommand(),
 		monitor_cmd.NewTimelineCommand(genericclioptions.IOStreams{
 			In:     os.Stdin,
@@ -130,6 +132,46 @@ func newRunMonitorCommand() *cobra.Command {
 	return cmd
 }
+// sippyDefaultURL is the sippy risk analysis endpoint used when --sippy-url is
+// not set.
+const sippyDefaultURL = "https://sippy.dptools.openshift.org/api/jobs/runs/risk_analysis"
+
+// newTestFailureRiskAnalysisCommand builds the "risk-analysis" subcommand, which
+// runs riskanalysis.Options against the directory given with --junit-dir.
+func newTestFailureRiskAnalysisCommand() *cobra.Command {
+	riskAnalysisOpts := &riskanalysis.Options{
+		Out:    os.Stdout,
+		ErrOut: os.Stderr,
+	}
+
+	cmd := &cobra.Command{
+		Use:   "risk-analysis",
+		Short: "Performs risk analysis on test failures",
+		Long: templates.LongDesc(`
+Uses the test failure summary json files written along-side our junit xml
+files after an invocation of openshift-tests. If multiple files are present
+(multiple invocations of openshift-tests) we will merge them into one.
+Results are then submitted to sippy which will return an analysis of per-test
+and overall risk level given historical pass rates on the failed tests.
+The resulting analysis is then also written to the junit artifacts directory.
+`),
+
+		RunE: func(cmd *cobra.Command, args []string) error {
+			return riskAnalysisOpts.Run()
+		},
+	}
+	cmd.Flags().StringVar(&riskAnalysisOpts.JUnitDir,
+		"junit-dir", riskAnalysisOpts.JUnitDir,
+		"The directory where test reports were written, and analysis file will be stored.")
+	// MarkFlagRequired only fails when the named flag does not exist, which
+	// cannot happen for the flag registered just above.
+	_ = cmd.MarkFlagRequired("junit-dir")
+	cmd.Flags().StringVar(&riskAnalysisOpts.SippyURL,
+		"sippy-url", sippyDefaultURL,
+		"Sippy URL API endpoint")
+	return cmd
+}
+
 type imagesOptions struct {
 	Repository string
 	Upstream   bool
diff --git a/pkg/riskanalysis/cmd.go b/pkg/riskanalysis/cmd.go
new file mode 100644
index 000000000000..ca75d6b8b15f
--- /dev/null
+++ b/pkg/riskanalysis/cmd.go
@@ -0,0 +1,111 @@
+package riskanalysis
+
+import (
+	"bytes"
+	"encoding/json"
+	"fmt"
+	"io"
+	"io/ioutil"
+	"net/http"
+	"os"
+	"path/filepath"
+
+	"github.com/pkg/errors"
+)
+
+// Options is used to run a risk analysis to determine how severe or unusual
+// the test failures in an openshift-tests run were.
+type Options struct {
+	// Out receives progress output. ErrOut is set by the caller for error output.
+	Out, ErrOut io.Writer
+	// JUnitDir is the directory that is scanned for test failure summary files
+	// and that risk-analysis.json is written to.
+	JUnitDir string
+	// SippyURL is the sippy API endpoint the merged job run is submitted to.
+	SippyURL string
+}
+
+const testFailureSummaryFilePrefix = "test-failures-summary"
+
+// Run performs the test risk analysis by reading the test failure summary files from the test
+// run, merging them into one job run, submitting that to sippy, and writing out the analysis
+// result as a new artifact (risk-analysis.json) in JUnitDir.
+//
+// An error is returned when no summary files are found, when the files name different jobs,
+// when sippy cannot be reached or answers with a non-2xx status, or when a file cannot be
+// read, parsed or written.
+func (opt *Options) Run() error {
+	fmt.Fprintf(opt.Out, "Scanning for %s files in: %s\n", testFailureSummaryFilePrefix, opt.JUnitDir)
+
+	resultFiles, err := filepath.Glob(filepath.Join(opt.JUnitDir, testFailureSummaryFilePrefix+"*.json"))
+	if err != nil {
+		return err
+	}
+	fmt.Fprintf(opt.Out, "Found files: %v\n", resultFiles)
+	if len(resultFiles) == 0 {
+		// Without this guard a nil job run would be marshalled as "null" and submitted.
+		return fmt.Errorf("no %s files found in %s", testFailureSummaryFilePrefix, opt.JUnitDir)
+	}
+
+	// We will often have more than one output file for this job run because openshift-tests is often
+	// invoked multiple times (pre/post upgrade). We need to merge the data together in this case.
+	var finalProwJobRun *ProwJobRun
+	for _, rf := range resultFiles {
+		data, err := os.ReadFile(rf)
+		if err != nil {
+			return errors.Wrapf(err, "error reading %s", rf)
+		}
+		jobRun := &ProwJobRun{}
+		if err := json.Unmarshal(data, jobRun); err != nil {
+			return errors.Wrapf(err, "error unmarshalling ProwJob json from %s", rf)
+		}
+		if finalProwJobRun == nil {
+			finalProwJobRun = jobRun
+			continue
+		}
+		if jobRun.ProwJob.Name != finalProwJobRun.ProwJob.Name {
+			return fmt.Errorf("mismatched job names found in %s files, %s != %s",
+				testFailureSummaryFilePrefix, finalProwJobRun.ProwJob.Name, jobRun.ProwJob.Name)
+		}
+		finalProwJobRun.Tests = append(finalProwJobRun.Tests, jobRun.Tests...)
+	}
+
+	inputBytes, err := json.Marshal(finalProwJobRun)
+	if err != nil {
+		return errors.Wrap(err, "error marshalling results")
+	}
+
+	// The merged job run is sent as the JSON body of a GET request.
+	req, err := http.NewRequest(http.MethodGet, opt.SippyURL, bytes.NewBuffer(inputBytes))
+	if err != nil {
+		// req is nil here (e.g. unparsable SippyURL) and must not be touched.
+		return errors.Wrap(err, "error creating risk analysis request")
+	}
+	req.Header.Set("Content-Type", "application/json")
+	// NOTE(review): this client has no timeout, so an unresponsive server blocks the
+	// command indefinitely; consider setting http.Client.Timeout.
+	client := &http.Client{}
+	resp, err := client.Do(req)
+	if err != nil {
+		return errors.Wrap(err, "error requesting risk analysis from sippy")
+	}
+	defer resp.Body.Close()
+
+	riskAnalysisBytes, err := ioutil.ReadAll(resp.Body)
+	if err != nil {
+		return errors.Wrap(err, "error reading risk analysis response body from sippy")
+	}
+	if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
+		// Do not persist an error response as if it were a valid analysis.
+		return fmt.Errorf("sippy returned %s: %s", resp.Status, string(riskAnalysisBytes))
+	}
+	fmt.Fprintln(opt.Out, "response Body:", string(riskAnalysisBytes))
+
+	outputFile := filepath.Join(opt.JUnitDir, "risk-analysis.json")
+	if err := ioutil.WriteFile(outputFile, riskAnalysisBytes, 0644); err != nil {
+		return errors.Wrap(err, "error writing risk analysis json artifact")
+	}
+	fmt.Fprintf(opt.Out, "Successfully wrote: %s\n", outputFile)
+
+	return nil
+}
diff --git a/pkg/riskanalysis/types.go b/pkg/riskanalysis/types.go
new file mode 100644
index 000000000000..e4618724353f
--- /dev/null
+++ b/pkg/riskanalysis/types.go
@@ -0,0 +1,30 @@
+package riskanalysis
+
+// Define types, these are subsets of the sippy APIs of the same name, copied here to eliminate a lot of the cruft.
+// ProwJobRunTest defines a join table linking tests to the job runs they execute in, along with the status for
+// that execution.
+// We're getting dangerously close to being able to live push results after a job run.
+
+type ProwJobRun struct {
+	ID      int
+	ProwJob ProwJob
+	Tests   []ProwJobRunTest
+}
+
+type ProwJob struct {
+	Name string
+}
+
+type Test struct {
+	Name string
+}
+
+type Suite struct {
+	Name string
+}
+
+type ProwJobRunTest struct {
+	Test   Test
+	Suite  Suite
+	Status int // sippy status code for the result, see getSippyStatusCode
+}
diff --git a/pkg/riskanalysis/write_test_failure_summary.go b/pkg/riskanalysis/write_test_failure_summary.go
new file mode 100644
index 000000000000..2d6274bba0be
--- /dev/null
+++ b/pkg/riskanalysis/write_test_failure_summary.go
@@ -0,0 +1,103 @@
+package riskanalysis
+
+import (
+	"encoding/json"
+	"fmt"
+	"io/ioutil"
+	"os"
+	"path/filepath"
+	"strconv"
+
+	"github.com/openshift/origin/pkg/test/ginkgo/junitapi"
+)
+
+// Status codes sippy uses internally for test results.
+const (
+	sippyStatusFailure = 12
+	sippyStatusFlake   = 13
+)
+
+// WriteJobRunTestFailureSummary writes a more minimal json file summarizing a little info about the
+// job run, and what tests failed. (successful, skipped and, for now, flaked tests are omitted)
+// This is intended to be later submitted to sippy for a risk analysis of how unusual the
+// test failures were, but that final step is handled elsewhere.
+//
+// The file is written to artifactDir and is named testFailureSummaryFilePrefix followed by
+// timeSuffix. The job run ID and job name are read from the BUILD_ID and JOB_NAME environment
+// variables. Failed tests are listed in the order they first appear in finalSuiteResults so
+// that the output is deterministic.
+func WriteJobRunTestFailureSummary(artifactDir, timeSuffix string, finalSuiteResults *junitapi.JUnitTestSuite) error {
+	// A test name can appear more than once, so outcomes are accumulated per name.
+	tests := map[string]*passFail{}
+	// testNames keeps first-seen order; ranging over the map would yield a random order.
+	testNames := make([]string, 0, len(finalSuiteResults.TestCases))
+
+	for _, testCase := range finalSuiteResults.TestCases {
+		result, ok := tests[testCase.Name]
+		if !ok {
+			result = &passFail{}
+			tests[testCase.Name] = result
+			testNames = append(testNames, testCase.Name)
+		}
+		switch {
+		case testCase.SkipMessage != nil:
+			// a skipped run counts as neither a pass nor a failure
+		case testCase.FailureOutput != nil:
+			result.Failed = true
+		default:
+			result.Passed = true
+		}
+	}
+
+	// If we can't parse this, we submit without it, it is not required.
+	jobRunID, _ := strconv.Atoi(os.Getenv("BUILD_ID"))
+
+	jr := ProwJobRun{
+		ID:      jobRunID,
+		ProwJob: ProwJob{Name: os.Getenv("JOB_NAME")},
+		Tests:   []ProwJobRunTest{}, // non-nil so an empty list is encoded as [] rather than null
+	}
+
+	for _, name := range testNames {
+		result := tests[name]
+		// Only outright failures are reported: tests without a failure are neither a fail nor
+		// a flake, and flakes (failed and passed) are skipped for now as we're not ready to
+		// process them yet.
+		if !result.Failed || result.Passed {
+			continue
+		}
+		jr.Tests = append(jr.Tests, ProwJobRunTest{
+			Test:   Test{Name: name},
+			Suite:  Suite{Name: finalSuiteResults.Name},
+			Status: getSippyStatusCode(result),
+		})
+	}
+
+	jsonContent, err := json.MarshalIndent(jr, "", " ")
+	if err != nil {
+		return err
+	}
+	outputFile := filepath.Join(artifactDir, fmt.Sprintf("%s%s.json",
+		testFailureSummaryFilePrefix, timeSuffix))
+	return ioutil.WriteFile(outputFile, jsonContent, 0644)
+}
+
+// passFail is a simple struct to track test names which can appear more than once.
+// If both passed and failed are true, it was a flake.
+type passFail struct {
+	Passed bool
+	Failed bool
+}
+
+// getSippyStatusCode returns the code sippy uses internally for each type of failure,
+// or 0 when pf does not record a failure.
+func getSippyStatusCode(pf *passFail) int {
+	switch {
+	case pf.Failed && pf.Passed:
+		return sippyStatusFlake
+	case pf.Failed:
+		return sippyStatusFailure
+	}
+	// we should not hit this given the filtering done before calling
+	return 0
+}
diff --git a/pkg/test/ginkgo/cmd_runsuite.go b/pkg/test/ginkgo/cmd_runsuite.go
index 13e83f90547c..8eaf00791cd7 100644
--- a/pkg/test/ginkgo/cmd_runsuite.go
+++ b/pkg/test/ginkgo/cmd_runsuite.go
@@ -16,6 +16,7 @@ import (
 	"github.com/onsi/ginkgo/config"
 	"github.com/openshift/origin/pkg/monitor"
+	"github.com/openshift/origin/pkg/riskanalysis"
 	"github.com/openshift/origin/pkg/test/ginkgo/junitapi"
 	"k8s.io/apimachinery/pkg/util/sets"
 )
@@ -444,11 +445,14 @@ func (opt *Options) Run(suite *TestSuite, junitSuiteName string) error {
 	var syntheticTestResults []*junitapi.JUnitTestCase
 	var syntheticFailure bool
+	timeSuffix := fmt.Sprintf("_%s", opt.MonitorEventsOptions.GetStartTime().
+		UTC().Format("20060102-150405"))
+
+
 	if err := opt.MonitorEventsOptions.End(ctx, restConfig, opt.JUnitDir); err != nil {
 		return err
 	}
 	if len(opt.JUnitDir) > 0 {
-		if err := opt.MonitorEventsOptions.WriteRunDataToArtifactsDir(opt.JUnitDir); err != nil {
+		if err := opt.MonitorEventsOptions.WriteRunDataToArtifactsDir(opt.JUnitDir, timeSuffix); err != nil {
 			fmt.Fprintf(opt.ErrOut, "error: Failed to write run-data: %v\n", err)
 		}
 	}
@@ -497,8 +501,13 @@ func (opt *Options) Run(suite *TestSuite, junitSuiteName string) error {
 	}
 	if len(opt.JUnitDir) > 0 {
-		if err := writeJUnitReport("junit_e2e", junitSuiteName, tests, opt.JUnitDir, duration, opt.ErrOut, syntheticTestResults...); err != nil {
-			fmt.Fprintf(opt.Out, "error: Unable to write e2e JUnit results: %v", err)
+		finalSuiteResults := generateJUnitTestSuiteResults(junitSuiteName, duration, tests, syntheticTestResults...)
+		if err := writeJUnitReport(finalSuiteResults, "junit_e2e", timeSuffix, opt.JUnitDir, opt.ErrOut); err != nil {
+			fmt.Fprintf(opt.Out, "error: Unable to write e2e JUnit xml results: %v\n", err)
+		}
+
+		if err := riskanalysis.WriteJobRunTestFailureSummary(opt.JUnitDir, timeSuffix, finalSuiteResults); err != nil {
+			fmt.Fprintf(opt.Out, "error: Unable to write e2e job run failures summary: %v\n", err)
 		}
 	}
diff --git a/pkg/test/ginkgo/cmd_runtest.go b/pkg/test/ginkgo/cmd_runtest.go
index 9cf98d37a380..2067f660ee20 100644
--- a/pkg/test/ginkgo/cmd_runtest.go
+++ b/pkg/test/ginkgo/cmd_runtest.go
@@ -100,7 +100,9 @@ func (opt *TestOptions) Run(args []string) error {
 		if err := opt.MonitorEventsOptions.End(ctx, restConfig, ""); err != nil {
 			return err
 		}
-		if err := opt.MonitorEventsOptions.WriteRunDataToArtifactsDir(""); err != nil {
+		timeSuffix := fmt.Sprintf("_%s", opt.MonitorEventsOptions.GetStartTime().
+			UTC().Format("20060102-150405"))
+		if err := opt.MonitorEventsOptions.WriteRunDataToArtifactsDir("", timeSuffix); err != nil {
 			fmt.Fprintf(opt.ErrOut, "error: Failed to write run-data: %v\n", err)
 		}
 	}
diff --git a/pkg/test/ginkgo/junit.go b/pkg/test/ginkgo/junit.go
index aa935bdcc421..bf56b8ab5fb2 100644
--- a/pkg/test/ginkgo/junit.go
+++ b/pkg/test/ginkgo/junit.go
@@ -14,7 +14,12 @@ import (
 	"github.com/openshift/origin/pkg/version"
 )
-func writeJUnitReport(filePrefix, name string, tests []*testCase, dir string, duration time.Duration, errOut io.Writer, additionalResults ...*junitapi.JUnitTestCase) error {
+func generateJUnitTestSuiteResults(
+	name string,
+	duration time.Duration,
+	tests []*testCase,
+	syntheticTestResults ...*junitapi.JUnitTestCase) *junitapi.JUnitTestSuite {
+
 	s := &junitapi.JUnitTestSuite{
 		Name:     name,
 		Duration: duration.Seconds(),
@@ -69,7 +74,7 @@ func writeJUnitReport(filePrefix, name string, tests []*testCase, dir string, du
 			})
 		}
 	}
-	for _, result := range additionalResults {
+	for _, result := range syntheticTestResults {
 		switch {
 		case result.SkipMessage != nil:
 			s.NumSkipped++
@@ -79,11 +84,18 @@ func writeJUnitReport(filePrefix, name string, tests []*testCase, dir string, du
 		s.NumTests++
 		s.TestCases = append(s.TestCases, result)
 	}
+	return s
+}
+
+// writeJUnitReport marshals s to XML and writes it to dir as filePrefix followed by
+// fileSuffix and ".xml". fileSuffix is appended verbatim, so callers pass it with its
+// own leading separator (e.g. "_20060102-150405").
+func writeJUnitReport(s *junitapi.JUnitTestSuite, filePrefix, fileSuffix, dir string, errOut io.Writer) error {
 	out, err := xml.Marshal(s)
 	if err != nil {
 		return err
 	}
-	path := filepath.Join(dir, fmt.Sprintf("%s_%s.xml", filePrefix, time.Now().UTC().Format("20060102-150405")))
+	path := filepath.Join(dir, fmt.Sprintf("%s%s.xml", filePrefix, fileSuffix))
 	fmt.Fprintf(errOut, "Writing JUnit report to %s\n\n", path)
 	return ioutil.WriteFile(path, out, 0640)
 }
diff --git a/pkg/test/ginkgo/options_monitor_events.go b/pkg/test/ginkgo/options_monitor_events.go
index 229e54921b23..ae9c61b3a64a 100644
--- a/pkg/test/ginkgo/options_monitor_events.go
+++ b/pkg/test/ginkgo/options_monitor_events.go
@@ -161,12 +161,15 @@ func (o *MonitorEventsOptions) GetRecordedResources() monitorapi.ResourcesMap {
 	return o.recordedResources
 }
+func (o *MonitorEventsOptions) GetStartTime() *time.Time {
+	return o.startTime
+}
+
 // WriteRunDataToArtifactsDir attempts to write useful run data to the specified directory.
-func (o *MonitorEventsOptions) WriteRunDataToArtifactsDir(artifactDir string) error {
+func (o *MonitorEventsOptions) WriteRunDataToArtifactsDir(artifactDir string, timeSuffix string) error {
 	if o.endTime == nil {
 		return fmt.Errorf("not ended")
 	}
-	timeSuffix := fmt.Sprintf("_%s", o.startTime.UTC().Format("20060102-150405"))
 	errs := []error{}