diff --git a/go.mod b/go.mod index 850750f09..180c58f93 100644 --- a/go.mod +++ b/go.mod @@ -8,12 +8,14 @@ require ( github.com/cheggaaa/pb/v3 v3.0.1 github.com/coreos/go-semver v0.3.0 // indirect github.com/coreos/go-systemd v0.0.0-20190719114852-fd7a80b32e1f // indirect + github.com/fatih/color v1.9.0 // indirect github.com/fsouza/fake-gcs-server v1.15.0 github.com/go-sql-driver/mysql v1.4.1 github.com/gogo/protobuf v1.3.1 github.com/golang/snappy v0.0.1 // indirect github.com/google/btree v1.0.0 github.com/google/uuid v1.1.1 + github.com/mattn/go-runewidth v0.0.7 // indirect github.com/onsi/ginkgo v1.10.3 // indirect github.com/onsi/gomega v1.7.1 // indirect github.com/pingcap/check v0.0.0-20191216031241-8a5a85928f12 @@ -33,6 +35,7 @@ require ( go.uber.org/zap v1.13.0 golang.org/x/net v0.0.0-20191011234655-491137f69257 // indirect golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45 + golang.org/x/tools v0.0.0-20200107184032-11e9d9cc0042 // indirect google.golang.org/api v0.14.0 google.golang.org/grpc v1.25.1 ) diff --git a/go.sum b/go.sum index d5e9c891d..0fe4a3024 100644 --- a/go.sum +++ b/go.sum @@ -97,6 +97,8 @@ github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymF github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= github.com/fatih/color v1.7.0 h1:DkWD4oS2D8LGGgTQ6IvwJJXSL5Vp2ffcQg58nFV38Ys= github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= +github.com/fatih/color v1.9.0 h1:8xPHl4/q1VyqGIPif1F+1V3Y3lSmrq01EabUW3CoW5s= +github.com/fatih/color v1.9.0/go.mod h1:eQcE1qtQxscV5RaZvpXrrb8Drkc3/DdQ+uUYCNjL+zU= github.com/fsnotify/fsnotify v1.4.7 h1:IXs+QLmnXW2CcXuY+8Mzv/fWEsPGWxqefPtCP5CnV9I= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= github.com/fsouza/fake-gcs-server v1.15.0 h1:ss/ztlt10Y64A5qslmxZKsiqW/i28t5DkRtv6qSFaLQ= @@ -224,12 +226,18 @@ github.com/magiconair/properties v1.8.0/go.mod 
h1:PppfXfuXeibc/6YijjN8zIbojt8czP github.com/mattn/go-colorable v0.0.9/go.mod h1:9vuHe8Xs5qXnSaW/c/ABM9alt+Vo+STaOChaDxuIBZU= github.com/mattn/go-colorable v0.1.2 h1:/bC9yWikZXAL9uJdulbSfyVNIR3n3trXl+v8+1sx8mU= github.com/mattn/go-colorable v0.1.2/go.mod h1:U0ppj6V5qS13XJ6of8GYAs25YV2eR4EVcfRqFIhoBtE= +github.com/mattn/go-colorable v0.1.4 h1:snbPLB8fVfU9iwbbo30TPtbLRzwWu6aJS6Xh4eaaviA= +github.com/mattn/go-colorable v0.1.4/go.mod h1:U0ppj6V5qS13XJ6of8GYAs25YV2eR4EVcfRqFIhoBtE= github.com/mattn/go-isatty v0.0.4/go.mod h1:M+lRXTBqGeGNdLjl/ufCoiOlB5xdOkqRJdNxMWT7Zi4= github.com/mattn/go-isatty v0.0.8 h1:HLtExJ+uU2HOZ+wI0Tt5DtUDrx8yhUqDcp7fYERX4CE= github.com/mattn/go-isatty v0.0.8/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s= +github.com/mattn/go-isatty v0.0.11 h1:FxPOTFNqGkuDUGi3H/qkUbQO4ZiBa2brKq5r0l8TGeM= +github.com/mattn/go-isatty v0.0.11/go.mod h1:PhnuNfih5lzO57/f3n+odYbM4JtupLOxQOAqxQCu2WE= github.com/mattn/go-runewidth v0.0.2/go.mod h1:LwmH8dsx7+W8Uxz3IHJYH5QSwggIsqBzpuz5H//U1FU= github.com/mattn/go-runewidth v0.0.4 h1:2BvfKmzob6Bmd4YsL0zygOqfdFnK7GR4QL06Do4/p7Y= github.com/mattn/go-runewidth v0.0.4/go.mod h1:LwmH8dsx7+W8Uxz3IHJYH5QSwggIsqBzpuz5H//U1FU= +github.com/mattn/go-runewidth v0.0.7 h1:Ei8KR0497xHyKJPAv59M1dkC+rOZCMBJ+t3fZ+twI54= +github.com/mattn/go-runewidth v0.0.7/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI= github.com/mattn/go-shellwords v1.0.3/go.mod h1:3xCvwCdWdlDJUrvuMn7Wuy9eWs4pE8vqg+NOMyg4B2o= github.com/matttproud/golang_protobuf_extensions v1.0.0/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0= github.com/matttproud/golang_protobuf_extensions v1.0.1 h1:4hp9jkHxhMHkqkrB3Ix0jegS5sx/RkqARlsWZ6pIwiU= @@ -518,6 +526,7 @@ golang.org/x/sys v0.0.0-20190624142023-c5567b49c5d0/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20190826190057-c7b8b68b1456/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190909082730-f460065e899a h1:mIzbOulag9/gXacgxKlFVwpCOWSfBT3/pDyyCwGA9as= golang.org/x/sys 
v0.0.0-20190909082730-f460065e899a/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191210023423-ac6580df4449 h1:gSbV7h1NRL2G1xTg/owz62CST1oJBmxy4QpMMregXVQ= golang.org/x/sys v0.0.0-20191210023423-ac6580df4449/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg= @@ -555,6 +564,8 @@ golang.org/x/tools v0.0.0-20191107010934-f79515f33823/go.mod h1:b+2E5dAYhXwXZwtn golang.org/x/tools v0.0.0-20191115202509-3a792d9c32b2 h1:EtTFh6h4SAKemS+CURDMTDIANuduG5zKEXShyy18bGA= golang.org/x/tools v0.0.0-20191115202509-3a792d9c32b2/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191125144606-a911d9008d1f/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20200107184032-11e9d9cc0042 h1:BKiPVwWbEdmAh+5CBwk13CYeVJQRDJpDnKgDyMOGz9M= +golang.org/x/tools v0.0.0-20200107184032-11e9d9cc0042/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/tools v0.0.0-20191227053925-7b8e75db28f4 h1:Toz2IK7k8rbltAXwNAxKcn9OzqyNfMUhUNjz3sL0NMk= golang.org/x/tools v0.0.0-20191227053925-7b8e75db28f4/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= diff --git a/pkg/restore/backoff.go b/pkg/restore/backoff.go new file mode 100644 index 000000000..dae14e109 --- /dev/null +++ b/pkg/restore/backoff.go @@ -0,0 +1,117 @@ +package restore + +import ( + "time" + + "github.com/pingcap/errors" + "github.com/pingcap/log" + "go.uber.org/zap" + + "github.com/pingcap/br/pkg/utils" +) + +var ( + errNotLeader = errors.NewNoStackError("not leader") + errEpochNotMatch = errors.NewNoStackError("epoch not match") + errKeyNotInRegion = errors.NewNoStackError("key not in region") + errRegionNotFound = 
errors.NewNoStackError("region not found") + errResp = errors.NewNoStackError("response error") + errRewriteRuleNotFound = errors.NewNoStackError("rewrite rule not found") + errRangeIsEmpty = errors.NewNoStackError("range is empty") + errGrpc = errors.NewNoStackError("gRPC error") + + // TODO: add `error` field to `DownloadResponse` to distinguish the errors of gRPC + // from the errors of request + errBadFormat = errors.NewNoStackError("bad format") + errWrongKeyPrefix = errors.NewNoStackError("wrong key prefix") + errFileCorrupted = errors.NewNoStackError("file corrupted") + errCannotRead = errors.NewNoStackError("cannot read externel storage") +) + +const ( + importSSTRetryTimes = 16 + importSSTWaitInterval = 10 * time.Millisecond + importSSTMaxWaitInterval = 1 * time.Second + + downloadSSTRetryTimes = 8 + downloadSSTWaitInterval = 10 * time.Millisecond + downloadSSTMaxWaitInterval = 1 * time.Second + + resetTsRetryTime = 16 + resetTSWaitInterval = 50 * time.Millisecond + resetTSMaxWaitInterval = 500 * time.Millisecond +) + +type importerBackoffer struct { + attempt int + delayTime time.Duration + maxDelayTime time.Duration +} + +func newImportSSTBackoffer() utils.Backoffer { + return &importerBackoffer{ + attempt: importSSTRetryTimes, + delayTime: importSSTWaitInterval, + maxDelayTime: importSSTMaxWaitInterval, + } +} + +func newDownloadSSTBackoffer() utils.Backoffer { + return &importerBackoffer{ + attempt: downloadSSTRetryTimes, + delayTime: downloadSSTWaitInterval, + maxDelayTime: downloadSSTMaxWaitInterval, + } +} + +func (bo *importerBackoffer) NextBackoff(err error) time.Duration { + switch errors.Cause(err) { + case errResp, errGrpc, errEpochNotMatch, errNotLeader: + bo.delayTime = 2 * bo.delayTime + bo.attempt-- + case errRangeIsEmpty, errRewriteRuleNotFound: + // Expected error, finish the operation + bo.delayTime = 0 + bo.attempt = 0 + default: + // Unexpected error + bo.delayTime = 0 + bo.attempt = 0 + log.Warn("unexcepted error, stop to retry",
zap.Error(err)) + } + if bo.delayTime > bo.maxDelayTime { + return bo.maxDelayTime + } + return bo.delayTime +} + +func (bo *importerBackoffer) Attempt() int { + return bo.attempt +} + +type resetTSBackoffer struct { + attempt int + delayTime time.Duration + maxDelayTime time.Duration +} + +func newResetTSBackoffer() utils.Backoffer { + return &resetTSBackoffer{ + attempt: resetTsRetryTime, + delayTime: resetTSWaitInterval, + maxDelayTime: resetTSMaxWaitInterval, + } +} + +func (bo *resetTSBackoffer) NextBackoff(err error) time.Duration { + bo.delayTime = 2 * bo.delayTime + bo.attempt-- + if bo.delayTime > bo.maxDelayTime { + return bo.maxDelayTime + } + return bo.delayTime +} + +func (bo *resetTSBackoffer) Attempt() int { + return bo.attempt +} diff --git a/pkg/restore/backoff_test.go b/pkg/restore/backoff_test.go new file mode 100644 index 000000000..537f0980c --- /dev/null +++ b/pkg/restore/backoff_test.go @@ -0,0 +1,58 @@ +package restore + +import ( + "context" + "time" + + . "github.com/pingcap/check" + "github.com/pingcap/tidb/util/testleak" + + "github.com/pingcap/br/pkg/utils" +) + +var _ = Suite(&testBackofferSuite{}) + +type testBackofferSuite struct { + mock *utils.MockCluster +} + +func (s *testBackofferSuite) SetUpSuite(c *C) { + var err error + s.mock, err = utils.NewMockCluster() + c.Assert(err, IsNil) +} + +func (s *testBackofferSuite) TearDownSuite(c *C) { + testleak.AfterTest(c)() +} + +func (s *testBackofferSuite) TestImporterBackoffer(c *C) { + var counter int + err := utils.WithRetry(context.Background(), func() error { + defer func() { counter++ }() + switch counter { + case 0: + return errGrpc + case 1: + return errResp + case 2: + return errRangeIsEmpty + } + return nil + }, newImportSSTBackoffer()) + c.Assert(counter, Equals, 3) + c.Assert(err, Equals, errRangeIsEmpty) + + counter = 0 + backoffer := importerBackoffer{ + attempt: 10, + delayTime: time.Nanosecond, + maxDelayTime: time.Nanosecond, + } + err = 
utils.WithRetry(context.Background(), func() error { + defer func() { counter++ }() + return errResp + }, &backoffer) + c.Assert(counter, Equals, 10) + c.Assert(err, Equals, errResp) +} diff --git a/pkg/restore/client.go b/pkg/restore/client.go index 5e8df4418..5402d78bc 100644 --- a/pkg/restore/client.go +++ b/pkg/restore/client.go @@ -2,7 +2,6 @@ package restore import ( "context" - "fmt" "math" "sync" "time" @@ -26,15 +25,9 @@ import ( "github.com/pingcap/br/pkg/utils" ) -const ( - resetTsRetryTime = 16 - resetTSWaitInterval = 50 * time.Millisecond - resetTSMaxWaitInterval = 500 * time.Millisecond - - // defaultChecksumConcurrency is the default number of the concurrent - // checksum tasks. - defaultChecksumConcurrency = 64 -) +// defaultChecksumConcurrency is the default number of the concurrent +// checksum tasks. +const defaultChecksumConcurrency = 64 // Client sends requests to restore files type Client struct { @@ -138,13 +131,10 @@ func (rc *Client) ResetTS(pdAddrs []string) error { restoreTS := rc.backupMeta.GetEndVersion() log.Info("reset pd timestamp", zap.Uint64("ts", restoreTS)) i := 0 - return withRetry(func() error { + return utils.WithRetry(rc.ctx, func() error { idx := i % len(pdAddrs) return utils.ResetTS(pdAddrs[idx], restoreTS) - }, func(e error) bool { - i++ - return true - }, resetTsRetryTime, resetTSWaitInterval, resetTSMaxWaitInterval) + }, newResetTSBackoffer()) } // GetDatabases returns all databases. @@ -228,29 +218,28 @@ func (rc *Client) setSpeedLimit() error { return nil } -// RestoreTable tries to restore the data of a table. -func (rc *Client) RestoreTable( - table *utils.Table, +// RestoreFiles tries to restore the files. 
+func (rc *Client) RestoreFiles( + files []*backup.File, rewriteRules *RewriteRules, updateCh chan<- struct{}, ) (err error) { start := time.Now() defer func() { elapsed := time.Since(start) - log.Info("restore table", - zap.Stringer("table", table.Schema.Name), zap.Duration("take", elapsed)) - key := fmt.Sprintf("%s.%s", table.Db.Name.String(), table.Schema.Name.String()) - if err != nil { - summary.CollectFailureUnit(key, err) + if err == nil { + log.Info("Restore Files", + zap.Int("files", len(files)), zap.Duration("take", elapsed)) + summary.CollectSuccessUnit("files", elapsed) + } else { + summary.CollectFailureUnit("files", err) } }() - log.Debug("start to restore table", - zap.Stringer("table", table.Schema.Name), - zap.Stringer("db", table.Db.Name), - zap.Array("files", files(table.Files)), + log.Debug("start to restore files", + zap.Int("files", len(files)), ) - errCh := make(chan error, len(table.Files)) + errCh := make(chan error, len(files)) wg := new(sync.WaitGroup) defer close(errCh) err = rc.setSpeedLimit() @@ -258,7 +247,7 @@ func (rc *Client) RestoreTable( return err } - for _, file := range table.Files { + for _, file := range files { wg.Add(1) fileReplica := file rc.workerPool.Apply( @@ -272,100 +261,18 @@ func (rc *Client) RestoreTable( } }) } - for range table.Files { + for range files { err := <-errCh if err != nil { rc.cancel() wg.Wait() log.Error( - "restore table failed", - zap.Stringer("table", table.Schema.Name), - zap.Stringer("db", table.Db.Name), + "restore files failed", zap.Error(err), ) return err } } - log.Info( - "finish to restore table", - zap.Stringer("table", table.Schema.Name), - zap.Stringer("db", table.Db.Name), - ) - return nil -} - -// RestoreDatabase tries to restore the data of a database -func (rc *Client) RestoreDatabase( - db *utils.Database, - rewriteRules *RewriteRules, - updateCh chan<- struct{}, -) (err error) { - start := time.Now() - defer func() { - elapsed := time.Since(start) - log.Info("Restore Database", 
zap.Stringer("db", db.Schema.Name), zap.Duration("take", elapsed)) - }() - errCh := make(chan error, len(db.Tables)) - wg := new(sync.WaitGroup) - defer close(errCh) - for _, table := range db.Tables { - wg.Add(1) - tblReplica := table - rc.tableWorkerPool.Apply(func() { - defer wg.Done() - select { - case <-rc.ctx.Done(): - errCh <- nil - case errCh <- rc.RestoreTable( - tblReplica, rewriteRules, updateCh): - } - }) - } - for range db.Tables { - err = <-errCh - if err != nil { - wg.Wait() - return err - } - } - return nil -} - -// RestoreAll tries to restore all the data of backup files. -func (rc *Client) RestoreAll( - rewriteRules *RewriteRules, - updateCh chan<- struct{}, -) (err error) { - start := time.Now() - defer func() { - elapsed := time.Since(start) - log.Info("Restore All", zap.Duration("take", elapsed)) - summary.CollectSuccessUnit("restore all", elapsed) - }() - errCh := make(chan error, len(rc.databases)) - wg := new(sync.WaitGroup) - defer close(errCh) - for _, db := range rc.databases { - wg.Add(1) - dbReplica := db - rc.tableWorkerPool.Apply(func() { - defer wg.Done() - select { - case <-rc.ctx.Done(): - errCh <- nil - case errCh <- rc.RestoreDatabase( - dbReplica, rewriteRules, updateCh): - } - }) - } - - for range rc.databases { - err = <-errCh - if err != nil { - wg.Wait() - return err - } - } return nil } diff --git a/pkg/restore/import.go b/pkg/restore/import.go index 77273ebab..01f8456ef 100644 --- a/pkg/restore/import.go +++ b/pkg/restore/import.go @@ -2,6 +2,7 @@ package restore import ( "context" + "strings" "sync" "time" @@ -16,25 +17,10 @@ import ( "google.golang.org/grpc" "github.com/pingcap/br/pkg/summary" + "github.com/pingcap/br/pkg/utils" ) -var ( - errNotLeader = errors.New("not leader") - errEpochNotMatch = errors.New("epoch not match") - errRewriteRuleNotFound = errors.New("rewrite rule not found") - errRangeIsEmpty = errors.New("range is empty") -) - -const ( - importScanResgionTime = 10 * time.Second - importFileRetryTimes = 
16 - importFileWaitInterval = 10 * time.Millisecond - importFileMaxWaitInterval = 1 * time.Second - - downloadSSTRetryTimes = 8 - downloadSSTWaitInterval = 10 * time.Millisecond - downloadSSTMaxWaitInterval = 1 * time.Second -) +const importScanRegionTime = 10 * time.Second // ImporterClient is used to import a file to TiKV type ImporterClient interface { @@ -172,10 +158,9 @@ func (importer *FileImporter) Import(file *backup.File, rewriteRules *RewriteRul log.Debug("rewrite file keys", zap.Stringer("file", file), zap.Binary("startKey", startKey), - zap.Binary("endKey", endKey), - ) - err = withRetry(func() error { - ctx, cancel := context.WithTimeout(importer.ctx, importScanResgionTime) + zap.Binary("endKey", endKey)) + err = utils.WithRetry(importer.ctx, func() error { + ctx, cancel := context.WithTimeout(importer.ctx, importScanRegionTime) defer cancel() // Scan regions covered by the file range regionInfos, err1 := importer.metaClient.ScanRegions(ctx, startKey, endKey, 0) @@ -185,63 +170,56 @@ func (importer *FileImporter) Import(file *backup.File, rewriteRules *RewriteRul log.Debug("scan regions", zap.Stringer("file", file), zap.Int("count", len(regionInfos))) // Try to download and ingest the file in every region for _, regionInfo := range regionInfos { - var downloadMeta *import_sstpb.SSTMeta info := regionInfo // Try to download file. 
- err = withRetry(func() error { - var err2 error - var isEmpty bool - downloadMeta, isEmpty, err2 = importer.downloadSST(info, file, rewriteRules) - if err2 != nil { - if err != errRewriteRuleNotFound { - log.Warn("download file failed", - zap.Stringer("file", file), - zap.Stringer("region", info.Region), - zap.Binary("startKey", startKey), - zap.Binary("endKey", endKey), - zap.Error(err2), - ) - } - return err2 - } - if isEmpty { - log.Info( - "file don't have any key in this region, skip it", - zap.Stringer("file", file), - zap.Stringer("region", info.Region), - ) - return errRangeIsEmpty - } - return nil - }, func(e error) bool { - // Scan regions may return some regions which cannot match any rewrite rule, - // like [t{tableID}, t{tableID}_r), those regions should be skipped - return e != errRewriteRuleNotFound && e != errRangeIsEmpty - }, downloadSSTRetryTimes, downloadSSTWaitInterval, downloadSSTMaxWaitInterval) - if err != nil { - if err == errRewriteRuleNotFound || err == errRangeIsEmpty { + var downloadMeta *import_sstpb.SSTMeta + err1 = utils.WithRetry(importer.ctx, func() error { + var e error + downloadMeta, e = importer.downloadSST(info, file, rewriteRules) + return e + }, newDownloadSSTBackoffer()) + if err1 != nil { + if err1 == errRewriteRuleNotFound || err1 == errRangeIsEmpty { // Skip this region continue } - return err + log.Error("download file failed", + zap.Stringer("file", file), + zap.Stringer("region", info.Region), + zap.Binary("startKey", startKey), + zap.Binary("endKey", endKey), + zap.Error(err1)) + return err1 } - err = importer.ingestSST(downloadMeta, info) - if err != nil { - log.Warn("ingest file failed", + err1 = importer.ingestSST(downloadMeta, info) + // If error is `NotLeader`, update the region info and retry + for errors.Cause(err1) == errNotLeader { + log.Debug("ingest sst returns not leader error, retry it", + zap.Stringer("region", info.Region)) + var newInfo *RegionInfo + newInfo, err1 = 
importer.metaClient.GetRegion(importer.ctx, info.Region.GetStartKey()) + if err1 != nil { + break + } + if !checkRegionEpoch(newInfo, info) { + err1 = errEpochNotMatch + break + } + err1 = importer.ingestSST(downloadMeta, newInfo) + } + if err1 != nil { + log.Error("ingest file failed", zap.Stringer("file", file), zap.Stringer("range", downloadMeta.GetRange()), zap.Stringer("region", info.Region), - zap.Error(err), - ) - return err + zap.Error(err1)) + return err1 } summary.CollectSuccessUnit(summary.TotalKV, file.TotalKvs) summary.CollectSuccessUnit(summary.TotalBytes, file.TotalBytes) } return nil - }, func(e error) bool { - return true - }, importFileRetryTimes, importFileWaitInterval, importFileMaxWaitInterval) + }, newImportSSTBackoffer()) return err } @@ -257,33 +235,25 @@ func (importer *FileImporter) downloadSST( regionInfo *RegionInfo, file *backup.File, rewriteRules *RewriteRules, -) (*import_sstpb.SSTMeta, bool, error) { +) (*import_sstpb.SSTMeta, error) { id, err := uuid.New().MarshalBinary() if err != nil { - return nil, true, errors.Trace(err) + return nil, errors.Trace(err) } // Assume one region reflects to one rewrite rule _, key, err := codec.DecodeBytes(regionInfo.Region.GetStartKey()) if err != nil { - return nil, true, err + return nil, err } regionRule := matchNewPrefix(key, rewriteRules) if regionRule == nil { - log.Debug("cannot find rewrite rule, skip region", - zap.Stringer("region", regionInfo.Region), - zap.Array("tableRule", rules(rewriteRules.Table)), - zap.Array("dataRule", rules(rewriteRules.Data)), - zap.Binary("key", key), - ) - return nil, true, errRewriteRuleNotFound + return nil, errors.Trace(errRewriteRuleNotFound) } rule := import_sstpb.RewriteRule{ OldKeyPrefix: encodeKeyPrefix(regionRule.GetOldKeyPrefix()), NewKeyPrefix: encodeKeyPrefix(regionRule.GetNewKeyPrefix()), } sstMeta := getSSTMetaFromFile(id, file, regionInfo.Region, &rule) - sstMeta.RegionId = regionInfo.Region.GetId() - sstMeta.RegionEpoch = 
regionInfo.Region.GetRegionEpoch() req := &import_sstpb.DownloadRequest{ Sst: sstMeta, StorageBackend: importer.backend, @@ -298,15 +268,15 @@ func (importer *FileImporter) downloadSST( for _, peer := range regionInfo.Region.GetPeers() { resp, err = importer.importClient.DownloadSST(importer.ctx, peer.GetStoreId(), req) if err != nil { - return nil, true, err + return nil, extractDownloadSSTError(err) } if resp.GetIsEmpty() { - return &sstMeta, true, nil + return nil, errors.Trace(errRangeIsEmpty) } } sstMeta.Range.Start = truncateTS(resp.Range.GetStart()) sstMeta.Range.End = truncateTS(resp.Range.GetEnd()) - return &sstMeta, false, nil + return &sstMeta, nil } func (importer *FileImporter) ingestSST( @@ -329,17 +299,45 @@ func (importer *FileImporter) ingestSST( log.Debug("download SST", zap.Stringer("sstMeta", sstMeta)) resp, err := importer.importClient.IngestSST(importer.ctx, leader.GetStoreId(), req) if err != nil { - return err + if strings.Contains(err.Error(), "RegionNotFound") { + return errors.Trace(errRegionNotFound) + } + return errors.Trace(err) } respErr := resp.GetError() if respErr != nil { - if respErr.EpochNotMatch != nil { - return errEpochNotMatch + log.Debug("ingest sst resp error", zap.Stringer("error", respErr)) + if respErr.GetKeyNotInRegion() != nil { + return errors.Trace(errKeyNotInRegion) } - if respErr.NotLeader != nil { - return errNotLeader + if respErr.GetNotLeader() != nil { + return errors.Trace(errNotLeader) } - return errors.Errorf("ingest failed: %v", respErr) + return errors.Wrap(errResp, respErr.String()) } return nil } + +func checkRegionEpoch(new, old *RegionInfo) bool { + if new.Region.GetId() == old.Region.GetId() && + new.Region.GetRegionEpoch().GetVersion() == old.Region.GetRegionEpoch().GetVersion() && + new.Region.GetRegionEpoch().GetConfVer() == old.Region.GetRegionEpoch().GetConfVer() { + return true + } + return false +} + +func extractDownloadSSTError(e error) error { + err := errGrpc + switch { + case 
strings.Contains(e.Error(), "bad format"): + err = errBadFormat + case strings.Contains(e.Error(), "wrong prefix"): + err = errWrongKeyPrefix + case strings.Contains(e.Error(), "corrupted"): + err = errFileCorrupted + case strings.Contains(e.Error(), "Cannot read"): + err = errCannotRead + } + return errors.Trace(err) +} diff --git a/pkg/restore/split.go b/pkg/restore/split.go index 31b23a60f..3248fdd0d 100644 --- a/pkg/restore/split.go +++ b/pkg/restore/split.go @@ -111,7 +111,7 @@ SplitRegions: } time.Sleep(interval) if i > 3 { - log.Warn("splitting regions failed, retry it", zap.Error(err)) + log.Warn("splitting regions failed, retry it", zap.Error(err), zap.ByteStrings("keys", keys)) } continue SplitRegions } @@ -259,6 +259,7 @@ func getSplitKeys(rewriteRules *RewriteRules, ranges []Range, regions []*RegionI splitKeys = make([][]byte, 0, 1) } splitKeyMap[region.Region.GetId()] = append(splitKeys, key) + log.Debug("get key for split region", zap.Binary("key", key), zap.Stringer("region", region.Region)) } } return splitKeyMap diff --git a/pkg/restore/util.go b/pkg/restore/util.go index ea8629470..63ee92969 100644 --- a/pkg/restore/util.go +++ b/pkg/restore/util.go @@ -17,31 +17,12 @@ import ( "github.com/pingcap/tidb/tablecodec" "github.com/pingcap/tidb/util/codec" "go.uber.org/zap" - "go.uber.org/zap/zapcore" "github.com/pingcap/br/pkg/summary" ) var recordPrefixSep = []byte("_r") -type files []*backup.File - -func (fs files) MarshalLogArray(arr zapcore.ArrayEncoder) error { - for i := range fs { - arr.AppendString(fs[i].String()) - } - return nil -} - -type rules []*import_sstpb.RewriteRule - -func (rs rules) MarshalLogArray(arr zapcore.ArrayEncoder) error { - for i := range rs { - arr.AppendString(rs[i].String()) - } - return nil -} - // idAllocator always returns a specified ID type idAllocator struct { id int64 @@ -163,40 +144,11 @@ func getSSTMetaFromFile( Start: rangeStart, End: rangeEnd, }, + RegionId: region.GetId(), + RegionEpoch: 
region.GetRegionEpoch(), } } -type retryableFunc func() error -type continueFunc func(error) bool - -func withRetry( - retryableFunc retryableFunc, - continueFunc continueFunc, - attempts uint, - delayTime time.Duration, - maxDelayTime time.Duration, -) error { - var lastErr error - for i := uint(0); i < attempts; i++ { - err := retryableFunc() - if err != nil { - lastErr = err - // If this is the last attempt, do not wait - if !continueFunc(err) || i == attempts-1 { - break - } - delayTime = 2 * delayTime - if delayTime > maxDelayTime { - delayTime = maxDelayTime - } - time.Sleep(delayTime) - } else { - return nil - } - } - return lastErr -} - // ValidateFileRanges checks and returns the ranges of the files. func ValidateFileRanges( files []*backup.File, diff --git a/pkg/task/restore.go b/pkg/task/restore.go index f2f3caf43..a56a1d6da 100644 --- a/pkg/task/restore.go +++ b/pkg/task/restore.go @@ -139,7 +139,7 @@ func RunRestore(c context.Context, cmdName string, cfg *RestoreConfig) error { if err != nil { return err } - err = client.RestoreAll(rewriteRules, updateCh) + err = client.RestoreFiles(files, rewriteRules, updateCh) // always run the post-work even on error, so we don't stuck in the import mode or emptied schedulers postErr := restorePostWork(ctx, client, mgr, removedSchedulers) diff --git a/pkg/utils/retry.go b/pkg/utils/retry.go new file mode 100644 index 000000000..a8f446764 --- /dev/null +++ b/pkg/utils/retry.go @@ -0,0 +1,40 @@ +package utils + +import ( + "context" + "time" +) + +// RetryableFunc represents a retryable operation +type RetryableFunc func() error + +// Backoffer implements a backoff policy for retrying operations +type Backoffer interface { + // NextBackoff returns a duration to wait before retrying again + NextBackoff(err error) time.Duration + // Attempt returns the remaining attempt times + Attempt() int +} + +// WithRetry retries a given operation with a backoff policy +func WithRetry( + ctx context.Context, + retryableFunc
RetryableFunc, + backoffer Backoffer, +) error { + var lastErr error + for backoffer.Attempt() > 0 { + err := retryableFunc() + if err != nil { + lastErr = err + select { + case <-ctx.Done(): + return lastErr + case <-time.After(backoffer.NextBackoff(err)): + } + } else { + return nil + } + } + return lastErr +} diff --git a/tests/br_full_ddl/run.sh b/tests/br_full_ddl/run.sh index 3db1ecd60..1e40415d7 100755 --- a/tests/br_full_ddl/run.sh +++ b/tests/br_full_ddl/run.sh @@ -28,7 +28,7 @@ for i in $(seq $DDL_COUNT); do run_sql "USE $DB; ALTER TABLE $TABLE ADD INDEX (FIELD$i);" done -for i in $(sql $DDL_COUNT); do +for i in $(seq $DDL_COUNT); do if (( RANDOM % 2 )); then run_sql "USE $DB; ALTER TABLE $TABLE DROP INDEX FIELD$i;" fi