diff --git a/pkg/pdutil/pd.go b/pkg/pdutil/pd.go index ddd852be0..6a9d133e6 100644 --- a/pkg/pdutil/pd.go +++ b/pkg/pdutil/pd.go @@ -61,6 +61,7 @@ var ( "shuffle-region-scheduler": {}, "shuffle-hot-region-scheduler": {}, } + // TODO remove this, see https://github.com/pingcap/br/pull/555#discussion_r509855972 pdRegionMergeCfg = []string{ "max-merge-region-keys", "max-merge-region-size", @@ -73,11 +74,12 @@ var ( // DefaultPDCfg find by https://github.com/tikv/pd/blob/master/conf/config.toml. DefaultPDCfg = map[string]interface{}{ - "max-merge-region-keys": 200000, - "max-merge-region-size": 20, - "leader-schedule-limit": 4, - "region-schedule-limit": 2048, - "max-snapshot-count": 3, + "max-merge-region-keys": 200000, + "max-merge-region-size": 20, + "leader-schedule-limit": 4, + "region-schedule-limit": 2048, + "max-snapshot-count": 3, + "enable-location-replacement": "true", } ) @@ -410,6 +412,7 @@ func (p *PdController) UpdatePDScheduleConfig( if e == nil { return nil } + log.Warn("failed to update PD config, will try next", zap.Error(e), zap.String("pd", addr)) } return errors.Annotate(berrors.ErrPDUpdateFailed, "failed to update PD schedule config") } @@ -444,6 +447,12 @@ func restoreSchedulers(ctx context.Context, pd *PdController, clusterCfg cluster if err := pd.UpdatePDScheduleConfig(ctx, scheduleLimitCfg); err != nil { return errors.Annotate(err, "fail to update PD schedule config") } + if locationPlacement, ok := clusterCfg.scheduleCfg["enable-location-replacement"]; ok { + log.Debug("restoring config enable-location-replacement", zap.Any("enable-location-placement", locationPlacement)) + if err := pd.UpdatePDScheduleConfig(ctx, map[string]interface{}{"enable-location-replacement": locationPlacement}); err != nil { + return err + } + } return nil } @@ -485,6 +494,7 @@ func (p *PdController) RemoveSchedulers(ctx context.Context) (undo utils.UndoFun } undo = p.makeUndoFunctionByConfig(clusterConfig{scheduler: removedSchedulers, scheduleCfg: scheduleCfg}) + 
log.Debug("saved PD config", zap.Any("config", scheduleCfg)) disableMergeCfg := make(map[string]interface{}) for _, cfgKey := range pdRegionMergeCfg { @@ -515,7 +525,10 @@ func (p *PdController) RemoveSchedulers(ctx context.Context) (undo utils.UndoFun limit := int(value.(float64)) scheduleLimitCfg[cfgKey] = math.Min(40, float64(limit*len(stores))) } - return undo, p.UpdatePDScheduleConfig(ctx, scheduleLimitCfg) + if err := p.UpdatePDScheduleConfig(ctx, scheduleLimitCfg); err != nil { + return undo, err + } + return undo, p.UpdatePDScheduleConfig(ctx, map[string]interface{}{"enable-location-replacement": "false"}) } // Close close the connection to pd. diff --git a/run-test.sh b/run-test.sh deleted file mode 100755 index cc085d8ea..000000000 --- a/run-test.sh +++ /dev/null @@ -1,6 +0,0 @@ -#! /bin/sh - -apt update && apt install default-mysql-client jq --yes - -cd /brie -TEST_NAME=br_other make integration_test \ No newline at end of file diff --git a/tests/br_other/run.sh b/tests/br_other/run.sh index 143130f69..ed3b26af1 100644 --- a/tests/br_other/run.sh +++ b/tests/br_other/run.sh @@ -83,7 +83,7 @@ curl "http://localhost:$PPROF_PORT/debug/pprof/trace?seconds=1" 2>&1 > /dev/null echo "pprof started..." curl http://$PD_ADDR/pd/api/v1/config/schedule | grep '"disable": false' - +curl http://$PD_ADDR/pd/api/v1/config/schedule | jq '."enable-location-replacement"' | grep "false" backup_fail=0 echo "another backup start expect to fail due to last backup add a lockfile" run_br --pd $PD_ADDR backup full -s "local://$TEST_DIR/$DB/lock" --concurrency 4 || backup_fail=1 @@ -94,7 +94,7 @@ fi # check is there still exists scheduler not in pause. 
pause_schedulers=$(curl http://$PD_ADDR/pd/api/v1/schedulers?status="paused" | grep "scheduler" | wc -l) -if [ "$pause_schedulers" -ne "3" ];then +if [ "$pause_schedulers" -lt "3" ];then echo "TEST: [$TEST_NAME] failed because paused scheduler are not enough" exit 1 fi @@ -120,6 +120,7 @@ then exit 1 fi + default_pd_values='{ "max-merge-region-keys": 200000, "max-merge-region-size": 20, @@ -136,22 +137,25 @@ for key in $(echo $default_pd_values | jq 'keys[]'); do fi done -pd_settings=5 # check is there still exists scheduler in pause. pause_schedulers=$(curl http://$PD_ADDR/pd/api/v1/schedulers?status="paused" | grep "scheduler" | wc -l) -# There shouldn't be any paused schedulers since BR gracfully shutdown. -if [ "$pause_schedulers" -ne "0" ];then + # There shouldn't be any paused schedulers since BR gracefully shutdown. + if [ "$pause_schedulers" -ne "0" ];then echo "TEST: [$TEST_NAME] failed because paused scheduler has changed" exit 1 fi +pd_settings=6 + # balance-region scheduler enabled curl http://$PD_ADDR/pd/api/v1/config/schedule | jq '."schedulers-v2"[] | {disable: .disable, type: ."type" | select (.=="balance-region")}' | grep '"disable": false' || ((pd_settings--)) # balance-leader scheduler enabled curl http://$PD_ADDR/pd/api/v1/config/schedule | jq '."schedulers-v2"[] | {disable: .disable, type: ."type" | select (.=="balance-leader")}' | grep '"disable": false' || ((pd_settings--)) # hot region scheduler enabled curl http://$PD_ADDR/pd/api/v1/config/schedule | jq '."schedulers-v2"[] | {disable: .disable, type: ."type" | select (.=="hot-region")}' | grep '"disable": false' || ((pd_settings--)) +# location replacement enabled +curl http://$PD_ADDR/pd/api/v1/config/schedule | jq '."enable-location-replacement"' | grep "true" || ((pd_settings--)) # we need reset pd config to default # until pd has the solution to temporary set these scheduler/configs. 
@@ -163,7 +167,7 @@ curl http://$PD_ADDR/pd/api/v1/config/schedule | jq '."max-merge-region-size"' | # max-merge-region-keys set to default 200000 curl http://$PD_ADDR/pd/api/v1/config/schedule | jq '."max-merge-region-keys"' | grep "200000" || ((pd_settings--)) -if [ "$pd_settings" -ne "5" ];then +if [ "$pd_settings" -ne "6" ];then echo "TEST: [$TEST_NAME] test validate reset pd config failed!" exit 1 fi diff --git a/tests/br_tiflash/run.sh b/tests/br_tiflash/run.sh index dab046c52..d19c55788 100644 --- a/tests/br_tiflash/run.sh +++ b/tests/br_tiflash/run.sh @@ -42,6 +42,7 @@ while ! [ $(run_sql "select * from information_schema.tiflash_replica" | grep "P echo "Waiting for TiFlash synchronizing [$i]." if [ $i -gt 20 ]; then echo "Failed to sync data to tiflash." + exit 1 fi sleep 5 done @@ -52,12 +53,14 @@ run_br backup full -s "local://$TEST_DIR/$DB" --pd $PD_ADDR run_sql "DROP DATABASE $DB" run_br restore full -s "local://$TEST_DIR/$DB" --pd $PD_ADDR +# waiting for TiFlash sync +sleep 90 AFTER_BR_COUNT=`run_sql "SELECT count(*) FROM $DB.kv;" | sed -n "s/[^0-9]//g;/^[0-9]*$/p" | tail -n1` -if [ $AFTER_BR_COUNT -ne $RECORD_COUNT ]; then +if [ "$AFTER_BR_COUNT" -ne "$RECORD_COUNT" ]; then echo "failed to restore, before: $RECORD_COUNT; after: $AFTER_BR_COUNT" exit 1 fi run_sql "DROP DATABASE $DB" -echo "TEST $TEST_NAME passed!" \ No newline at end of file +echo "TEST $TEST_NAME passed!"