From 4355bcb601edce2a8e370f478086f3a91e2aa8c7 Mon Sep 17 00:00:00 2001 From: lance6716 Date: Tue, 13 Oct 2020 15:40:23 +0800 Subject: [PATCH 01/13] add wait timeout --- dm/master/shardddl/pessimist_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dm/master/shardddl/pessimist_test.go b/dm/master/shardddl/pessimist_test.go index 64eadf53f7..2498a9536a 100644 --- a/dm/master/shardddl/pessimist_test.go +++ b/dm/master/shardddl/pessimist_test.go @@ -201,7 +201,7 @@ func (t *testPessimist) testPessimistProgress(c *C, restart int) { done, _, err = pessimism.PutOperationDeleteExistInfo(etcdTestCli, op12c, i12) c.Assert(err, IsNil) c.Assert(done, IsTrue) - c.Assert(utils.WaitSomething(30, 100*time.Millisecond, func() bool { + c.Assert(utils.WaitSomething(50, 100*time.Millisecond, func() bool { _, ok := p.Locks()[ID1] return !ok }), IsTrue) From af043212c10c4ee8503961bb1e48def174631423 Mon Sep 17 00:00:00 2001 From: lance6716 Date: Thu, 15 Oct 2020 11:16:00 +0800 Subject: [PATCH 02/13] improve retry for sync_diff --- tests/ha_cases/run.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/ha_cases/run.sh b/tests/ha_cases/run.sh index d7408d763a..3bcf3ede5d 100755 --- a/tests/ha_cases/run.sh +++ b/tests/ha_cases/run.sh @@ -72,8 +72,8 @@ function test_multi_task_running() { sleep 5 # wait for flush checkpoint echo "use sync_diff_inspector to check increment data" - check_sync_diff $WORK_DIR $cur/conf/diff_config.toml 10 || print_debug_status - check_sync_diff $WORK_DIR $cur/conf/diff_config_multi_task.toml 10 || print_debug_status + check_sync_diff $WORK_DIR $cur/conf/diff_config.toml 50 || print_debug_status + check_sync_diff $WORK_DIR $cur/conf/diff_config_multi_task.toml 50 || print_debug_status echo "[$(date)] <<<<<< finish test_multi_task_running >>>>>>" } From 582a0141a57487d037ad636fe3ce15d474f4dfe4 Mon Sep 17 00:00:00 2001 From: csuzhangxc Date: Thu, 15 Oct 2020 11:45:56 +0800 Subject: [PATCH 03/13] Makefile: check coverage --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 98f874945f..3174363937 100644 --- a/Makefile +++ b/Makefile @@ -192,8 +192,8 @@ coverage_fix_cover_mode: sed -i "s/mode: count/mode: atomic/g" $(TEST_DIR)/cov.*.dmctl.*.out coverage: coverage_fix_cover_mode retool_setup - retool do gocovmerge "$(TEST_DIR)"/cov.* | grep -vE ".*.pb.go|.*.__failpoint_binding__.go|.*debug-tools.*|.*portal.*|.*chaos.*" > "$(TEST_DIR)/all_cov.out" - retool do gocovmerge "$(TEST_DIR)"/cov.unit_test*.out | grep -vE ".*.pb.go|.*.__failpoint_binding__.go|.*debug-tools.*|.*portal.*|.*chaos.*" > $(TEST_DIR)/unit_test.out + retool do gocovmerge "$(TEST_DIR)"/cov.* | grep -vE ".*.pb.go|.*.__failpoint_binding__.go|.*debug-tools.*|.*portal.*|.*chaos.*|.*ctl.*|.*loader.*|.*relay.*" > "$(TEST_DIR)/all_cov.out" + retool do gocovmerge "$(TEST_DIR)"/cov.unit_test*.out | grep -vE ".*.pb.go|.*.__failpoint_binding__.go|.*debug-tools.*|.*portal.*|.*chaos.*|.*ctl.*|.*loader.*|.*relay.*" > $(TEST_DIR)/unit_test.out ifeq ("$(JenkinsCI)", "1") @bash <(curl -s https://codecov.io/bash) -f $(TEST_DIR)/unit_test.out -t $(CODECOV_TOKEN) @retool do goveralls -coverprofile=$(TEST_DIR)/all_cov.out -service=jenkins-ci -repotoken $(COVERALLS_TOKEN) From e80e24b2b644af24c149d7dde78f638a2559c262 Mon Sep 17 00:00:00 2001 From: lance6716 Date: Thu, 15 Oct 2020 13:22:44 +0800 Subject: [PATCH 04/13] improve robust for test_pause_task --- tests/ha_cases/run.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/ha_cases/run.sh b/tests/ha_cases/run.sh index 3bcf3ede5d..736de4c7a5 100755 --- a/tests/ha_cases/run.sh +++ b/tests/ha_cases/run.sh @@ -421,14 +421,15 @@ function test_pause_task() { task_name=(test test2) for name in ${task_name[@]}; do echo "pause tasks $name" + + # because some SQL may running (often remove checkpoint record), pause will cause that SQL failed + # thus `result` is not true run_dm_ctl $WORK_DIR "127.0.0.1:$MASTER_PORT" \ - "pause-task $name"\ - "\"result\": true" 3 + "pause-task $name" # pause twice, just used to test pause by the way run_dm_ctl $WORK_DIR "127.0.0.1:$MASTER_PORT" \ - "pause-task $name"\ - "\"result\": true" 3 + "pause-task $name" run_dm_ctl $WORK_DIR "127.0.0.1:$MASTER_PORT" \ "query-status $name"\ From 7455d918c4c4cb3495a14b7295ce686d6e18e0a3 Mon Sep 17 00:00:00 2001 From: csuzhangxc Date: Thu, 15 Oct 2020 13:54:51 +0800 Subject: [PATCH 05/13] *: ignore more for coverage --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 3174363937..7c2741adfd 100644 --- a/Makefile +++ b/Makefile @@ -192,8 +192,8 @@ coverage_fix_cover_mode: sed -i "s/mode: count/mode: atomic/g" $(TEST_DIR)/cov.*.dmctl.*.out coverage: coverage_fix_cover_mode retool_setup - retool do gocovmerge "$(TEST_DIR)"/cov.* | grep -vE ".*.pb.go|.*.__failpoint_binding__.go|.*debug-tools.*|.*portal.*|.*chaos.*|.*ctl.*|.*loader.*|.*relay.*" > "$(TEST_DIR)/all_cov.out" - retool do gocovmerge "$(TEST_DIR)"/cov.unit_test*.out | grep -vE ".*.pb.go|.*.__failpoint_binding__.go|.*debug-tools.*|.*portal.*|.*chaos.*|.*ctl.*|.*loader.*|.*relay.*" > $(TEST_DIR)/unit_test.out + retool do gocovmerge "$(TEST_DIR)"/cov.* | grep -vE ".*.pb.go|.*.pb.gw.go|.*.__failpoint_binding__.go|.*debug-tools.*|.*portal.*|.*chaos.*|.*ctl.*|.*loader.*|.*relay.*" > "$(TEST_DIR)/all_cov.out" + retool do gocovmerge "$(TEST_DIR)"/cov.unit_test*.out | grep -vE ".*.pb.go|.*.pb.gw.go|.*.__failpoint_binding__.go|.*debug-tools.*|.*portal.*|.*chaos.*|.*ctl.*|.*loader.*|.*relay.*" > $(TEST_DIR)/unit_test.out ifeq ("$(JenkinsCI)", "1") @bash <(curl -s https://codecov.io/bash) -f $(TEST_DIR)/unit_test.out -t $(CODECOV_TOKEN) @retool do goveralls -coverprofile=$(TEST_DIR)/all_cov.out -service=jenkins-ci -repotoken $(COVERALLS_TOKEN) From ed6ceedd4ce6de993a22002be922f91909fd0f7e Mon Sep 17 00:00:00 2001 From: csuzhangxc Date: Thu, 15 Oct 2020 14:22:17 +0800 Subject: [PATCH 06/13] *: revert for coverage --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 7c2741adfd..1a02b30cdf 100644 --- a/Makefile +++ b/Makefile @@ -192,8 +192,8 @@ coverage_fix_cover_mode: sed -i "s/mode: count/mode: atomic/g" $(TEST_DIR)/cov.*.dmctl.*.out coverage: coverage_fix_cover_mode retool_setup - retool do gocovmerge "$(TEST_DIR)"/cov.* | grep -vE ".*.pb.go|.*.pb.gw.go|.*.__failpoint_binding__.go|.*debug-tools.*|.*portal.*|.*chaos.*|.*ctl.*|.*loader.*|.*relay.*" > "$(TEST_DIR)/all_cov.out" - retool do gocovmerge "$(TEST_DIR)"/cov.unit_test*.out | grep -vE ".*.pb.go|.*.pb.gw.go|.*.__failpoint_binding__.go|.*debug-tools.*|.*portal.*|.*chaos.*|.*ctl.*|.*loader.*|.*relay.*" > $(TEST_DIR)/unit_test.out + retool do gocovmerge "$(TEST_DIR)"/cov.* | grep -vE ".*.pb.go|.*.pb.gw.go|.*.__failpoint_binding__.go|.*debug-tools.*|.*portal.*|.*chaos.*" > "$(TEST_DIR)/all_cov.out" + retool do gocovmerge "$(TEST_DIR)"/cov.unit_test*.out | grep -vE ".*.pb.go|.*.pb.gw.go|.*.__failpoint_binding__.go|.*debug-tools.*|.*portal.*|.*chaos.*" > $(TEST_DIR)/unit_test.out ifeq ("$(JenkinsCI)", "1") @bash <(curl -s https://codecov.io/bash) -f $(TEST_DIR)/unit_test.out -t $(CODECOV_TOKEN) @retool do goveralls -coverprofile=$(TEST_DIR)/all_cov.out -service=jenkins-ci -repotoken $(COVERALLS_TOKEN) From c6c190842346f2b48f51aae28ffe9fe0123850bb Mon Sep 17 00:00:00 2001 From: lance6716 Date: Mon, 26 Oct 2020 10:43:27 +0800 Subject: [PATCH 07/13] save my work --- tests/shardddl3/run.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/shardddl3/run.sh b/tests/shardddl3/run.sh index 06a9a0d2a7..f9630bef83 100644 --- a/tests/shardddl3/run.sh +++ b/tests/shardddl3/run.sh @@ -469,12 +469,14 @@ function DM_101() { function DM_102_CASE() { run_sql_source1 "alter table ${shardddl1}.${tb1} add column new_col1 int default 0;" run_sql_source1 "insert into ${shardddl1}.${tb1} values (1,1);" - run_sql_source2 "alter table ${shardddl1}.${tb1} add column new_col1 int default -1;" +# run_sql_source2 "alter table ${shardddl1}.${tb1} add column new_col1 int default -1;" run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \ "show-ddl-locks" \ "\"ID\": \"test-\`shardddl\`.\`tb\`\"" 1 + read -p 123 + run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \ "unlock-ddl-lock test-\`shardddl\`.\`tb\`" \ "\"result\": true" 1 @@ -570,7 +572,7 @@ function DM_RemoveLock() { function run() { init_cluster init_database - start=71 + start=102 end=103 except=(071 072 073 074 075 083 084 085 086 087 088 089 090 091 092 093) for i in $(seq -f "%03g" ${start} ${end}); do From 25429e6932ab39bb50c3a1fc887fca01d2874bc4 Mon Sep 17 00:00:00 2001 From: lance6716 Date: Mon, 26 Oct 2020 11:55:46 +0800 Subject: [PATCH 08/13] check received DDL --- tests/shardddl3/run.sh | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/tests/shardddl3/run.sh b/tests/shardddl3/run.sh index f9630bef83..cf8051785b 100644 --- a/tests/shardddl3/run.sh +++ b/tests/shardddl3/run.sh @@ -469,7 +469,24 @@ function DM_101() { function DM_102_CASE() { run_sql_source1 "alter table ${shardddl1}.${tb1} add column new_col1 int default 0;" run_sql_source1 "insert into ${shardddl1}.${tb1} values (1,1);" -# run_sql_source2 "alter table ${shardddl1}.${tb1} add column new_col1 int default -1;" + run_sql_source2 "alter table ${shardddl1}.${tb1} add column new_col1 int default -1;" + + sleep 1 + # wait DM receive source2's DDL + found=false + for ((k=0; k<10; k++)); do + content=$($PWD/bin/dmctl.test DEVEL --master-addr=127.0.0.1:$MASTER_PORT query-status test) + master2=$(echo $content | sed 's/"masterBinlog":/"masterBinlog":\n/g' | awk -F')' 'FNR==3{print $1}') + syncer2=$(echo $content | sed 's/"syncerBinlog":/"syncerBinlog":\n/g' | awk -F')' 'FNR==3{print $1}') + if [ "$master2" != "$syncer2" ]; then + found=true + break + fi + done + if [[ $found == false ]]; then + echo "didn't receive mismatched DDL" + exit 2 + fi run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \ "show-ddl-locks" \ @@ -572,7 +589,7 @@ function DM_RemoveLock() { function run() { init_cluster init_database - start=102 + start=71 end=103 except=(071 072 073 074 075 083 084 085 086 087 088 089 090 091 092 093) for i in $(seq -f "%03g" ${start} ${end}); do From c6c6e638b5e9dee5d3c169eddfcd939221bb5b5a Mon Sep 17 00:00:00 2001 From: lance6716 Date: Mon, 26 Oct 2020 11:57:06 +0800 Subject: [PATCH 09/13] remove helper --- tests/shardddl3/run.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/shardddl3/run.sh b/tests/shardddl3/run.sh index cf8051785b..5760ed11a6 100644 --- a/tests/shardddl3/run.sh +++ b/tests/shardddl3/run.sh @@ -492,8 +492,6 @@ function DM_102_CASE() { "show-ddl-locks" \ "\"ID\": \"test-\`shardddl\`.\`tb\`\"" 1 - read -p 123 - run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \ "unlock-ddl-lock test-\`shardddl\`.\`tb\`" \ "\"result\": true" 1 From 5d93432d30ca90acce28aa9e844e7f8ea775a52a Mon Sep 17 00:00:00 2001 From: lance6716 Date: Mon, 26 Oct 2020 14:28:42 +0800 Subject: [PATCH 10/13] fix relay test --- relay/relay_test.go | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/relay/relay_test.go b/relay/relay_test.go index 8ba84484fd..56b961dd64 100644 --- a/relay/relay_test.go +++ b/relay/relay_test.go @@ -566,8 +566,11 @@ func (t *testRelaySuite) TestProcess(c *C) { // kill the binlog dump connection ctx2, cancel2 := context.WithTimeout(context.Background(), 10*time.Second) defer cancel2() - connID, err := getBinlogDumpConnID(ctx2, r.db) - c.Assert(err, IsNil) + var connID uint32 + c.Assert(utils.WaitSomething(30, 100*time.Millisecond, func() bool { + connID, err = getBinlogDumpConnID(ctx2, r.db) + return err == nil + }), IsTrue) _, err = r.db.ExecContext(ctx2, fmt.Sprintf(`KILL %d`, connID)) c.Assert(err, IsNil) From e0432074bcca28f17ba8657cb93f286b94c84db8 Mon Sep 17 00:00:00 2001 From: lance6716 Date: Mon, 26 Oct 2020 17:12:49 +0800 Subject: [PATCH 11/13] use CI to check some error --- tests/_utils/test_prepare | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/_utils/test_prepare b/tests/_utils/test_prepare index 4960512c5b..87e6e21bd4 100644 --- a/tests/_utils/test_prepare +++ b/tests/_utils/test_prepare @@ -199,13 +199,13 @@ function check_log_contain_with_retry() { rc=0 for ((k=1;k<11;k++)); do got=`grep "$text" $log1 | wc -l` - if [[ ! $got = 0 ]]; then + if [[ $got -ne 0 ]]; then rc=1 break fi if [[ ! "$log2" = "" ]]; then got=`grep "$text" $log2 | wc -l` - if [[ ! $got = 0 ]]; then + if [[ $got -ne 0 ]]; then rc=1 break fi @@ -213,7 +213,7 @@ function check_log_contain_with_retry() { echo "check log contain failed $k-th time, retry later" sleep 2 done - if [[ $rc = 0 ]]; then + if [[ $rc -eq 0 ]]; then echo "log dosen't contain $text" exit 1 fi From be85c87c934f73129a4a9b6fc00713d8d34a0473 Mon Sep 17 00:00:00 2001 From: lance6716 Date: Mon, 26 Oct 2020 17:30:21 +0800 Subject: [PATCH 12/13] dynamic wait for newly put DDL --- dm/master/shardddl/optimist.go | 37 ++++++++++++++++++++++++++++++++-- tests/shardddl3/run.sh | 2 +- 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/dm/master/shardddl/optimist.go b/dm/master/shardddl/optimist.go index e31de1f518..b63da694ea 100644 --- a/dm/master/shardddl/optimist.go +++ b/dm/master/shardddl/optimist.go @@ -25,6 +25,7 @@ import ( "go.etcd.io/etcd/clientv3" "go.uber.org/zap" + "github.com/pingcap/dm/dm/common" "github.com/pingcap/dm/dm/config" "github.com/pingcap/dm/dm/master/metrics" "github.com/pingcap/dm/dm/pb" @@ -549,8 +550,40 @@ func (o *Optimist) handleLock(info optimism.Info, tts []optimism.TargetTable, sk func (o *Optimist) removeLock(lock *optimism.Lock) (bool, error) { failpoint.Inject("SleepWhenRemoveLock", func(val failpoint.Value) { t := val.(int) - log.L().Info("wait new ddl info putted into etcd", zap.String("failpoint", "SleepWhenRemoveLock")) - time.Sleep(time.Duration(t) * time.Second) + log.L().Info("wait new ddl info putted into etcd", + zap.String("failpoint", "SleepWhenRemoveLock"), + zap.Int("max wait second", t)) + + ticker := time.NewTicker(time.Second) + defer ticker.Stop() + timer := time.NewTimer(time.Duration(t) * time.Second) + defer timer.Stop() + OUTER: + for { + select { + case <-timer.C: + log.L().Info("failed to wait new DDL info", zap.Int("wait second", t)) + break OUTER + case <-ticker.C: + // manually check etcd + cmps := make([]clientv3.Cmp, 0) + for source, schemaTables := range lock.Ready() { + for schema, tables := range schemaTables { + for table := range tables { + info := optimism.NewInfo(lock.Task, source, schema, table, lock.DownSchema, lock.DownTable, nil, nil, nil) + info.Version = lock.GetVersion(source, schema, table) + key := common.ShardDDLOptimismInfoKeyAdapter.Encode(info.Task, info.Source, info.UpSchema, info.UpTable) + cmps = append(cmps, clientv3.Compare(clientv3.Version(key), "<", info.Version+1)) + } + } + } + resp, _, err := etcdutil.DoOpsInOneCmpsTxnWithRetry(o.cli, cmps, nil, nil) + if err == nil && !resp.Succeeded { + log.L().Info("found new DDL info") + break OUTER + } + } + } }) deleted, err := o.deleteInfosOps(lock) if err != nil { diff --git a/tests/shardddl3/run.sh b/tests/shardddl3/run.sh index 42946dd0bb..5821dc99de 100644 --- a/tests/shardddl3/run.sh +++ b/tests/shardddl3/run.sh @@ -700,7 +700,7 @@ function DM_RemoveLock_CASE() { function DM_RemoveLock() { ps aux | grep dm-master |awk '{print $2}'|xargs kill || true check_port_offline $MASTER_PORT1 20 - export GO_FAILPOINTS="github.com/pingcap/dm/dm/master/shardddl/SleepWhenRemoveLock=return(5)" + export GO_FAILPOINTS="github.com/pingcap/dm/dm/master/shardddl/SleepWhenRemoveLock=return(10)" run_dm_master $WORK_DIR/master $MASTER_PORT $cur/conf/dm-master.toml check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \ From 3676df73ba2e2c29216502d1ae6c28b5186b9a21 Mon Sep 17 00:00:00 2001 From: lance6716 Date: Tue, 27 Oct 2020 11:27:18 +0800 Subject: [PATCH 13/13] add log --- pkg/shardddl/optimism/info_test.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pkg/shardddl/optimism/info_test.go b/pkg/shardddl/optimism/info_test.go index 6479994fa4..efc54679a5 100644 --- a/pkg/shardddl/optimism/info_test.go +++ b/pkg/shardddl/optimism/info_test.go @@ -179,7 +179,11 @@ func (t *testForEtcd) TestInfoEtcd(c *C) { resp, err := etcdTestCli.Txn(context.Background()).Then(deleteOp).Commit() c.Assert(err, IsNil) c.Assert(resp.Succeeded, IsTrue) - <-wch + select { + case err2 := <-ech: + c.Fatal(err2) + case <-wch: + } // put again // version reset to 1