Skip to content

[BUG] Error during multiIngest because of RegionMerge #268

@marsishandsome

Description

@marsishandsome

In the design doc https://github.com/tikv/rfcs/blob/master/text/0072-online-bulk-load-for-rawkv.md, one thing is missing.

During bulk load, we first split and scatter regions, and then sort the data. If sorting takes more than 10 minutes, PD will try to merge the regions we just split, which causes errors during ingest.

We should let PD pause region merging during ingest (just like Lightning does).

PR tikv/pd#4092 tries to add support in PD for pausing region merge.

org.tikv.common.exception.GrpcException: message: "peer is not leader for region 1324, leader may Some(id: 1327 store_id: 4)"
not_leader {
  region_id: 1324
  leader {
    id: 1327
    store_id: 4
  }
}

	at org.tikv.common.importer.ImporterStoreClient.multiIngest(ImporterStoreClient.java:152)
	at org.tikv.common.importer.ImporterClient.ingest(ImporterClient.java:224)
	at org.tikv.common.importer.ImporterClient.write(ImporterClient.java:96)
	at org.tikv.bulkload.RawKVBulkLoader.writeAndIngest(RawKVBulkLoader.scala:152)
	at org.tikv.bulkload.RawKVBulkLoader.$anonfun$bulkLoad$5(RawKVBulkLoader.scala:110)
	at org.tikv.bulkload.RawKVBulkLoader.$anonfun$bulkLoad$5$adapted(RawKVBulkLoader.scala:109)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2(RDD.scala:994)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2$adapted(RDD.scala:994)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2154)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:127)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:463)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:466)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
org.tikv.common.exception.GrpcException: message: "EpochNotMatch current epoch of region 4601 is conf_ver: 5 version: 71, but you sent conf_ver: 5 version: 69"
epoch_not_match {
  current_regions {
    id: 4601
    start_key: "c0fb99f6463525f16782957af25b21a4"
    end_key: "c13256dc9dc9fedebb7f8fa623274d86"
    region_epoch {
      conf_ver: 5
      version: 71
    }
    peers {
      id: 4602
      store_id: 1
    }
    peers {
      id: 4603
      store_id: 4
    }
    peers {
      id: 4604
      store_id: 7
    }
  }
  current_regions {
    id: 3001
    start_key: "c0d229b6079d6a3e32cb8294ef2ead09"
    end_key: "c0fb99f6463525f16782957af25b21a4"
    region_epoch {
      conf_ver: 5
      version: 68
    }
    peers {
      id: 3002
      store_id: 1
    }
    peers {
      id: 3003
      store_id: 4
    }
    peers {
      id: 3004
      store_id: 7
    }
  }
  current_regions {
    id: 3629
    start_key: "c09bda130a29e4b1a3fca3290b6e0e7a"
    end_key: "c0d229b6079d6a3e32cb8294ef2ead09"
    region_epoch {
      conf_ver: 5
      version: 68
    }
    peers {
      id: 3630
      store_id: 1
    }
    peers {
      id: 3631
      store_id: 4
    }
    peers {
      id: 3632
      store_id: 7
    }
  }
}

 at org.tikv.common.importer.ImporterStoreClient.multiIngest(ImporterStoreClient.java:152)
 at org.tikv.common.importer.ImporterClient.ingest(ImporterClient.java:224)
 at org.tikv.common.importer.ImporterClient.write(ImporterClient.java:96)
 at org.tikv.bulkload.RawKVBulkLoader.writeAndIngest(RawKVBulkLoader.scala:152)
 at org.tikv.bulkload.RawKVBulkLoader.$anonfun$bulkLoad$5(RawKVBulkLoader.scala:110)
 at org.tikv.bulkload.RawKVBulkLoader.$anonfun$bulkLoad$5$adapted(RawKVBulkLoader.scala:109)
 at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2(RDD.scala:994)
 at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2$adapted(RDD.scala:994)
 at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2139)
 at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
 at org.apache.spark.scheduler.Task.run(Task.scala:127)
 at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:446)
 at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
 at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:449)
 at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
 at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
 at java.lang.Thread.run(Thread.java:748)
org.tikv.common.exception.RegionException: Region Exception occurred region 5697 not found
	at org.tikv.common.importer.ImporterStoreClient.multiIngest(ImporterStoreClient.java:153)
	at org.tikv.common.importer.ImporterClient.ingest(ImporterClient.java:230)
	at org.tikv.common.importer.ImporterClient.write(ImporterClient.java:101)
	at org.tikv.bulkload.RawKVBulkLoader.writeAndIngest(RawKVBulkLoader.scala:152)
	at org.tikv.bulkload.RawKVBulkLoader.$anonfun$bulkLoad$5(RawKVBulkLoader.scala:110)
	at org.tikv.bulkload.RawKVBulkLoader.$anonfun$bulkLoad$5$adapted(RawKVBulkLoader.scala:109)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2(RDD.scala:994)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2$adapted(RDD.scala:994)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2154)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:127)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:463)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:466)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions