diff --git a/build.gradle b/build.gradle index 4073b69a094..64539348cc3 100644 --- a/build.gradle +++ b/build.gradle @@ -102,7 +102,6 @@ dependencies { compile group: 'joda-time', name: 'joda-time', version:'2.9.9' compile group: 'org.joda', name: 'joda-convert', version:'1.9.2' testCompile group: 'io.grpc', name: 'grpc-testing', version:'1.7.0' - //remove unused hadoop dependencies /*compile group: 'org.apache.logging.log4j', name: 'log4j-api', version:'2.8.1' compile group: 'org.apache.logging.log4j', name: 'log4j-core', version:'2.8.1' compile group: 'org.apache.spark', name: 'spark-core_2.11', version:'2.3.2' diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties index e0b3fb8d70b..5c04802d86f 100644 --- a/gradle/wrapper/gradle-wrapper.properties +++ b/gradle/wrapper/gradle-wrapper.properties @@ -1,5 +1,6 @@ +#Wed Feb 13 17:14:42 CST 2019 distributionBase=GRADLE_USER_HOME distributionPath=wrapper/dists -distributionUrl=https\://services.gradle.org/distributions/gradle-4.10.2-bin.zip zipStoreBase=GRADLE_USER_HOME zipStorePath=wrapper/dists +distributionUrl=https\://services.gradle.org/distributions/gradle-4.10.2-all.zip diff --git a/src/main/java/org/tikv/common/AbstractGRPCClient.java b/src/main/java/org/tikv/common/AbstractGRPCClient.java index 71f4cb06401..ee7e0fca7b9 100644 --- a/src/main/java/org/tikv/common/AbstractGRPCClient.java +++ b/src/main/java/org/tikv/common/AbstractGRPCClient.java @@ -29,21 +29,24 @@ import org.tikv.common.policy.RetryPolicy; import org.tikv.common.streaming.StreamingResponse; import org.tikv.common.util.BackOffer; -import org.tikv.common.util.ChannelFactory; public abstract class AbstractGRPCClient< BlockingStubT extends AbstractStub<BlockingStubT>, StubT extends AbstractStub<StubT>> implements AutoCloseable { protected final Logger logger = Logger.getLogger(this.getClass()); - protected final TiConfiguration conf; - protected final ChannelFactory channelFactory; + protected TiSession session; + protected TiConfiguration conf; - protected AbstractGRPCClient(TiConfiguration conf, ChannelFactory channelFactory) { - this.conf = conf; - this.channelFactory = channelFactory; + protected AbstractGRPCClient(TiSession session) { + this.session = session; + this.conf = session.getConf(); } - protected TiConfiguration getConf() { + public TiSession getSession() { + return session; + } + + public TiConfiguration getConf() { return conf; } diff --git a/src/main/java/org/tikv/common/PDClient.java b/src/main/java/org/tikv/common/PDClient.java index febd83a21a7..8a000a1070e 100644 --- a/src/main/java/org/tikv/common/PDClient.java +++ b/src/main/java/org/tikv/common/PDClient.java @@ -32,7 +32,6 @@ import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; import java.util.function.Supplier; -import org.tikv.common.TiConfiguration.KVMode; import org.tikv.common.codec.Codec.BytesCodec; import org.tikv.common.codec.CodecDataOutput; import org.tikv.common.exception.GrpcException; @@ -42,7 +41,6 @@ import org.tikv.common.operation.PDErrorHandler; import org.tikv.common.region.TiRegion; import org.tikv.common.util.BackOffer; -import org.tikv.common.util.ChannelFactory; import org.tikv.common.util.FutureObserver; import org.tikv.kvproto.Metapb.Store; import org.tikv.kvproto.PDGrpc; @@ -50,7 +48,6 @@ import org.tikv.kvproto.PDGrpc.PDStub; import org.tikv.kvproto.Pdpb.*; -/** PDClient is thread-safe and suggested to be shared threads */ public class PDClient extends AbstractGRPCClient<PDBlockingStub, PDStub> implements ReadOnlyPDClient { private
RequestHeader header; @@ -76,7 +73,7 @@ public TiTimestamp getTimestamp(BackOffer backOffer) { @Override public TiRegion getRegionByKey(BackOffer backOffer, ByteString key) { Supplier<GetRegionRequest> request; - if (conf.getKvMode() == KVMode.RAW) { + if (conf.getKvMode().equalsIgnoreCase("RAW")) { request = () -> GetRegionRequest.newBuilder().setHeader(header).setRegionKey(key).build(); } else { CodecDataOutput cdo = new CodecDataOutput(); @@ -198,8 +195,8 @@ public void close() { } } - public static ReadOnlyPDClient create(TiConfiguration conf, ChannelFactory channelFactory) { - return createRaw(conf, channelFactory); + public static ReadOnlyPDClient create(TiSession session) { + return createRaw(session); } @VisibleForTesting @@ -250,7 +247,7 @@ void close() {} public GetMembersResponse getMembers(HostAndPort url) { try { - ManagedChannel probChan = channelFactory.getChannel(url.getHostText() + ":" + url.getPort()); + ManagedChannel probChan = session.getChannel(url.getHostText() + ":" + url.getPort()); PDGrpc.PDBlockingStub stub = PDGrpc.newBlockingStub(probChan); GetMembersRequest request = GetMembersRequest.newBuilder().setHeader(RequestHeader.getDefaultInstance()).build(); @@ -282,7 +279,7 @@ private boolean createLeaderWrapper(String leaderUrlStr) { } // create new Leader - ManagedChannel clientChannel = channelFactory.getChannel(leaderUrlStr); + ManagedChannel clientChannel = session.getChannel(leaderUrlStr); leaderWrapper = new LeaderWrapper( leaderUrlStr, @@ -333,13 +330,13 @@ protected PDStub getAsyncStub() { .withDeadlineAfter(getConf().getTimeout(), getConf().getTimeoutUnit()); } - private PDClient(TiConfiguration conf, ChannelFactory channelFactory) { - super(conf, channelFactory); + private PDClient(TiSession session) { + super(session); } private void initCluster() { GetMembersResponse resp = null; - List<HostAndPort> pdAddrs = getConf().getPdAddrs(); + List<HostAndPort> pdAddrs = getSession().getConf().getPdAddrs(); for (HostAndPort u : pdAddrs) { resp = getMembers(u); if (resp != null) { @@ -369,10 +366,10 @@ private void initCluster() { TimeUnit.MINUTES); } - static PDClient createRaw(TiConfiguration conf, ChannelFactory channelFactory) { + static PDClient createRaw(TiSession session) { PDClient client = null; try { - client = new PDClient(conf, channelFactory); + client = new PDClient(session); client.initCluster(); } catch (Exception e) { if (client != null) { diff --git a/src/main/java/org/tikv/common/ReadOnlyPDClient.java b/src/main/java/org/tikv/common/ReadOnlyPDClient.java index 59659b31f96..7d48f14e668 100644 --- a/src/main/java/org/tikv/common/ReadOnlyPDClient.java +++ b/src/main/java/org/tikv/common/ReadOnlyPDClient.java @@ -60,4 +60,7 @@ public interface ReadOnlyPDClient { Store getStore(BackOffer backOffer, long storeId); Future<Store> getStoreAsync(BackOffer backOffer, long storeId); + + /** Get the associated session. * @return the session associated with this client */ + TiSession getSession(); } diff --git a/src/main/java/org/tikv/common/Snapshot.java b/src/main/java/org/tikv/common/Snapshot.java new file mode 100644 index 00000000000..666ca9bb7f0 --- /dev/null +++ b/src/main/java/org/tikv/common/Snapshot.java @@ -0,0 +1,108 @@ +/* + * Copyright 2017 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.tikv.common; + +import static org.tikv.common.util.KeyRangeUtils.makeRange; + +import com.google.common.collect.Range; +import com.google.protobuf.ByteString; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import org.tikv.common.exception.TiClientInternalException; +import org.tikv.common.key.Key; +import org.tikv.common.meta.TiTimestamp; +import org.tikv.common.operation.iterator.ConcreteScanIterator; +import org.tikv.common.region.RegionStoreClient; +import org.tikv.common.region.TiRegion; +import org.tikv.common.util.BackOffer; +import org.tikv.common.util.ConcreteBackOffer; +import org.tikv.common.util.Pair; +import org.tikv.kvproto.Kvrpcpb.KvPair; +import org.tikv.kvproto.Metapb.Store; + +public class Snapshot { + private final TiTimestamp timestamp; + private final TiSession session; + private final TiConfiguration conf; + + public Snapshot(TiTimestamp timestamp, TiSession session) { + this.timestamp = timestamp; + this.session = session; + this.conf = session.getConf(); + } + + public TiSession getSession() { + return session; + } + + public long getVersion() { + return timestamp.getVersion(); + } + + public TiTimestamp getTimestamp() { + return timestamp; + } + + public byte[] get(byte[] key) { + ByteString keyString = ByteString.copyFrom(key); + ByteString value = get(keyString); + return value.toByteArray(); + } + + public ByteString get(ByteString key) { + Pair<TiRegion, Store> pair = session.getRegionManager().getRegionStorePairByKey(key); + RegionStoreClient client = RegionStoreClient.create(pair.first, pair.second, getSession()); + // TODO: Need to deal with lock error after grpc stable + return client.get(ConcreteBackOffer.newGetBackOff(), key, timestamp.getVersion()); + } + + public Iterator<KvPair> scan(ByteString startKey) { + return new ConcreteScanIterator(startKey, session, timestamp.getVersion()); + } + + // TODO: Need faster implementation, say concurrent version + // Assume keys sorted + public List<KvPair> batchGet(List<ByteString> keys) { + TiRegion curRegion = null; + Range<Key> curKeyRange = null; + Pair<TiRegion, Store> lastPair; + List<ByteString> keyBuffer = new ArrayList<>(); + List<KvPair> result = new ArrayList<>(keys.size()); + BackOffer backOffer = ConcreteBackOffer.newBatchGetMaxBackOff(); + for (ByteString key : keys) { + if (curRegion == null || !curKeyRange.contains(Key.toRawKey(key))) { + Pair<TiRegion, Store> pair = session.getRegionManager().getRegionStorePairByKey(key); + lastPair = pair; + curRegion = pair.first; + curKeyRange = makeRange(curRegion.getStartKey(), curRegion.getEndKey()); + + try (RegionStoreClient client = + RegionStoreClient.create(lastPair.first, lastPair.second, getSession())) { + List<KvPair> partialResult = + client.batchGet(backOffer, keyBuffer, timestamp.getVersion()); + // TODO: Add lock check + result.addAll(partialResult); + } catch (Exception e) { + throw new TiClientInternalException("Error Closing Store client.", e); + } + keyBuffer = new ArrayList<>(); + keyBuffer.add(key); + } + } + return result; + } +} diff --git a/src/main/java/org/tikv/common/TiConfiguration.java b/src/main/java/org/tikv/common/TiConfiguration.java index b1cfe2fead8..16581006375 100644 --- a/src/main/java/org/tikv/common/TiConfiguration.java +++
b/src/main/java/org/tikv/common/TiConfiguration.java @@ -44,7 +44,7 @@ public class TiConfiguration implements Serializable { private static final IsolationLevel DEF_ISOLATION_LEVEL = IsolationLevel.RC; private static final boolean DEF_SHOW_ROWID = false; private static final String DEF_DB_PREFIX = ""; - private static final KVMode DEF_KV_MODE = KVMode.TXN; + private static final String DEF_KV_MODE = "KV"; private static final int DEF_RAW_CLIENT_CONCURRENCY = 200; private int timeout = DEF_TIMEOUT; @@ -63,14 +63,9 @@ public class TiConfiguration implements Serializable { private int maxRequestKeyRangeSize = MAX_REQUEST_KEY_RANGE_SIZE; private boolean showRowId = DEF_SHOW_ROWID; private String dbPrefix = DEF_DB_PREFIX; - private KVMode kvMode = DEF_KV_MODE; + private String kvMode = DEF_KV_MODE; private int rawClientConcurrency = DEF_RAW_CLIENT_CONCURRENCY; - public enum KVMode { - TXN, - RAW - } - public static TiConfiguration createDefault(String pdAddrsStr) { Objects.requireNonNull(pdAddrsStr, "pdAddrsStr is null"); TiConfiguration conf = new TiConfiguration(); @@ -82,7 +77,7 @@ public static TiConfiguration createRawDefault(String pdAddrsStr) { Objects.requireNonNull(pdAddrsStr, "pdAddrsStr is null"); TiConfiguration conf = new TiConfiguration(); conf.pdAddrs = strToHostAndPort(pdAddrsStr); - conf.kvMode = KVMode.RAW; + conf.kvMode = "RAW"; return conf; } @@ -234,12 +229,12 @@ public void setDBPrefix(String dbPrefix) { this.dbPrefix = dbPrefix; } - public KVMode getKvMode() { + public String getKvMode() { return kvMode; } public void setKvMode(String kvMode) { - this.kvMode = KVMode.valueOf(kvMode); + this.kvMode = kvMode; } public int getRawClientConcurrency() { diff --git a/src/main/java/org/tikv/common/TiSession.java b/src/main/java/org/tikv/common/TiSession.java index 9e06d41989f..51283fadd97 100644 --- a/src/main/java/org/tikv/common/TiSession.java +++ b/src/main/java/org/tikv/common/TiSession.java @@ -15,57 +15,84 @@ package org.tikv.common; -import com.google.common.annotations.VisibleForTesting; +import com.google.common.net.HostAndPort; +import io.grpc.ManagedChannel; +import io.grpc.ManagedChannelBuilder; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.TimeUnit; import org.tikv.common.region.RegionManager; -import org.tikv.common.region.RegionStoreClient.RegionStoreClientBuilder; -import org.tikv.common.util.ChannelFactory; -import org.tikv.raw.RawKVClient; -/** - * TiSession is the holder for PD Client, Store pdClient and PD Cache All sessions share common - * region store connection pool but separated PD conn and cache for better concurrency TiSession is - * thread-safe but it's also recommended to have multiple session avoiding lock contention - */ public class TiSession implements AutoCloseable { + private static final Map<String, ManagedChannel> connPool = new HashMap<>(); private final TiConfiguration conf; - private final PDClient pdClient; - private final ChannelFactory channelFactory; + // the objects below are either heavy to create or open connections (to PD), so they are loaded lazily + private volatile RegionManager regionManager; + private volatile PDClient client; public TiSession(TiConfiguration conf) { this.conf = conf; - this.channelFactory = new ChannelFactory(conf.getMaxFrameSize()); - this.pdClient = PDClient.createRaw(conf, channelFactory); } public TiConfiguration getConf() { return conf; } - public static TiSession create(TiConfiguration conf) { - return new TiSession(conf); + public PDClient getPDClient() { + PDClient res = client; + if (res == null) {
synchronized (this) { + if (client == null) { + client = PDClient.createRaw(this); + } + res = client; + } + } + return res; } - public RawKVClient createRawClient() { - // Create new Region Manager avoiding thread contentions - RegionManager regionMgr = new RegionManager(pdClient); - RegionStoreClientBuilder builder = - new RegionStoreClientBuilder(conf, channelFactory, regionMgr); - return new RawKVClient(conf, builder); + public RegionManager getRegionManager() { + RegionManager res = regionManager; + if (res == null) { + synchronized (this) { + if (regionManager == null) { + regionManager = new RegionManager(getPDClient()); + } + res = regionManager; + } + } + return res; } - @VisibleForTesting - public PDClient getPDClient() { - return pdClient; + public synchronized ManagedChannel getChannel(String addressStr) { + ManagedChannel channel = connPool.get(addressStr); + if (channel == null) { + HostAndPort address; + try { + address = HostAndPort.fromString(addressStr); + } catch (Exception e) { + throw new IllegalArgumentException("failed to form address"); + } + + // Channel should be lazy without actual connection until first call + // So a coarse grain lock is ok here + channel = + ManagedChannelBuilder.forAddress(address.getHostText(), address.getPort()) + .maxInboundMessageSize(conf.getMaxFrameSize()) + .usePlaintext(true) + .idleTimeout(60, TimeUnit.SECONDS) + .build(); + connPool.put(addressStr, channel); + } + return channel; } - @VisibleForTesting - public ChannelFactory getChannelFactory() { - return channelFactory; + public static TiSession create(TiConfiguration conf) { + return new TiSession(conf); } @Override public void close() { - pdClient.close(); - channelFactory.close(); + getPDClient().close(); } } diff --git a/src/main/java/org/tikv/common/meta/TiTimestamp.java b/src/main/java/org/tikv/common/meta/TiTimestamp.java index ce91214f6be..36b02d76c32 100644 --- a/src/main/java/org/tikv/common/meta/TiTimestamp.java +++ b/src/main/java/org/tikv/common/meta/TiTimestamp.java @@ -40,4 +40,8 @@ public long getPhysical() { public long getLogical() { return this.logical; } + + public static long extractPhysical(long tso) { + return (tso >> PHYSICAL_SHIFT_BITS); + } } diff --git a/src/main/java/org/tikv/common/operation/iterator/ConcreteScanIterator.java b/src/main/java/org/tikv/common/operation/iterator/ConcreteScanIterator.java index 3c95b5935a8..d430b26324c 100644 --- a/src/main/java/org/tikv/common/operation/iterator/ConcreteScanIterator.java +++ b/src/main/java/org/tikv/common/operation/iterator/ConcreteScanIterator.java @@ -16,27 +16,28 @@ package org.tikv.common.operation.iterator; import com.google.protobuf.ByteString; -import org.tikv.common.TiConfiguration; +import org.tikv.common.TiSession; import org.tikv.common.region.RegionStoreClient; -import org.tikv.common.region.RegionStoreClient.RegionStoreClientBuilder; import org.tikv.common.region.TiRegion; import org.tikv.common.util.BackOffer; import org.tikv.common.util.ConcreteBackOffer; +import org.tikv.common.util.Pair; +import org.tikv.kvproto.Metapb; public class ConcreteScanIterator extends ScanIterator { private final long version; - public ConcreteScanIterator( - TiConfiguration conf, RegionStoreClientBuilder builder, ByteString startKey, long version) { + public ConcreteScanIterator(ByteString startKey, TiSession session, long version) { // Passing endKey as ByteString.EMPTY means that endKey is +INF by default, - super(conf, builder, startKey, ByteString.EMPTY, Integer.MAX_VALUE); + super(startKey,
ByteString.EMPTY, Integer.MAX_VALUE, session); this.version = version; } TiRegion loadCurrentRegionToCache() throws Exception { - TiRegion region; - try (RegionStoreClient client = builder.build(startKey)) { - region = client.getRegion(); + Pair<TiRegion, Metapb.Store> pair = regionCache.getRegionStorePairByKey(startKey); + TiRegion region = pair.first; + Metapb.Store store = pair.second; + try (RegionStoreClient client = RegionStoreClient.create(region, store, session)) { BackOffer backOffer = ConcreteBackOffer.newScannerNextMaxBackOff(); currentCache = client.scan(backOffer, startKey, version); return region; diff --git a/src/main/java/org/tikv/common/operation/iterator/RawScanIterator.java b/src/main/java/org/tikv/common/operation/iterator/RawScanIterator.java index 81d60694d04..6af32be6b08 100644 --- a/src/main/java/org/tikv/common/operation/iterator/RawScanIterator.java +++ b/src/main/java/org/tikv/common/operation/iterator/RawScanIterator.java @@ -16,31 +16,28 @@ package org.tikv.common.operation.iterator; import com.google.protobuf.ByteString; -import org.tikv.common.TiConfiguration; +import org.tikv.common.TiSession; import org.tikv.common.exception.TiKVException; import org.tikv.common.key.Key; import org.tikv.common.region.RegionStoreClient; -import org.tikv.common.region.RegionStoreClient.RegionStoreClientBuilder; import org.tikv.common.region.TiRegion; import org.tikv.common.util.BackOffFunction; import org.tikv.common.util.BackOffer; import org.tikv.common.util.ConcreteBackOffer; +import org.tikv.common.util.Pair; +import org.tikv.kvproto.Metapb; public class RawScanIterator extends ScanIterator { - public RawScanIterator( - TiConfiguration conf, - RegionStoreClientBuilder builder, - ByteString startKey, - ByteString endKey, - int limit) { - super(conf, builder, startKey, endKey, limit); + public RawScanIterator(ByteString startKey, ByteString endKey, int limit, TiSession session) { + super(startKey, endKey, limit, session); } TiRegion loadCurrentRegionToCache() throws Exception { - TiRegion region; - try (RegionStoreClient client = builder.build(startKey)) { - region = client.getRegion(); + Pair<TiRegion, Metapb.Store> pair = regionCache.getRegionStorePairByKey(startKey); + TiRegion region = pair.first; + Metapb.Store store = pair.second; + try (RegionStoreClient client = RegionStoreClient.create(region, store, session)) { BackOffer backOffer = ConcreteBackOffer.newScannerNextMaxBackOff(); if (limit <= 0) { currentCache = null; diff --git a/src/main/java/org/tikv/common/operation/iterator/ScanIterator.java b/src/main/java/org/tikv/common/operation/iterator/ScanIterator.java index 9814a6a0626..1418392be39 100644 --- a/src/main/java/org/tikv/common/operation/iterator/ScanIterator.java +++ b/src/main/java/org/tikv/common/operation/iterator/ScanIterator.java @@ -20,16 +20,17 @@ import com.google.protobuf.ByteString; import java.util.Iterator; import java.util.List; -import org.tikv.common.TiConfiguration; +import org.tikv.common.TiSession; import org.tikv.common.exception.TiClientInternalException; import org.tikv.common.key.Key; -import org.tikv.common.region.RegionStoreClient.RegionStoreClientBuilder; +import org.tikv.common.region.RegionManager; import org.tikv.common.region.TiRegion; import org.tikv.kvproto.Kvrpcpb; public abstract class ScanIterator implements Iterator<Kvrpcpb.KvPair> { - protected final TiConfiguration conf; - protected final RegionStoreClientBuilder builder; + protected final TiSession session; + protected final RegionManager regionCache; + protected List<Kvrpcpb.KvPair> currentCache; protected ByteString startKey; protected int index
= -1; @@ -40,12 +41,7 @@ public abstract class ScanIterator implements Iterator { protected boolean hasEndKey; protected boolean lastBatch = false; - ScanIterator( - TiConfiguration conf, - RegionStoreClientBuilder builder, - ByteString startKey, - ByteString endKey, - int limit) { + ScanIterator(ByteString startKey, ByteString endKey, int limit, TiSession session) { this.startKey = requireNonNull(startKey, "start key is null"); if (startKey.isEmpty()) { throw new IllegalArgumentException("start key cannot be empty"); @@ -53,8 +49,8 @@ public abstract class ScanIterator implements Iterator { this.endKey = Key.toRawKey(requireNonNull(endKey, "end key is null")); this.hasEndKey = !endKey.equals(ByteString.EMPTY); this.limit = limit; - this.conf = conf; - this.builder = builder; + this.session = session; + this.regionCache = session.getRegionManager(); } abstract TiRegion loadCurrentRegionToCache() throws Exception; @@ -84,7 +80,7 @@ boolean cacheLoadFails() { // Session should be single-threaded itself // so that we don't worry about conf change in the middle // of a transaction. Otherwise below code might lose data - if (currentCache.size() < conf.getScanBatchSize()) { + if (currentCache.size() < session.getConf().getScanBatchSize()) { startKey = curRegionEndKey; } else { // Start new scan from exact next key in current region diff --git a/src/main/java/org/tikv/common/region/RegionManager.java b/src/main/java/org/tikv/common/region/RegionManager.java index a69da616115..8954d231171 100644 --- a/src/main/java/org/tikv/common/region/RegionManager.java +++ b/src/main/java/org/tikv/common/region/RegionManager.java @@ -29,6 +29,7 @@ import java.util.Map; import org.apache.log4j.Logger; import org.tikv.common.ReadOnlyPDClient; +import org.tikv.common.TiSession; import org.tikv.common.exception.GrpcException; import org.tikv.common.exception.TiClientInternalException; import org.tikv.common.key.Key; @@ -41,11 +42,13 @@ public class RegionManager { private static final Logger logger = Logger.getLogger(RegionManager.class); private RegionCache cache; + private final ReadOnlyPDClient pdClient; // To avoid double retrieval, we used the async version of grpc // When rpc not returned, instead of call again, it wait for previous one done public RegionManager(ReadOnlyPDClient pdClient) { this.cache = new RegionCache(pdClient); + this.pdClient = pdClient; } public static class RegionCache { @@ -164,6 +167,10 @@ public synchronized Store getStoreById(long id) { } } + public TiSession getSession() { + return pdClient.getSession(); + } + public TiRegion getRegionByKey(ByteString key) { return cache.getRegionByKey(key); } diff --git a/src/main/java/org/tikv/common/region/RegionStoreClient.java b/src/main/java/org/tikv/common/region/RegionStoreClient.java index 13d735c506a..c6223270f8d 100644 --- a/src/main/java/org/tikv/common/region/RegionStoreClient.java +++ b/src/main/java/org/tikv/common/region/RegionStoreClient.java @@ -28,35 +28,16 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; -import java.util.Objects; import java.util.function.Supplier; import org.apache.log4j.Logger; import org.tikv.common.AbstractGRPCClient; -import org.tikv.common.TiConfiguration; +import org.tikv.common.TiSession; import org.tikv.common.exception.KeyException; import org.tikv.common.exception.RegionException; import org.tikv.common.exception.TiClientInternalException; import org.tikv.common.operation.KVErrorHandler; import org.tikv.common.util.BackOffer; -import org.tikv.common.util.ChannelFactory; 
-import org.tikv.common.util.Pair; -import org.tikv.kvproto.Kvrpcpb.BatchGetRequest; -import org.tikv.kvproto.Kvrpcpb.BatchGetResponse; -import org.tikv.kvproto.Kvrpcpb.GetRequest; -import org.tikv.kvproto.Kvrpcpb.GetResponse; -import org.tikv.kvproto.Kvrpcpb.KvPair; -import org.tikv.kvproto.Kvrpcpb.RawBatchPutRequest; -import org.tikv.kvproto.Kvrpcpb.RawBatchPutResponse; -import org.tikv.kvproto.Kvrpcpb.RawDeleteRequest; -import org.tikv.kvproto.Kvrpcpb.RawDeleteResponse; -import org.tikv.kvproto.Kvrpcpb.RawGetRequest; -import org.tikv.kvproto.Kvrpcpb.RawGetResponse; -import org.tikv.kvproto.Kvrpcpb.RawPutRequest; -import org.tikv.kvproto.Kvrpcpb.RawPutResponse; -import org.tikv.kvproto.Kvrpcpb.RawScanRequest; -import org.tikv.kvproto.Kvrpcpb.RawScanResponse; -import org.tikv.kvproto.Kvrpcpb.ScanRequest; -import org.tikv.kvproto.Kvrpcpb.ScanResponse; +import org.tikv.kvproto.Kvrpcpb.*; import org.tikv.kvproto.Metapb.Store; import org.tikv.kvproto.TikvGrpc; import org.tikv.kvproto.TikvGrpc.TikvBlockingStub; @@ -71,17 +52,19 @@ public class RegionStoreClient extends AbstractGRPCClient factory = () -> GetRequest.newBuilder() @@ -102,9 +85,6 @@ public ByteString get(BackOffer backOffer, ByteString key, long version) { if (getHelper(backOffer, resp)) { return resp.getValue(); } - - // we should refresh region - region = regionManager.getRegionByKey(key); } } @@ -427,71 +407,32 @@ private List rawScanHelper(RawScanResponse resp) { return resp.getKvsList(); } - public static class RegionStoreClientBuilder { - private final TiConfiguration conf; - private final ChannelFactory channelFactory; - private final RegionManager regionManager; - - public RegionStoreClientBuilder( - TiConfiguration conf, ChannelFactory channelFactory, RegionManager regionManager) { - Objects.requireNonNull(conf, "conf is null"); - Objects.requireNonNull(channelFactory, "channelFactory is null"); - Objects.requireNonNull(regionManager, "regionManager is null"); - this.conf = conf; - this.channelFactory = channelFactory; - this.regionManager = regionManager; - } - - public RegionStoreClient build(TiRegion region, Store store) { - Objects.requireNonNull(region, "region is null"); - Objects.requireNonNull(store, "store is null"); - - String addressStr = store.getAddress(); - if (logger.isDebugEnabled()) { - logger.debug(String.format("Create region store client on address %s", addressStr)); - } - ManagedChannel channel = channelFactory.getChannel(addressStr); - - TikvBlockingStub blockingStub = TikvGrpc.newBlockingStub(channel); - TikvStub asyncStub = TikvGrpc.newStub(channel); - - return new RegionStoreClient( - conf, region, channelFactory, blockingStub, asyncStub, regionManager); - } - - public RegionStoreClient build(ByteString key) { - Pair pair = regionManager.getRegionStorePairByKey(key); - return build(pair.first, pair.second); + public static RegionStoreClient create(TiRegion region, Store store, TiSession session) { + RegionStoreClient client; + String addressStr = store.getAddress(); + if (logger.isDebugEnabled()) { + logger.debug(String.format("Create region store client on address %s", addressStr)); } + ManagedChannel channel = session.getChannel(addressStr); - public RegionStoreClient build(TiRegion region) { - Store store = regionManager.getStoreById(region.getLeader().getStoreId()); - return build(region, store); - } + TikvBlockingStub blockingStub = TikvGrpc.newBlockingStub(channel); - public RegionManager getRegionManager() { - return regionManager; - } + TikvStub asyncStub = TikvGrpc.newStub(channel); + 
client = new RegionStoreClient(region, session, blockingStub, asyncStub); + return client; } private RegionStoreClient( - TiConfiguration conf, - TiRegion region, - ChannelFactory channelFactory, - TikvBlockingStub blockingStub, - TikvStub asyncStub, - RegionManager regionManager) { - super(conf, channelFactory); + TiRegion region, TiSession session, TikvBlockingStub blockingStub, TikvStub asyncStub) { + super(session); checkNotNull(region, "Region is empty"); checkNotNull(region.getLeader(), "Leader Peer is null"); checkArgument(region.getLeader() != null, "Leader Peer is null"); - this.regionManager = regionManager; + this.regionManager = session.getRegionManager(); this.region = region; this.blockingStub = blockingStub; this.asyncStub = asyncStub; - this.lockResolverClient = - new LockResolverClient( - conf, this.blockingStub, this.asyncStub, channelFactory, regionManager); + this.lockResolverClient = new LockResolverClient(session, this.blockingStub, this.asyncStub); } @Override @@ -527,7 +468,7 @@ public boolean onNotLeader(Store newStore) { } region = cachedRegion; String addressStr = regionManager.getStoreById(region.getLeader().getStoreId()).getAddress(); - ManagedChannel channel = channelFactory.getChannel(addressStr); + ManagedChannel channel = getSession().getChannel(addressStr); blockingStub = TikvGrpc.newBlockingStub(channel); asyncStub = TikvGrpc.newStub(channel); return true; @@ -536,7 +477,7 @@ public boolean onNotLeader(Store newStore) { @Override public void onStoreNotMatch(Store store) { String addressStr = store.getAddress(); - ManagedChannel channel = channelFactory.getChannel(addressStr); + ManagedChannel channel = getSession().getChannel(addressStr); blockingStub = TikvGrpc.newBlockingStub(channel); asyncStub = TikvGrpc.newStub(channel); if (logger.isDebugEnabled() && region.getLeader().getStoreId() != store.getId()) { diff --git a/src/main/java/org/tikv/common/region/TiRegion.java b/src/main/java/org/tikv/common/region/TiRegion.java index 26e7509ba48..d5f0ec5e9e1 100644 --- a/src/main/java/org/tikv/common/region/TiRegion.java +++ b/src/main/java/org/tikv/common/region/TiRegion.java @@ -21,7 +21,6 @@ import java.io.Serializable; import java.util.List; import java.util.Objects; -import org.tikv.common.TiConfiguration.KVMode; import org.tikv.common.codec.Codec.BytesCodec; import org.tikv.common.codec.CodecDataInput; import org.tikv.common.codec.KeyUtils; @@ -45,9 +44,9 @@ public TiRegion( Peer peer, IsolationLevel isolationLevel, Kvrpcpb.CommandPri commandPri, - KVMode kvMode) { + String kvMode) { Objects.requireNonNull(meta, "meta is null"); - this.meta = decodeRegion(meta, kvMode == KVMode.RAW); + this.meta = decodeRegion(meta, kvMode.equalsIgnoreCase("RAW")); if (peer == null || peer.getId() == 0) { if (meta.getPeersCount() == 0) { throw new TiClientInternalException("Empty peer list for region " + meta.getId()); diff --git a/src/main/java/org/tikv/common/region/TxnRegionStoreClient.java b/src/main/java/org/tikv/common/region/TxnRegionStoreClient.java new file mode 100644 index 00000000000..bac4d6dab88 --- /dev/null +++ b/src/main/java/org/tikv/common/region/TxnRegionStoreClient.java @@ -0,0 +1,609 @@ +package org.tikv.common.region; + +import com.google.common.annotations.VisibleForTesting; +import com.google.protobuf.ByteString; +import io.grpc.ManagedChannel; +import org.apache.log4j.Logger; +import org.tikv.common.AbstractGRPCClient; +import org.tikv.common.TiSession; +import org.tikv.common.exception.KeyException; +import 
org.tikv.common.exception.RegionException; +import org.tikv.common.exception.TiClientInternalException; +import org.tikv.common.operation.KVErrorHandler; +import org.tikv.common.util.BackOffFunction; +import org.tikv.common.util.BackOffer; +import org.tikv.kvproto.Kvrpcpb; +import org.tikv.kvproto.Metapb; +import org.tikv.kvproto.TikvGrpc; +import org.tikv.txn.Lock; +import org.tikv.txn.LockResolverClient; +import org.tikv.kvproto.Kvrpcpb.*; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.function.Supplier; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.tikv.common.util.BackOffFunction.BackOffFuncType.BoRegionMiss; +import static org.tikv.common.util.BackOffFunction.BackOffFuncType.BoTxnLockFast; + +/** + * RegionStore itself is not thread-safe + * APIs for Transaction KV Scan/Put/Get/Delete + */ +public class TxnRegionStoreClient extends AbstractGRPCClient + implements RegionErrorReceiver { + private static final Logger logger = Logger.getLogger(TxnRegionStoreClient.class); + + private TiRegion region; + private final RegionManager regionManager; + @VisibleForTesting + private final LockResolverClient lockResolverClient; + private TikvGrpc.TikvBlockingStub blockingStub; + private TikvGrpc.TikvStub asyncStub; + + private TxnRegionStoreClient( + TiRegion region, TiSession session, TikvGrpc.TikvBlockingStub blockingStub, TikvGrpc.TikvStub asyncStub) { + super(session); + checkNotNull(region, "Region is empty"); + checkNotNull(region.getLeader(), "Leader Peer is null"); + checkArgument(region.getLeader() != null, "Leader Peer is null"); + this.regionManager = session.getRegionManager(); + this.region = region; + this.blockingStub = blockingStub; + this.asyncStub = asyncStub; + this.lockResolverClient = new LockResolverClient(session, this.blockingStub, this.asyncStub); + } + + public static TxnRegionStoreClient create(TiRegion region, Metapb.Store store, TiSession session) { + TxnRegionStoreClient client; + String addressStr = store.getAddress(); + if (logger.isDebugEnabled()) { + logger.debug(String.format("Create region store client on address %s", addressStr)); + } + ManagedChannel channel = session.getChannel(addressStr); + + TikvGrpc.TikvBlockingStub blockingStub = TikvGrpc.newBlockingStub(channel); + + TikvGrpc.TikvStub asyncStub = TikvGrpc.newStub(channel); + client = new TxnRegionStoreClient(region, session, blockingStub, asyncStub); + return client; + } + + @Override + protected TikvGrpc.TikvBlockingStub getBlockingStub() { + return blockingStub.withDeadlineAfter(getConf().getTimeout(), getConf().getTimeoutUnit()); + } + + @Override + protected TikvGrpc.TikvStub getAsyncStub() { + return asyncStub.withDeadlineAfter(getConf().getTimeout(), getConf().getTimeoutUnit()); + } + + @Override + public void close() throws Exception { + + } + + @Override + public boolean onNotLeader(Metapb.Store newStore) { + if (logger.isDebugEnabled()) { + logger.debug(region + ", new leader = " + newStore.getId()); + } + TiRegion cachedRegion = regionManager.getRegionById(region.getId()); + // When switch leader fails or the region changed its key range, + // it would be necessary to re-split task's key range for new region. 
+ if (!region.getStartKey().equals(cachedRegion.getStartKey()) + || !region.getEndKey().equals(cachedRegion.getEndKey())) { + return false; + } + region = cachedRegion; + String addressStr = regionManager.getStoreById(region.getLeader().getStoreId()).getAddress(); + ManagedChannel channel = getSession().getChannel(addressStr); + blockingStub = TikvGrpc.newBlockingStub(channel); + asyncStub = TikvGrpc.newStub(channel); + return true; + } + + @Override + public void onStoreNotMatch(Metapb.Store store) { + String addressStr = store.getAddress(); + ManagedChannel channel = getSession().getChannel(addressStr); + blockingStub = TikvGrpc.newBlockingStub(channel); + asyncStub = TikvGrpc.newStub(channel); + if (logger.isDebugEnabled() && region.getLeader().getStoreId() != store.getId()) { + logger.debug( + "store_not_match may occur? " + + region + + ", original store = " + + store.getId() + + " address = " + + addressStr); + } + } + + // APIs for Transaction KV Scan/Put/Get/Delete + public ByteString get(BackOffer backOffer, ByteString key, long version) { + while (true) { + // we should refresh region + region = regionManager.getRegionByKey(key); + + Supplier factory = + () -> + GetRequest.newBuilder() + .setContext(region.getContext()) + .setKey(key) + .setVersion(version) + .build(); + + KVErrorHandler handler = + new KVErrorHandler<>( + regionManager, + this, + region, + resp -> resp.hasRegionError() ? resp.getRegionError() : null); + + + GetResponse resp = callWithRetry(backOffer, TikvGrpc.METHOD_KV_GET, factory, handler); + + if (getHelper(backOffer, resp)) { + return resp.getValue(); + } + } + } + + private boolean getHelper(BackOffer backOffer, GetResponse resp) { + if (resp == null) { + this.regionManager.onRequestFail(region); + throw new TiClientInternalException("GetResponse failed without a cause"); + } + + if (resp.hasRegionError()) { + backOffer.doBackOff(BoRegionMiss, new RegionException(resp.getRegionError())); + return false; + } + + if (resp.hasError()) { + if (resp.getError().hasLocked()) { + Lock lock = new Lock(resp.getError().getLocked()); + boolean ok = + lockResolverClient.resolveLocks(backOffer, new ArrayList<>(Arrays.asList(lock))); + if (!ok) { + // if not resolve all locks, we wait and retry + backOffer.doBackOff( + BoTxnLockFast, new KeyException((resp.getError().getLocked().toString()))); + } + return false; + } else { + // retry or abort + // this should trigger Spark to retry the txn + throw new KeyException(resp.getError()); + } + } + return true; + } + + + + public List batchGet(BackOffer backOffer, Iterable keys, long version) { + while(true) { + Supplier request = + () -> + BatchGetRequest.newBuilder() + .setContext(region.getContext()) + .addAllKeys(keys) + .setVersion(version) + .build(); + KVErrorHandler handler = + new KVErrorHandler<>( + regionManager, + this, + region, + resp -> resp.hasRegionError() ? 
resp.getRegionError() : null); + BatchGetResponse resp = + callWithRetry(backOffer, TikvGrpc.METHOD_KV_BATCH_GET, request, handler); + if(batchGetHelper(backOffer, resp)) { + return resp.getPairsList(); + } + } + } + + + private boolean batchGetHelper(BackOffer bo, BatchGetResponse resp) { + List locks = new ArrayList<>(); + + for (KvPair pair : resp.getPairsList()) { + if (pair.hasError()) { + if (pair.getError().hasLocked()) { + Lock lock = new Lock(pair.getError().getLocked()); + locks.add(lock); + } else { + throw new KeyException(pair.getError()); + } + } + } + + if (!locks.isEmpty()) { + boolean ok = lockResolverClient.resolveLocks(bo, locks); + if (!ok) { + // if not resolve all locks, we wait and retry + bo.doBackOff(BoTxnLockFast, new KeyException((resp.getPairsList().get(0).getError()))); + } + + + return false; + } + + if (resp.hasRegionError()) { + + throw new RegionException(resp.getRegionError()); + } + return true; + } + + + public void deleteRange(BackOffer backOffer, ByteString startKey, ByteString endKey) { + while(true) { + Supplier factory = + () -> DeleteRangeRequest.newBuilder().setContext(region.getContext()).setStartKey(startKey).setEndKey(endKey).build(); + KVErrorHandler handler = new KVErrorHandler<>( + regionManager, + this, + region, + resp -> resp.hasRegionError() ? resp.getRegionError() : null); + DeleteRangeResponse resp = callWithRetry(backOffer, TikvGrpc.METHOD_KV_DELETE_RANGE, factory, handler); + if(deleteHelper(backOffer, resp)){ + break; + } + } + } + + private boolean deleteHelper(BackOffer bo, DeleteRangeResponse resp) { + if (resp == null){ + this.regionManager.onRequestFail(region); + throw new TiClientInternalException("DeleteRangeResponse failed without a cause"); + } + if(resp.hasRegionError()){ + bo.doBackOff(BoRegionMiss, new RegionException(resp.getRegionError())); + return false; + } + String error = resp.getError(); + if(error != null && !error.isEmpty()){ + throw new KeyException(resp.getError()); + } + return true; + } + + public List scan(BackOffer backOffer, ByteString startKey, long version) { + return scan(backOffer, startKey, version, false); + } + + public List scan( + BackOffer backOffer, ByteString startKey, long version, boolean keyOnly) { + Supplier request = + () -> + ScanRequest.newBuilder() + .setContext(region.getContext()) + .setStartKey(startKey) + .setVersion(version) + .setKeyOnly(keyOnly) + .setLimit(getConf().getScanBatchSize()) + .build(); + + KVErrorHandler handler = + new KVErrorHandler<>( + regionManager, + this, + region, + resp -> resp.hasRegionError() ? 
resp.getRegionError() : null); + ScanResponse resp = callWithRetry(backOffer, TikvGrpc.METHOD_KV_SCAN, request, handler); + return scanHelper(resp, backOffer); + } + + + + private List scanHelper(ScanResponse resp, BackOffer bo) { + if (resp == null) { + this.regionManager.onRequestFail(region); + throw new TiClientInternalException("ScanResponse failed without a cause"); + } + + List locks = new ArrayList<>(); + + for (KvPair pair : resp.getPairsList()) { + if (pair.hasError()) { + if (pair.getError().hasLocked()) { + Lock lock = new Lock(pair.getError().getLocked()); + locks.add(lock); + } else { + throw new KeyException(pair.getError()); + } + } + } + + if (!locks.isEmpty()) { + boolean ok = lockResolverClient.resolveLocks(bo, locks); + if (!ok) { + // if not resolve all locks, we wait and retry + bo.doBackOff(BoTxnLockFast, new KeyException((resp.getPairsList().get(0).getError()))); + } + + + } + if (resp.hasRegionError()) { + throw new RegionException(resp.getRegionError()); + } + return resp.getPairsList(); + } + + public void prewrite(BackOffer bo, ByteString primaryLock, Iterable mutations, long startVersion, long ttl, boolean skipConstraintCheck){ + while(true) { + Supplier factory = + () -> PrewriteRequest.newBuilder(). + setContext(region.getContext()). + setStartVersion(startVersion). + setPrimaryLock(primaryLock). + addAllMutations(mutations). + setLockTtl(ttl). + setSkipConstraintCheck(skipConstraintCheck). + build(); + KVErrorHandler handler = new KVErrorHandler<>( + regionManager, + this, + region, + resp -> resp.hasRegionError() ? resp.getRegionError() : null); + PrewriteResponse resp = callWithRetry(bo, TikvGrpc.METHOD_KV_PREWRITE, factory, handler); + if (prewriteHelper(bo, resp)) { + break; + } + } + } + + private boolean prewriteHelper(BackOffer bo, PrewriteResponse resp) { + if(resp == null){ + this.regionManager.onRequestFail(region); + throw new TiClientInternalException("PrewriteResponse failed without a cause"); + } + if (resp.hasRegionError()) { + bo.doBackOff(BoRegionMiss, new RegionException(resp.getRegionError())); + return false; + } + for(KeyError err : resp.getErrorsList()){ + if(err.hasLocked()){ + Lock lock = new Lock(err.getLocked()); + boolean ok = lockResolverClient.resolveLocks(bo, new ArrayList<>(Arrays.asList(lock))); + if(!ok){ + bo.doBackOff(BoTxnLockFast, new KeyException((err.getLocked().toString()))); + } + return false; + } + else{ + throw new KeyException(err.toString()); + } + } + return true; + } + + public void prewrite(BackOffer backOffer, ByteString primary, Iterable mutations, long startTs, long lockTTL) { + this.prewrite(backOffer, primary, mutations, startTs, lockTTL, false); + } + + public void commit(BackOffer backOffer, Iterable keys, long startVersion, long commitVersion) { + while(true) { + Supplier factory = + () -> CommitRequest.newBuilder() + .setStartVersion(startVersion) + .setCommitVersion(commitVersion) + .addAllKeys(keys) + .setContext(region.getContext()).build(); + KVErrorHandler handler = + new KVErrorHandler<>( + regionManager, + this, + region, + resp -> resp.hasRegionError() ? 
resp.getRegionError() : null); + CommitResponse resp = callWithRetry(backOffer, TikvGrpc.METHOD_KV_COMMIT, factory, handler); + if(commitHelper(backOffer, resp)){ + break; + } + } + } + + private boolean commitHelper(BackOffer bo, CommitResponse resp) { + if(resp == null){ + this.regionManager.onRequestFail(region); + throw new TiClientInternalException("CommitResponse failed without a cause"); + } + if(resp.hasRegionError()){ + bo.doBackOff(BoRegionMiss, new RegionException(resp.getRegionError())); + return false; + } + //if hasLock, need to resolveLocks and retry? + if(resp.hasError()) { + if (resp.getError().hasLocked()) { + Lock lock = new Lock(resp.getError().getLocked()); + boolean ok = lockResolverClient.resolveLocks(bo, new ArrayList<>(Arrays.asList(lock))); + if (!ok) { + bo.doBackOff(BoTxnLockFast, new KeyException((resp.getError().getLocked().toString()))); + } + return false; + } else { + throw new KeyException(resp.getError()); + } + } + return true; + } + + public long cleanup(BackOffer backOffer, ByteString key, long startTs) { + while(true) { + Supplier factory = + () -> CleanupRequest.newBuilder() + .setContext(region.getContext()) + .setKey(key) + .setStartVersion(startTs) + .build(); + KVErrorHandler handler = new KVErrorHandler<>( + regionManager, + this, + region, + resp -> resp.hasRegionError() ? resp.getRegionError() : null); + CleanupResponse resp = callWithRetry(backOffer, TikvGrpc.METHOD_KV_CLEANUP, factory, handler); + if(cleanUpHelper(backOffer, resp)) { + return resp.getCommitVersion(); + } + } + } + + private boolean cleanUpHelper(BackOffer bo, CleanupResponse resp) { + if(resp == null){ + this.regionManager.onRequestFail(region); + throw new TiClientInternalException("CleanupResponse failed without a cause"); + } + if(resp.hasRegionError()){ + bo.doBackOff(BoRegionMiss, new RegionException(resp.getRegionError())); + return false; + } + if(resp.hasError()) { + if (resp.getError().hasLocked()) { + Lock lock = new Lock(resp.getError().getLocked()); + boolean ok = lockResolverClient.resolveLocks(bo, new ArrayList<>(Arrays.asList(lock))); + if (!ok) { + bo.doBackOff(BoTxnLockFast, new KeyException((resp.getError().getLocked().toString()))); + } + return false; + } else { + throw new KeyException(resp.getError()); + } + } + return true; + } + + public void batchRollback(BackOffer backOffer, Iterable keys, long startVersion){ + while(true) { + Supplier factory = + () -> BatchRollbackRequest.newBuilder().setStartVersion(startVersion).setContext(region.getContext()).addAllKeys(keys).build(); + KVErrorHandler handler = + new KVErrorHandler<>( + regionManager, + this, + region, + resp -> resp.hasRegionError() ? 
resp.getRegionError() : null); + BatchRollbackResponse resp = callWithRetry(backOffer, TikvGrpc.METHOD_KV_BATCH_ROLLBACK, factory, handler); + if(batchRollbackHelper(backOffer, resp)){ + break; + } + } + } + + private boolean batchRollbackHelper(BackOffer bo, BatchRollbackResponse resp) { + if(resp == null){ + this.regionManager.onRequestFail(region); + throw new TiClientInternalException("BatchRollbackResponse failed without a cause"); + } + if(resp.hasRegionError()){ + bo.doBackOff(BoRegionMiss, new RegionException(resp.getRegionError())); + return false; + } + if(resp.hasError()) { + if (resp.getError().hasLocked()) { + Lock lock = new Lock(resp.getError().getLocked()); + boolean ok = lockResolverClient.resolveLocks(bo, new ArrayList<>(Arrays.asList(lock))); + if (!ok) { + bo.doBackOff(BoTxnLockFast, new KeyException((resp.getError().getLocked().toString()))); + } + return false; + } else { + throw new KeyException(resp.getError()); + } + } + return true; + } + + public void gc(BackOffer bo, long safePoint){ + while(true) { + Supplier factory = + () -> GCRequest.newBuilder().setSafePoint(safePoint).setContext(region.getContext()).build(); + KVErrorHandler handler = + new KVErrorHandler<>( + regionManager, + this, + region, + resp -> resp.hasRegionError() ? resp.getRegionError() : null); + GCResponse resp = callWithRetry(bo, TikvGrpc.METHOD_KV_GC, factory, handler); + if (gcHelper(bo, resp)) { + break; + } + } + } + + private boolean gcHelper(BackOffer bo, GCResponse resp) { + if(resp == null){ + this.regionManager.onRequestFail(region); + throw new TiClientInternalException("GCResponse failed without a cause"); + } + if(resp.hasRegionError()){ + bo.doBackOff(BoRegionMiss, new RegionException(resp.getRegionError())); + return false; + } + if(resp.hasError()) { + if (resp.getError().hasLocked()) { + Lock lock = new Lock(resp.getError().getLocked()); + boolean ok = lockResolverClient.resolveLocks(bo, new ArrayList<>(Arrays.asList(lock))); + if (!ok) { + bo.doBackOff(BoTxnLockFast, new KeyException((resp.getError().getLocked().toString()))); + } + return false; + } else { + throw new KeyException(resp.getError()); + } + } + return true; + } + + private List scanLock(BackOffer bo, ByteString startkey ,long maxVersion, int limit){ + while(true) { + Supplier factory = + () -> ScanLockRequest.newBuilder().setContext(region.getContext()).setMaxVersion(maxVersion).setStartKey(startkey).setLimit(limit).build(); + KVErrorHandler handler = + new KVErrorHandler<>( + regionManager, + this, + region, + resp -> resp.hasRegionError() ? 
resp.getRegionError() : null); + ScanLockResponse resp = callWithRetry(bo, TikvGrpc.METHOD_KV_SCAN_LOCK, factory, handler); + if (scanLockHelper(bo, resp)) { + return resp.getLocksList(); + } + } + } + + private boolean scanLockHelper(BackOffer bo, ScanLockResponse resp) { + if(resp == null){ + this.regionManager.onRequestFail(region); + throw new TiClientInternalException("ScanLockResponse failed without a cause"); + } + if(resp.hasRegionError()){ + bo.doBackOff(BoRegionMiss, new RegionException(resp.getRegionError())); + return false; + } + if(resp.hasError()) { + if (resp.getError().hasLocked()) { + Lock lock = new Lock(resp.getError().getLocked()); + boolean ok = lockResolverClient.resolveLocks(bo, new ArrayList<>(Arrays.asList(lock))); + if (!ok) { + bo.doBackOff(BoTxnLockFast, new KeyException((resp.getError().getLocked().toString()))); + } + return false; + } else { + throw new KeyException(resp.getError()); + } + } + return true; + } + + public void delete(BackOffer backOffer, ByteString key) { + + } +} diff --git a/src/main/java/org/tikv/common/util/BackOffer.java b/src/main/java/org/tikv/common/util/BackOffer.java index 9820b46d03c..53f43ec29a7 100644 --- a/src/main/java/org/tikv/common/util/BackOffer.java +++ b/src/main/java/org/tikv/common/util/BackOffer.java @@ -44,6 +44,7 @@ enum BackOffStrategy { int GcDeleteRangeMaxBackoff = 100000; int rawkvMaxBackoff = 40000; int splitRegionBackoff = 20000; + int commitMaxBackoff = 3000; /** * doBackOff sleeps a while base on the BackOffType and records the error message. Will stop until diff --git a/src/main/java/org/tikv/common/util/ChannelFactory.java b/src/main/java/org/tikv/common/util/ChannelFactory.java deleted file mode 100644 index c390a59e828..00000000000 --- a/src/main/java/org/tikv/common/util/ChannelFactory.java +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright 2017 PingCAP, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.tikv.common.util; - -import com.google.common.net.HostAndPort; -import io.grpc.ManagedChannel; -import io.grpc.ManagedChannelBuilder; -import java.util.Map; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.TimeUnit; - -public class ChannelFactory implements AutoCloseable { - private final int maxFrameSize; - private final Map<String, ManagedChannel> connPool = new ConcurrentHashMap<>(); - - public ChannelFactory(int maxFrameSize) { - this.maxFrameSize = maxFrameSize; - } - - public ManagedChannel getChannel(String addressStr) { - return connPool.computeIfAbsent( - addressStr, - key -> { - HostAndPort address; - try { - address = HostAndPort.fromString(key); - } catch (Exception e) { - throw new IllegalArgumentException("failed to form address"); - } - // Channel should be lazy without actual connection until first call - // So a coarse grain lock is ok here - return ManagedChannelBuilder.forAddress(address.getHostText(), address.getPort()) - .maxInboundMessageSize(maxFrameSize) - .usePlaintext(true) - .idleTimeout(60, TimeUnit.SECONDS) - .build(); - }); - } - - public void close() { - for (ManagedChannel ch : connPool.values()) { - ch.shutdown(); - } - connPool.clear(); - } -} diff --git a/src/main/java/org/tikv/common/util/RegionStoreUtils.java b/src/main/java/org/tikv/common/util/RegionStoreUtils.java new file mode 100644 index 00000000000..6971d35bc81 --- /dev/null +++ b/src/main/java/org/tikv/common/util/RegionStoreUtils.java @@ -0,0 +1,36 @@ +package org.tikv.common.util; + +import com.google.protobuf.ByteString; +import org.tikv.common.region.RegionManager; +import org.tikv.common.region.TiRegion; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * Utility class for splitting keys by region + */ +public class RegionStoreUtils { + + /** + * Split a list of keys into groups, one group per region + * @param regionManager the region manager used to look up the region for each key + * @param keys the keys to split + * @return a map from each region to the keys it contains + */ + public static Map<TiRegion, List<ByteString>> groupKeysByRegion(final RegionManager regionManager, + List<ByteString> keys) { + Map<TiRegion, List<ByteString>> groups = new HashMap<>(); + TiRegion lastRegion = null; + BackOffer bo = ConcreteBackOffer.newCustomBackOff(BackOffer.tsoMaxBackoff); + for (ByteString key : keys) { + if (lastRegion == null || !lastRegion.contains(key)) { + lastRegion = regionManager.getRegionByKey(key); + } + groups.computeIfAbsent(lastRegion, k -> new ArrayList<>()).add(key); + } + return groups; + } +} diff --git a/src/main/java/org/tikv/raw/RawKVClient.java b/src/main/java/org/tikv/raw/RawKVClient.java index 72401b9042e..930554584b6 100644 --- a/src/main/java/org/tikv/raw/RawKVClient.java +++ b/src/main/java/org/tikv/raw/RawKVClient.java @@ -16,51 +16,57 @@ package org.tikv.raw; import com.google.protobuf.ByteString; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.Objects; -import java.util.Set; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.ExecutorCompletionService; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.TimeoutException; +import java.util.*; +import java.util.concurrent.*; import java.util.stream.Collectors; import org.apache.log4j.Logger; import org.tikv.common.TiConfiguration; +import org.tikv.common.TiSession; import org.tikv.common.exception.TiKVException; import org.tikv.common.operation.iterator.RawScanIterator; +import org.tikv.common.region.RegionManager; import
org.tikv.common.region.RegionStoreClient; -import org.tikv.common.region.RegionStoreClient.RegionStoreClientBuilder; import org.tikv.common.region.TiRegion; import org.tikv.common.util.BackOffFunction; import org.tikv.common.util.BackOffer; import org.tikv.common.util.ConcreteBackOffer; +import org.tikv.common.util.Pair; import org.tikv.kvproto.Kvrpcpb; +import org.tikv.kvproto.Metapb; public class RawKVClient implements AutoCloseable { - private final RegionStoreClientBuilder clientBuilder; - private final TiConfiguration conf; + private static final String DEFAULT_PD_ADDRESS = "127.0.0.1:2379"; + private final TiSession session; + private final RegionManager regionManager; private final ExecutorCompletionService<Object> completionService; private static final Logger logger = Logger.getLogger(RawKVClient.class); private static final int RAW_BATCH_PUT_SIZE = 16 * 1024; - public RawKVClient(TiConfiguration conf, RegionStoreClientBuilder clientBuilder) { - Objects.requireNonNull(conf, "conf is null"); - Objects.requireNonNull(clientBuilder, "clientBuilder is null"); - this.conf = conf; - this.clientBuilder = clientBuilder; - ExecutorService executors = Executors.newFixedThreadPool(conf.getRawClientConcurrency()); - this.completionService = new ExecutorCompletionService<>(executors); + private RawKVClient(String addresses) { + session = TiSession.create(TiConfiguration.createRawDefault(addresses)); + regionManager = session.getRegionManager(); + ExecutorService executors = + Executors.newFixedThreadPool(session.getConf().getRawClientConcurrency()); + completionService = new ExecutorCompletionService<>(executors); + } + + private RawKVClient() { + this(DEFAULT_PD_ADDRESS); + } + + public static RawKVClient create() { + return new RawKVClient(); + } + + public static RawKVClient create(String address) { + return new RawKVClient(address); } @Override - public void close() {} + public void close() { + session.close(); + } /** * Put a raw key-value pair to TiKV * @@ -71,7 +77,8 @@ public void put(ByteString key, ByteString value) { BackOffer backOffer = defaultBackOff(); while (true) { - RegionStoreClient client = clientBuilder.build(key); + Pair<TiRegion, Metapb.Store> pair = regionManager.getRegionStorePairByKey(key); + RegionStoreClient client = RegionStoreClient.create(pair.first, pair.second, session); try { client.rawPut(backOffer, key, value); return; @@ -119,7 +126,8 @@ private void batchPut(BackOffer backOffer, Map<ByteString, ByteString> kvPairs) public ByteString get(ByteString key) { BackOffer backOffer = defaultBackOff(); while (true) { - RegionStoreClient client = clientBuilder.build(key); + Pair<TiRegion, Metapb.Store> pair = regionManager.getRegionStorePairByKey(key); + RegionStoreClient client = RegionStoreClient.create(pair.first, pair.second, session); try { return client.rawGet(defaultBackOff(), key); } catch (final TiKVException e) { @@ -136,7 +144,7 @@ public ByteString get(ByteString key) { * @return list of key-value pairs in range */ public List<Kvrpcpb.KvPair> scan(ByteString startKey, ByteString endKey) { - Iterator<Kvrpcpb.KvPair> iterator = rawScanIterator(conf, clientBuilder, startKey, endKey); + Iterator<Kvrpcpb.KvPair> iterator = rawScanIterator(startKey, endKey); List<Kvrpcpb.KvPair> result = new ArrayList<>(); iterator.forEachRemaining(result::add); return result; @@ -150,7 +158,7 @@ public List<Kvrpcpb.KvPair> scan(ByteString startKey, ByteString endKey) { * @return list of key-value pairs in range */ public List<Kvrpcpb.KvPair> scan(ByteString startKey, int limit) { - Iterator<Kvrpcpb.KvPair> iterator = rawScanIterator(conf, clientBuilder, startKey, limit); + Iterator<Kvrpcpb.KvPair> iterator = rawScanIterator(startKey, limit); List<Kvrpcpb.KvPair> result = new
ArrayList<>(); iterator.forEachRemaining(result::add); return result; @@ -164,7 +172,8 @@ public List scan(ByteString startKey, int limit) { public void delete(ByteString key) { BackOffer backOffer = defaultBackOff(); while (true) { - RegionStoreClient client = clientBuilder.build(key); + Pair pair = regionManager.getRegionStorePairByKey(key); + RegionStoreClient client = RegionStoreClient.create(pair.first, pair.second, session); try { client.rawDelete(defaultBackOff(), key); return; @@ -229,14 +238,14 @@ private Map> groupKeysByRegion(Set keys) TiRegion lastRegion = null; for (ByteString key : keys) { if (lastRegion == null || !lastRegion.contains(key)) { - lastRegion = clientBuilder.getRegionManager().getRegionByKey(key); + lastRegion = regionManager.getRegionByKey(key); } groups.computeIfAbsent(lastRegion, k -> new ArrayList<>()).add(key); } return groups; } - private static Map mapKeysToValues( + static Map mapKeysToValues( List keys, List values) { Map map = new HashMap<>(); for (int i = 0; i < keys.size(); i++) { @@ -255,8 +264,12 @@ private void sendBatchPut(BackOffer backOffer, List batches) { for (Batch batch : batches) { completionService.submit( () -> { - RegionStoreClient client = clientBuilder.build(batch.region); BackOffer singleBatchBackOffer = ConcreteBackOffer.create(backOffer); + RegionStoreClient client = + RegionStoreClient.create( + batch.region, + regionManager.getStoreById(batch.region.getLeader().getStoreId()), + session); List kvPairs = new ArrayList<>(); for (int i = 0; i < batch.keys.size(); i++) { kvPairs.add( @@ -291,17 +304,12 @@ private void sendBatchPut(BackOffer backOffer, List batches) { } } - private Iterator rawScanIterator( - TiConfiguration conf, - RegionStoreClientBuilder builder, - ByteString startKey, - ByteString endKey) { - return new RawScanIterator(conf, builder, startKey, endKey, Integer.MAX_VALUE); + private Iterator rawScanIterator(ByteString startKey, ByteString endKey) { + return new RawScanIterator(startKey, endKey, Integer.MAX_VALUE, session); } - private Iterator rawScanIterator( - TiConfiguration conf, RegionStoreClientBuilder builder, ByteString startKey, int limit) { - return new RawScanIterator(conf, builder, startKey, ByteString.EMPTY, limit); + private Iterator rawScanIterator(ByteString startKey, int limit) { + return new RawScanIterator(startKey, ByteString.EMPTY, limit, session); } private BackOffer defaultBackOff() { diff --git a/src/main/java/org/tikv/txn/ITransaction.java b/src/main/java/org/tikv/txn/ITransaction.java new file mode 100644 index 00000000000..f9525ebcd77 --- /dev/null +++ b/src/main/java/org/tikv/txn/ITransaction.java @@ -0,0 +1,54 @@ +package org.tikv.txn; + +import org.tikv.common.Snapshot; +import org.tikv.common.key.Key; +import org.tikv.common.meta.TiTimestamp; + +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +/** + * definition of Transaction api + */ +public interface ITransaction { + + boolean set(byte[] key, byte[] value); + + byte[] get(byte[] key); + + boolean delete(byte[] key); + + //Iterator iterator(byte[] startKey, byte[] endKey); + + //Iterator iteratorReverse(byte[] startKey); + + /** + * create TwoPhaseCommitter, and call 2pc api + * @return + */ + boolean commit(); + + boolean rollback(); + + boolean lockKeys(Key... 
lockedKeys); + /** + * + * @return returns if the transaction is valid + */ + boolean valid(); + + long getStartTS(); + + long getStartTime(); + + boolean isReadOnly(); + + Snapshot getSnapshot(); + + TxnKVClient getKVClient(); + + Map getStoredKeys(); + + List getLockedKeys(); +} diff --git a/src/main/java/org/tikv/txn/LockResolverClient.java b/src/main/java/org/tikv/txn/LockResolverClient.java index 401cd4693fe..3c67a78eb02 100644 --- a/src/main/java/org/tikv/txn/LockResolverClient.java +++ b/src/main/java/org/tikv/txn/LockResolverClient.java @@ -21,29 +21,20 @@ import com.google.protobuf.ByteString; import io.grpc.ManagedChannel; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.HashSet; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.Queue; -import java.util.Set; +import java.util.*; import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock; import java.util.function.Supplier; import org.apache.log4j.Logger; import org.tikv.common.AbstractGRPCClient; -import org.tikv.common.TiConfiguration; +import org.tikv.common.TiSession; import org.tikv.common.exception.KeyException; import org.tikv.common.exception.RegionException; import org.tikv.common.operation.KVErrorHandler; import org.tikv.common.region.RegionErrorReceiver; -import org.tikv.common.region.RegionManager; import org.tikv.common.region.TiRegion; import org.tikv.common.region.TiRegion.RegionVerID; import org.tikv.common.util.BackOffer; -import org.tikv.common.util.ChannelFactory; import org.tikv.common.util.TsoUtils; import org.tikv.kvproto.Kvrpcpb.CleanupRequest; import org.tikv.kvproto.Kvrpcpb.CleanupResponse; @@ -80,20 +71,13 @@ public class LockResolverClient extends AbstractGRPCClient(); recentResolved = new LinkedList<>(); readWriteLock = new ReentrantReadWriteLock(); this.blockingStub = blockingStub; - this.regionManager = regionManager; this.asyncStub = asyncStub; } @@ -133,7 +117,7 @@ public Long getTxnStatus(BackOffer bo, Long txnID, ByteString primary) { while (true) { // refresh region - region = regionManager.getRegionByKey(primary); + region = session.getRegionManager().getRegionByKey(primary); Supplier factory = () -> @@ -144,7 +128,7 @@ public Long getTxnStatus(BackOffer bo, Long txnID, ByteString primary) { .build(); KVErrorHandler handler = new KVErrorHandler<>( - regionManager, + session.getRegionManager(), this, region, resp -> resp.hasRegionError() ? resp.getRegionError() : null); @@ -213,7 +197,7 @@ public boolean resolveLocks(BackOffer bo, List locks) { private void resolveLock(BackOffer bo, Lock lock, long txnStatus, Set cleanRegion) { while (true) { - region = regionManager.getRegionByKey(lock.getKey()); + region = session.getRegionManager().getRegionByKey(lock.getKey()); if (cleanRegion.contains(region.getVerID())) { return; @@ -241,7 +225,7 @@ private void resolveLock(BackOffer bo, Lock lock, long txnStatus, Set handler = new KVErrorHandler<>( - regionManager, + session.getRegionManager(), this, region, resp -> resp.hasRegionError() ? 
resp.getRegionError() : null); @@ -289,7 +273,7 @@ public boolean onNotLeader(Store newStore) { if (logger.isDebugEnabled()) { logger.debug(region + ", new leader = " + newStore.getId()); } - TiRegion cachedRegion = regionManager.getRegionById(region.getId()); + TiRegion cachedRegion = getSession().getRegionManager().getRegionById(region.getId()); // When switch leader fails or the region changed its key range, // it would be necessary to re-split task's key range for new region. if (!region.getStartKey().equals(cachedRegion.getStartKey()) @@ -297,8 +281,9 @@ public boolean onNotLeader(Store newStore) { return false; } region = cachedRegion; - String addressStr = newStore.getAddress(); - ManagedChannel channel = channelFactory.getChannel(addressStr); + String addressStr = + getSession().getRegionManager().getStoreById(region.getLeader().getStoreId()).getAddress(); + ManagedChannel channel = getSession().getChannel(addressStr); blockingStub = TikvGrpc.newBlockingStub(channel); asyncStub = TikvGrpc.newStub(channel); return true; @@ -307,7 +292,7 @@ public boolean onNotLeader(Store newStore) { @Override public void onStoreNotMatch(Store store) { String addressStr = store.getAddress(); - ManagedChannel channel = channelFactory.getChannel(addressStr); + ManagedChannel channel = getSession().getChannel(addressStr); blockingStub = TikvGrpc.newBlockingStub(channel); asyncStub = TikvGrpc.newStub(channel); } diff --git a/src/main/java/org/tikv/txn/TikvTransaction.java b/src/main/java/org/tikv/txn/TikvTransaction.java new file mode 100644 index 00000000000..14a34a155be --- /dev/null +++ b/src/main/java/org/tikv/txn/TikvTransaction.java @@ -0,0 +1,221 @@ +package org.tikv.txn; + +import com.google.common.collect.Lists; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.tikv.common.Snapshot; +import org.tikv.common.key.Key; +import org.tikv.common.meta.TiTimestamp; + +import java.security.SecureRandom; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.function.Function; + +/** + * Transaction implementation of TiKV client + */ +public class TikvTransaction implements ITransaction { + private static final Logger LOG = LoggerFactory.getLogger(TikvTransaction.class); + + private TxnKVClient kvClient; + /** + * start timestamp of transaction which get from PD + */ + private long startTS; + /** + * Monotonic timestamp for recording txn time consuming. + */ + private long startTime; //for recording txn time consuming. 
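
For illustration only, not part of this patch: a minimal sketch of how a caller might drive this transaction API end to end, using only methods this diff defines (TxnKVClient.createClient, begin, and the ITransaction operations). The PD address is a placeholder assumption.

import org.tikv.txn.ITransaction;
import org.tikv.txn.TxnKVClient;

public class TxnUsageSketch {
  public static void main(String[] args) throws Exception {
    // placeholder PD address; point this at your cluster's placement driver
    try (TxnKVClient client = TxnKVClient.createClient("127.0.0.1:2379")) {
      ITransaction txn = client.begin();
      txn.set("k1".getBytes(), "v1".getBytes()); // buffered in memoryKvStore until commit
      byte[] value = txn.get("k1".getBytes());   // served from the buffer, else from the snapshot
      if (!txn.commit()) {                       // commit() drives 2PC via TwoPhaseCommitter
        txn.rollback();
      }
    }
  }
}
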
+ /** + * transaction valid flag + */ + private boolean valid; + + //private ReentrantMutex mutex = new ReentrantMutex(); + + private Map memoryKvStore = new HashMap<>(); + + private List lockKeys; + + private Snapshot snapshot; + + private final Function transactionFunction; + + private static final int retryBackOffCap = 100; + private static final int retryBackOffBase = 1; + private static final int maxRetryCnt = 100; + + private SecureRandom random = new SecureRandom(); + + public TikvTransaction(TxnKVClient client) { + this(client, null); + } + + public TikvTransaction(TxnKVClient client, Function function) { + this.kvClient = client; + this.startTime = System.currentTimeMillis(); + this.transactionFunction = function; + this.lockKeys = Lists.newLinkedList(); + this.init(client); + } + + private void init(TxnKVClient client) { + this.valid = true; + TiTimestamp tiTimestamp = kvClient.getTimestamp(); + this.startTS = tiTimestamp.getVersion(); + this.snapshot = new Snapshot(tiTimestamp, client.getSession()); + } + + @Override + public boolean set(byte[] key, byte[] value) { + memoryKvStore.put(key, value); + return true; + } + + @Override + public byte[] get(byte[] key) { + if(memoryKvStore.get(key) != null) { + return memoryKvStore.get(key); + } + return snapshot.get(key); + } + + @Override + public boolean delete(byte[] key) { + memoryKvStore.put(key, new byte[0]); + return true; + } + + @Override + public boolean commit() { + boolean result; + if(this.transactionFunction != null) { + // commit with retry: re-run the transaction function when a write conflict is encountered + result = this.commitWithRetry(); + } else { + TwoPhaseCommitter committer = new TwoPhaseCommitter(this); + // TODO: latches, for transactions which need to acquire latches + result = committer.execute(); + } + long endTime = System.currentTimeMillis(); + LOG.debug("txn startTime at {}, endTime at {}, took {}s in total", this.startTime, endTime, (endTime - this.startTime) / 1000); + return result; + } + + private boolean commitWithRetry() { + for(int i = 0; i < maxRetryCnt; i++) { + Function retryFunction = transactionFunction; + Boolean result = retryFunction.apply(this); + if(!result) { + this.rollback(); + continue; + } + + TwoPhaseCommitter committer = new TwoPhaseCommitter(this); + boolean commit = committer.execute(); + if(commit) { + return true; + } + this.lockKeys.clear(); + this.memoryKvStore.clear(); + this.init(kvClient); + LOG.warn("txn commit failed after {} attempts, startTs={}", i + 1, startTS); + backoff(i); + } + LOG.warn("txn commit failed after all retries, startTs={}", startTS); + return false; + } + + @Override + public boolean rollback() { + if(!this.valid) { + LOG.warn("rollback called on invalid txn, startTs={}, startTime={}", this.startTS, this.startTime); + return false; + } + this.close(); + LOG.debug("transaction rollback, startTs={}, startTime={}", this.startTS, this.startTime); + return true; + } + + @Override + public boolean lockKeys(Key... 
lockedKeys) { + for(Key key : lockedKeys) { + this.lockKeys.add(key.toByteString().toByteArray()); + } + return true; + } + + @Override + public boolean valid() { + return this.valid; + } + + @Override + public long getStartTS() { + return this.startTS; + } + + @Override + public long getStartTime() { + return this.startTime; + } + + @Override + public boolean isReadOnly() { + return false; + } + + @Override + public Snapshot getSnapshot() { + return this.snapshot; + } + + @Override + public TxnKVClient getKVClient() { + return this.kvClient; + } + + @Override + public Map getStoredKeys() { + return memoryKvStore; + } + + @Override + public List getLockedKeys() { + return this.lockKeys; + } + + private void close() { + this.valid = false; + this.lockKeys.clear(); + this.memoryKvStore.clear(); + } + + private byte[][] toKeys() { + byte[][] keys = new byte[memoryKvStore.size()][]; + int i = 0; + for(byte[] key : memoryKvStore.keySet()) { + keys[i++] = key; + } + return keys; + } + + // Implements exponential backoff with full jitter. + // Returns the actual backoff time in milliseconds. + // See http://www.awsarchitectureblog.com/2015/03/backoff.html. + private int backoff(int attempts) { + int upper = (int)(Math.min(retryBackOffCap, retryBackOffBase * Math.pow(2.0, attempts))); + int sleep = random.nextInt(upper); + try { + Thread.sleep(sleep); + LOG.debug("txn slept {}ms at attempt {}", sleep, attempts); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + LOG.warn("txn backoff interrupted", e); + } + return sleep; + } +} diff --git a/src/main/java/org/tikv/txn/TwoPhaseCommitter.java b/src/main/java/org/tikv/txn/TwoPhaseCommitter.java new file mode 100644 index 00000000000..3c7cd49c395 --- /dev/null +++ b/src/main/java/org/tikv/txn/TwoPhaseCommitter.java @@ -0,0 +1,505 @@ +package org.tikv.txn; + +import com.google.protobuf.ByteString; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.tikv.common.ReadOnlyPDClient; +import org.tikv.common.exception.GrpcException; +import org.tikv.common.meta.TiTimestamp; +import org.tikv.common.region.TiRegion; +import org.tikv.common.util.BackOffFunction; +import org.tikv.common.util.BackOffer; +import org.tikv.common.util.ConcreteBackOffer; +import org.tikv.common.util.FastByteComparisons; +import org.tikv.kvproto.Kvrpcpb; +import org.tikv.txn.pool.SecondaryCommitTaskThreadPool; +import org.tikv.txn.type.BatchKeys; +import org.tikv.txn.type.ClientRPCResult; +import org.tikv.txn.type.GroupKeyResult; +import org.tikv.txn.type.TwoPhaseCommitType; + +import java.util.*; + +/** + * Two-phase commit (2PC) implementation for TiKV + */ +public class TwoPhaseCommitter { + private final static Logger LOG = LoggerFactory.getLogger(TwoPhaseCommitter.class); + + // TiKV recommends each RPC packet should be less than ~1MB. We keep each packet's + // Key+Value size below 16KB. 
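
A note on the backoff() helper at the end of TikvTransaction above: it implements capped exponential backoff with full jitter, so colliding transactions retry at randomized, spread-out intervals. A self-contained sketch of the same policy (constants mirror retryBackOffCap and retryBackOffBase; the 100 ms cap is this class's choice, not a TiKV requirement):

import java.security.SecureRandom;

public class FullJitterBackoff {
  private static final int CAP_MILLIS = 100; // mirrors retryBackOffCap
  private static final int BASE_MILLIS = 1;  // mirrors retryBackOffBase
  private static final SecureRandom RANDOM = new SecureRandom();

  // Sleeps for a random duration in [0, min(cap, base * 2^attempt)) and returns it.
  public static int backoff(int attempt) throws InterruptedException {
    int upper = (int) Math.min(CAP_MILLIS, BASE_MILLIS * Math.pow(2.0, attempt));
    int sleepMillis = RANDOM.nextInt(Math.max(upper, 1)); // guard against nextInt(0)
    Thread.sleep(sleepMillis);
    return sleepMillis;
  }
}
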
+ private static final int txnCommitBatchSize = 16 * 1024; + private static final long defaultLockTTL = 3000; // unit: milliseconds + private static final int bytesPerMiB = 1024 * 1024; + private long maxTxnTimeUse = 60_1000; // unit: milliseconds + // ttl = ttlFactor * sqrt(writeSizeInMiB) + private static final int ttlFactor = 6000; + private static final int maxLockTTL = 12000; + private static final SecondaryCommitTaskThreadPool secondaryCommitPool = new SecondaryCommitTaskThreadPool(); + + private Map mutations = new LinkedHashMap<>(); + private List keysList; + private ReadOnlyPDClient pdClient; + private TxnKVClient kvClient; + + private long lockTTL = 0; + /** + * start timestamp of the transaction, obtained from PD + */ + private long startTs = 0; + /** + * commit timestamp of the transaction, obtained from PD + */ + private long commitTs = 0; + + private volatile boolean prewriteTaskError = false; + + private TwoPhaseCommitter() {} + + public TwoPhaseCommitter(ITransaction transaction) { + this.pdClient = transaction.getKVClient().getSession().getPDClient(); + this.keysList = new LinkedList<>(); + this.kvClient = transaction.getKVClient(); + this.startTs = transaction.getStartTS(); + + Map storedKeys = transaction.getStoredKeys(); + int putCount = 0, delCount = 0, lockCount = 0; + int txnSize = 0; + for(byte[] key : storedKeys.keySet()) { + byte[] value = storedKeys.get(key); + if(value.length > 0) { + Kvrpcpb.Mutation mutation = Kvrpcpb.Mutation.newBuilder() + .setKey(ByteString.copyFrom(key)) + .setValue(ByteString.copyFrom(value)) + .setOp(Kvrpcpb.Op.Put) + .build(); + mutations.put(new String(key), mutation); + putCount++; + } else { + Kvrpcpb.Mutation mutation = Kvrpcpb.Mutation.newBuilder() + .setKey(ByteString.copyFrom(key)) + .setOp(Kvrpcpb.Op.Del) + .build(); + mutations.put(new String(key), mutation); + delCount++; + } + keysList.add(key); + txnSize += (key.length + value.length); + //TODO check transaction maxEntrySize + } + List lockedKeys = transaction.getLockedKeys(); + for(byte[] lockedKey : lockedKeys) { + Kvrpcpb.Mutation mutation = Kvrpcpb.Mutation.newBuilder() + .setKey(ByteString.copyFrom(lockedKey)) + .setOp(Kvrpcpb.Op.Lock) + .build(); + mutations.put(new String(lockedKey), mutation); + lockCount++; + keysList.add(lockedKey); + txnSize += (lockedKey.length); + } + this.lockTTL = getTxnLockTTL(transaction.getStartTime(), txnSize); + LOG.debug("Txn info, startTs={}, putCount={}, delCount={}, lockCount={}, lockTTL={}", startTs, putCount, delCount, lockCount, lockTTL); + } + + private long getTxnLockTTL(long startTime, int txnSize) { + // Increase lockTTL for large transactions. + // The formula is `ttl = ttlFactor * sqrt(sizeInMiB)`. + // When writeSize is less than 256KB, the base ttl is defaultTTL (3s); + // when writeSize is 1MiB, 100MiB, or 400MiB, ttl is 6s, 60s, 120s respectively. + long ttl = defaultLockTTL; + if(txnSize >= txnCommitBatchSize) { + int sizeInMiB = txnSize / bytesPerMiB; + ttl = (long)(ttlFactor * Math.sqrt(sizeInMiB)); + if(ttl < defaultLockTTL) { + ttl = defaultLockTTL; + } + if(ttl > maxLockTTL) { + ttl = maxLockTTL; + } + } + // Increase lockTTL by the transaction's read time. + // When resolving a lock, we compare the current ts and startTS+lockTTL to decide whether to clean up. If a txn + // takes a long time to read, increasing its TTL will help to prevent it from being aborted soon after prewrite. 
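
A quick check of the TTL curve just described, as a standalone sketch of the same arithmetic (constants are the ones declared above; sizeInMiB uses integer division, so writes under 1 MiB floor to zero and fall back to the clamp's lower bound). One observed inconsistency worth noting: the comment promises 60 s and 120 s for 100 MiB and 400 MiB, but with maxLockTTL = 12000 ms as declared, anything above roughly 4 MiB clamps to 12 s; presumably one of the two is stale.

static long lockTTLMillis(int txnSizeBytes) {
  long ttl = 3000;                  // defaultLockTTL, 3 s
  if (txnSizeBytes >= 16 * 1024) {  // txnCommitBatchSize
    int sizeInMiB = txnSizeBytes / (1024 * 1024);   // integer division, as in the original
    ttl = (long) (6000 * Math.sqrt(sizeInMiB));     // ttlFactor * sqrt(sizeInMiB)
    ttl = Math.max(3000, Math.min(ttl, 12000));     // clamp to [defaultLockTTL, maxLockTTL]
  }
  return ttl;
}

// 10 KB -> 3000 ms (below the batch size, default TTL)
// 1 MiB -> 6000 ms; 4 MiB -> 12000 ms (hits the maxLockTTL clamp)
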
+ long elapsed = (System.currentTimeMillis() - startTime); + if(elapsed < 0) { // should never happen + LOG.warn("transaction startTime elapsed invalid, startTime={}, elapsed={}", startTime, elapsed); + } + return ttl + elapsed; + } + + private GroupKeyResult groupKeysByRegion(BackOffer backOffer, byte[][] keys) { + Map> groups = new HashMap<>(); + long first = 0; + int index = 0; + String error = null; + try { + for(; index < keys.length; index++) { + byte[] key = keys[index]; + TiRegion tiRegion = this.pdClient.getRegionByKey(backOffer, ByteString.copyFrom(key)); + if(tiRegion != null) { + Long regionId = tiRegion.getId(); + if(index == 0) { + first = regionId; + } + List groupItem = groups.computeIfAbsent(regionId, e -> new LinkedList<>()); + groupItem.add(key); + } + } + } catch (Exception e) { + error = String.format("Txn groupKeysByRegion error, %s", e.getMessage()); + } + GroupKeyResult result = new GroupKeyResult(); + if(error == null) { + result.setFirstRegion(first); + result.setGroupsResult(groups); + } + result.setErrorMsg(error); + return result; + } + + private void appendBatchBySize(List batchKeyList, Long regionId, List keys, boolean sizeKeyValue, int limit) { + int start, end = 0; + int len = keys.size(); + for(start = 0; start < len; start = end) { + int size = 0; + for(end = start; end < len && size < limit; end++) { + if(sizeKeyValue) { + size += this.keyValueSize(keys.get(end)); + } else { + size += this.keySize(keys.get(end)); + } + } + BatchKeys batchKeys = new BatchKeys(regionId, keys.subList(start, end)); + batchKeyList.add(batchKeys); + } + } + + private boolean isPrimaryKey(byte[] key) { + return this.keysList != null && FastByteComparisons.compareTo(this.keysList.get(0), key) == 0; + } + + protected String doActionOnKeys(BackOffer backOffer, TwoPhaseCommitType actionType, byte[][] keys) { + if(keys == null || keys.length == 0) { + return null; + } + // group keys by region + GroupKeyResult groupResult = this.groupKeysByRegion(backOffer, keys); + if(groupResult.hasError()) { + return groupResult.getErrorMsg(); + } + boolean sizeKeyValue = false; + if(actionType == TwoPhaseCommitType.actionPrewrite) { + sizeKeyValue = true; + } + List batchKeyList = new LinkedList<>(); + Map> groupKeyMap = groupResult.getGroupsResult(); + long firstRegion = groupResult.getFirstRegion(); + // Make sure the group that contains the primary key goes first. + this.appendBatchBySize(batchKeyList, firstRegion, groupKeyMap.get(firstRegion), + sizeKeyValue, txnCommitBatchSize); + groupKeyMap.remove(firstRegion); + for(Long regionId : groupKeyMap.keySet()) { + this.appendBatchBySize(batchKeyList, regionId, groupKeyMap.get(regionId), + sizeKeyValue, txnCommitBatchSize); + } + boolean firstIsPrimary = this.isPrimaryKey(keys[0]); + + if (firstIsPrimary && (actionType == TwoPhaseCommitType.actionCommit + || actionType == TwoPhaseCommitType.actionCleanup)) { + // the primary key must be committed/cleaned up first + String error = this.doActionOnBatches(backOffer, actionType, batchKeyList.subList(0, 1)); + if(error != null) { + return error; + } + batchKeyList.remove(0); + } + String error = null; + if(actionType == TwoPhaseCommitType.actionCommit) { + // Commit secondary batches in a background thread to reduce latency. 
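
To make the batching above concrete, a worked trace of appendBatchBySize with hypothetical key sizes (the inner loop checks the limit before each append, so a batch may overshoot the 16 KB limit by one entry):

// appendBatchBySize(batches, 7L, [k1 (10 KB), k2 (10 KB), k3 (2 KB)], true, 16 * 1024)
//   pass 1: size 0 -> append k1 (size 10 KB) -> 10 KB < 16 KB -> append k2 (size 20 KB) -> stop
//   pass 2: append k3
//   result: BatchKeys(region 7, [k1, k2]), BatchKeys(region 7, [k3])
// With sizeKeyValue = true (the prewrite case), sizes count key plus value; otherwise key only.
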
+ secondaryCommitPool.submitSecondaryTask(() -> { + if(prewriteTaskError) { + LOG.error("Txn 2PC async doActionOnBatches canceled, another secondary thread failed"); + return; + } + String errorInner = doActionOnBatches(backOffer, actionType, batchKeyList); + if(errorInner != null) { + LOG.warn("Txn 2PC async doActionOnBatches error: {}", errorInner); + prewriteTaskError = true; + } + }); + } else { + error = this.doActionOnBatches(backOffer, actionType, batchKeyList); + } + return error; + } + + // doActionOnBatches applies the given action to each batch. + public String doActionOnBatches(BackOffer backOffer, TwoPhaseCommitType actionType, List batchKeys) { + if(batchKeys.size() == 0) { + LOG.debug("Txn 2PC doActionOnBatches batch keys is empty: type={}", actionType); + return null; + } + switch(actionType) { + case actionPrewrite: { + return this.doPrewriteActionOnBatches(backOffer, batchKeys); + } + case actionCommit: { + return this.doCommitActionOnBatches(backOffer, batchKeys); + } + case actionCleanup: { + return this.doCleanupActionOnBatches(backOffer, batchKeys); + } + } + return null; + } + + private String doPrewriteActionOnBatches(BackOffer backOffer, List batchKeysList) { + if(batchKeysList.size() == 1) { + return this.prewriteSingleBatch(backOffer, batchKeysList.get(0)); + } + // For prewrite, stop sending other requests after receiving the first error. + for(BatchKeys batchKeys : batchKeysList) { + String error = prewriteSingleBatch(backOffer, batchKeys); + if(error != null) { + // signal to other threads that an error happened + prewriteTaskError = true; + return error; + } + } + return null; + } + + private String doCommitActionOnBatches(BackOffer backOffer, List batchKeysList) { + if(batchKeysList.size() == 1) { + return this.commitSingleBatch(backOffer, batchKeysList.get(0)); + } + // Stop sending other requests after receiving the first error. + for(BatchKeys batchKeys : batchKeysList) { + String error = commitSingleBatch(backOffer, batchKeys); + if(error != null) { + return error; + } + } + return null; + } + + private String doCleanupActionOnBatches(BackOffer backOffer, List batchKeysList) { + if(batchKeysList.size() == 1) { + return this.cleanupSingleBatch(backOffer, batchKeysList.get(0)); + } + // Stop sending other requests after receiving the first error. 
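
The loops in these do*ActionOnBatches methods fail fast and flip the volatile prewriteTaskError flag, which the asynchronous secondary-commit task submitted above polls before starting. A distilled sketch of that cooperative-cancellation pattern (illustrative names, not from this patch):

import java.util.List;

class FailFastBatchRunner {
  private volatile boolean failed; // shared across worker threads, like prewriteTaskError

  // Returns null on success or the first error message, matching this class's convention.
  String runAll(List<Runnable> batches) {
    for (Runnable batch : batches) {
      if (failed) {
        return "canceled: a sibling batch already failed";
      }
      try {
        batch.run();
      } catch (RuntimeException e) {
        failed = true; // signal the other workers
        return e.getMessage();
      }
    }
    return null;
  }
}
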
+ for(BatchKeys batchKeys : batchKeysList) { + String error = cleanupSingleBatch(backOffer, batchKeys); + if(error != null) { + // signal to other threads that an error happened + prewriteTaskError = true; + LOG.warn("Txn 2PC doCleanupActionOnBatches failed: {}", error); + return error; + } + } + return null; + } + + public long keyValueSize(byte[] key) { + long size = key.length; + String keyStr = new String(key); + Kvrpcpb.Mutation mutation = this.mutations.get(keyStr); + if(mutation != null) { + size += mutation.getValue().toByteArray().length; + } + + return size; + } + + public long keySize(byte[] key) { + return key.length; + } + + private byte[] primaryKey() { + return this.keysList.get(0); + } + + public String prewriteSingleBatch(BackOffer backOffer, BatchKeys batchKeys) { + List keyList = batchKeys.getKeys(); + int batchSize = keyList.size(); + byte[][] keys = new byte[batchSize][]; + int index = 0; + List mutationList = new ArrayList<>(batchSize); + for(byte[] key : keyList) { + mutationList.add(mutations.get(new String(key))); + keys[index++] = key; + } + // send the RPC request to the TiKV server + long regionId = batchKeys.getRegioId(); + ClientRPCResult prewriteResult = this.kvClient.prewrite(backOffer, mutationList, primaryKey(), + this.lockTTL, this.startTs, regionId); + if(!prewriteResult.isSuccess() && !prewriteResult.isRetry()) { + return prewriteResult.getError(); + } + if(!prewriteResult.isSuccess() && prewriteResult.isRetry()) { + try { + backOffer.doBackOff(BackOffFunction.BackOffFuncType.BoRegionMiss, + new GrpcException(String.format("Txn prewriteSingleBatch failed, regionId=%s, detail=%s", batchKeys.getRegioId(), prewriteResult.getError()))); + // re-split keys and prewrite again. + String error = this.prewriteKeys(backOffer, keys); + return error; + } catch (GrpcException e) { + String error = String.format("Txn prewriteSingleBatch error, re-split prewrite failed, regionId=%s, detail=%s", batchKeys.getRegioId(), e.getMessage()); + LOG.error(error); + return error; + } + } + // success + return null; + } + + public String commitSingleBatch(BackOffer backOffer, BatchKeys batchKeys) { + List keysCommit = batchKeys.getKeys(); + byte[][] keys = new byte[keysCommit.size()][]; + keysCommit.toArray(keys); + // send the RPC request to the TiKV server + long regionId = batchKeys.getRegioId(); + ClientRPCResult commitResult = this.kvClient.commit(backOffer, keys, + this.startTs, this.commitTs, regionId); + if(!commitResult.isSuccess() && !commitResult.isRetry()) { + String error = String.format("Txn commitSingleBatch error, regionId=%s", batchKeys.getRegioId()); + LOG.error(error); + return error; + } + if(!commitResult.isSuccess() && commitResult.isRetry()) { + try { + backOffer.doBackOff(BackOffFunction.BackOffFuncType.BoRegionMiss, + new GrpcException(String.format("Txn commitSingleBatch failed, regionId=%s", batchKeys.getRegioId()))); + // re-split keys and commit again. 
+ String error = this.commitKeys(backOffer, keys); + if(error != null) { + LOG.error(error); + return error; + } + } catch (GrpcException e) { + String error = String.format("Txn commitSingleBatch error, re-split commit failed, regionId=%s", batchKeys.getRegioId()); + LOG.error(error); + return error; + } + } + if(!commitResult.isSuccess()) { + String error = String.format("Txn commitSingleBatch error, regionId=%s, detail=%s", batchKeys.getRegioId(), commitResult.getError()); + LOG.error(error); + return error; + } + return null; + } + + public String cleanupSingleBatch(BackOffer backOffer, BatchKeys batchKeys) { + byte[][] keys = new byte[batchKeys.getKeys().size()][]; + batchKeys.getKeys().toArray(keys); + ClientRPCResult rollbackResult = this.kvClient.batchRollbackReq(backOffer, keys, this.startTs, batchKeys.getRegioId()); + if(!rollbackResult.isSuccess() && !rollbackResult.isRetry()) { + String error = String.format("Txn cleanupSingleBatch error, regionId=%s", batchKeys.getRegioId()); + LOG.error(error); + return error; + } + if(!rollbackResult.isSuccess() && rollbackResult.isRetry()) { + try { + backOffer.doBackOff(BackOffFunction.BackOffFuncType.BoRegionMiss, + new GrpcException(String.format("Txn cleanupSingleBatch failed, regionId=%s", batchKeys.getRegioId()))); + // re-split keys and clean up again. + String error = this.cleanupKeys(backOffer, keys); + if(error != null) { + LOG.error(error); + return error; + } + } catch (GrpcException e) { + String error = String.format("Txn cleanupSingleBatch error, re-split cleanup failed, regionId=%s", batchKeys.getRegioId()); + LOG.error(error); + return error; + } + } + if(!rollbackResult.isSuccess()) { + String error = String.format("Txn cleanupSingleBatch error, regionId=%s, detail=%s", batchKeys.getRegioId(), rollbackResult.getError()); + LOG.error(error); + return error; + } + return null; + } + + /** + * 2pc - prewrite phase + * @param backOffer backoff context for retries + * @param keys keys to prewrite + * @return an error message, or null on success + */ + public String prewriteKeys(BackOffer backOffer, byte[][] keys) { + return this.doActionOnKeys(backOffer, TwoPhaseCommitType.actionPrewrite, keys); + } + + /** + * 2pc - commit phase + * @param backOffer backoff context for retries + * @param keys keys to commit + * @return an error message, or null on success + */ + public String commitKeys(BackOffer backOffer, byte[][] keys) { + return this.doActionOnKeys(backOffer, TwoPhaseCommitType.actionCommit, keys); + } + + /** + * 2pc - cleanup phase + * @param backOffer backoff context for retries + * @param keys keys to roll back + * @return an error message, or null on success + */ + public String cleanupKeys(BackOffer backOffer, byte[][] keys) { + return this.doActionOnKeys(backOffer, TwoPhaseCommitType.actionCleanup, keys); + } + + public boolean execute() { + BackOffer prewriteBackoff = ConcreteBackOffer.newCustomBackOff(3000); // ConcreteBackOffer.prewriteMaxBackoff + byte[][] keys = new byte[keysList.size()][]; + keysList.toArray(keys); + String prewriteError = this.prewriteKeys(prewriteBackoff, keys); + if(prewriteError != null) { + LOG.error("failed on prewrite, startTs={}, detail={}", this.startTs, prewriteError); + return false; + } + TiTimestamp commitTso = kvClient.getTimestamp(); + this.commitTs = commitTso.getVersion(); + // check commitTS + if(this.commitTs <= this.startTs) { + LOG.error("invalid transaction tso with startTs={}, commitTs={}", this.startTs, this.commitTs); + return false; + } + if(isExpired(this.startTs, maxTxnTimeUse)) { + LOG.error("transaction takes too much time, startTs={}, commitTs={}", this.startTs, commitTso.getVersion()); + return false; + } + BackOffer commitBackoff = ConcreteBackOffer.newCustomBackOff(BackOffer.commitMaxBackoff); + String commitError 
= this.commitKeys(commitBackoff, keys); + if(commitError != null) { + LOG.error("failed on commit, startTs={}, detail={}", this.startTs, commitError); + return false; + } + + return true; + } + + private boolean isExpired(long lockTs, long maxTxnTimeUse) { + return System.currentTimeMillis() >= TiTimestamp.extraPhysical(lockTs) + maxTxnTimeUse; + } +} diff --git a/src/main/java/org/tikv/txn/TxnKVClient.java b/src/main/java/org/tikv/txn/TxnKVClient.java index 65e0db092dd..6e57be5c4ae 100644 --- a/src/main/java/org/tikv/txn/TxnKVClient.java +++ b/src/main/java/org/tikv/txn/TxnKVClient.java @@ -1,5 +1,347 @@ package org.tikv.txn; -public class TxnKVClient { - // TODO: To be done. -} +import com.google.common.collect.Lists; +import com.google.protobuf.ByteString; +import io.grpc.StatusRuntimeException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.tikv.common.ReadOnlyPDClient; +import org.tikv.common.TiConfiguration; +import org.tikv.common.TiSession; +import org.tikv.common.exception.GrpcException; +import org.tikv.common.exception.RegionException; +import org.tikv.common.exception.TiKVException; +import org.tikv.common.meta.TiTimestamp; +import org.tikv.common.operation.iterator.ConcreteScanIterator; +import org.tikv.common.region.RegionManager; +import org.tikv.common.region.TiRegion; +import org.tikv.common.region.TxnRegionStoreClient; +import org.tikv.common.util.BackOffFunction; +import org.tikv.common.util.BackOffer; +import org.tikv.common.util.ConcreteBackOffer; +import org.tikv.common.util.Pair; +import org.tikv.kvproto.Kvrpcpb; +import org.tikv.kvproto.Metapb; +import org.tikv.txn.type.ClientRPCResult; + +import java.util.LinkedList; +import java.util.List; +import java.util.function.Function; + +/** + * Transactional KV client + * APIs for GET/PUT/DELETE/SCAN + */ +public class TxnKVClient implements AutoCloseable { + private final static Logger LOG = LoggerFactory.getLogger(TxnKVClient.class); + + private final TiSession session; + private final RegionManager regionManager; + private ReadOnlyPDClient pdClient; + + private TxnKVClient(String addresses) { + this.session = TiSession.create(TiConfiguration.createRawDefault(addresses)); + this.regionManager = session.getRegionManager(); + this.pdClient = session.getPDClient(); + } + + public static TxnKVClient createClient(String addresses) { + return new TxnKVClient(addresses); + } + + public TiSession getSession() { + return session; + } + + public TiTimestamp getTimestamp() { + BackOffer bo = ConcreteBackOffer.newTsoBackOff(); + TiTimestamp timestamp = new TiTimestamp(0, 0); + try { + while(true) { + try { + timestamp = pdClient.getTimestamp(bo); + break; + } catch (final TiKVException e) { // back off and retry; doBackOff throws once the retry budget is exhausted + bo.doBackOff(BackOffFunction.BackOffFuncType.BoPDRPC, e); + } + } + } catch (GrpcException e1) { + LOG.error("Get tso from pd failed", e1); + } + return timestamp; + } + + /** + * Begin a new transaction + * @return the new transaction + */ + public ITransaction begin() { + return new TikvTransaction(this); + } + + public ITransaction begin(Function function) { + return new TikvTransaction(this, function); + } + + // add backoff logic when encountering region errors, ErrBodyMissing, and other errors + public ClientRPCResult prewrite(BackOffer backOffer, List mutations, byte[] primary, long lockTTL, long startTs, long regionId) { + ClientRPCResult result = new ClientRPCResult(true, false, null); + // send request + Pair regionStore = regionManager.getRegionStorePairByRegionId(regionId); + TxnRegionStoreClient client = 
TxnRegionStoreClient.create(regionStore.first, regionStore.second, session); + try { + client.prewrite(backOffer, ByteString.copyFrom(primary), mutations, startTs, lockTTL); + } catch (final TiKVException | StatusRuntimeException e) { + result.setSuccess(false); + result.setRetry(e instanceof RegionException); // region errors are retryable, so the prewrite can be retried + result.setError(e.getMessage()); + } + return result; + } + + /** + * Commit request of 2pc; + * adds backoff logic when encountering region errors, ErrBodyMissing, and other errors + * @param backOffer backoff context for retries + * @param keys keys to commit + * @param startTs start timestamp of the transaction + * @param commitTs commit timestamp of the transaction + * @param regionId id of the region holding the keys + * @return the RPC result + */ + public ClientRPCResult commit(BackOffer backOffer, byte[][] keys, long startTs, long commitTs, long regionId) { + ClientRPCResult result = new ClientRPCResult(true, false, null); + // send request + Pair regionStore = regionManager.getRegionStorePairByRegionId(regionId); + TxnRegionStoreClient client = TxnRegionStoreClient.create(regionStore.first, regionStore.second, session); + List byteList = Lists.newArrayList(); + for(byte[] key : keys) { + byteList.add(ByteString.copyFrom(key)); + } + try { + client.commit(backOffer, byteList, startTs, commitTs); + } catch (final TiKVException | StatusRuntimeException e) { + result.setSuccess(false); + result.setRetry(e instanceof RegionException); // region errors are retryable, so the commit can be retried + result.setError(e.getMessage()); + } + return result; + } + + /** + * Cleanup request of 2pc + * @param backOffer backoff context for retries + * @param key the key to clean up + * @param startTs start timestamp of the transaction + * @param regionId id of the region holding the key + * @return true on success + */ + public boolean cleanup(BackOffer backOffer, byte[] key, long startTs, long regionId) { + try { + Pair regionStore = regionManager.getRegionStorePairByRegionId(regionId); + TxnRegionStoreClient client = TxnRegionStoreClient.create(regionStore.first, regionStore.second, session); + // send the RPC request to the TiKV server + client.cleanup(backOffer, ByteString.copyFrom(key), startTs); + return true; + } catch (final TiKVException e) { + LOG.error("Cleanup process error, retry end, key={}, startTs={}, regionId={}", new String(key), startTs, regionId); + return false; + } + } + + /** + * Request for batch rollback on TiKV; retries should be handled by the caller + * @param backOffer backoff context for retries + * @param keys keys to roll back + * @param startTs start timestamp of the transaction + * @param regionId id of the region holding the keys + * @return the RPC result + */ + public ClientRPCResult batchRollbackReq(BackOffer backOffer, byte[][] keys, long startTs, long regionId) { + ClientRPCResult result = new ClientRPCResult(true, false, null); + List byteList = Lists.newArrayList(); + for(byte[] key : keys) { + byteList.add(ByteString.copyFrom(key)); + } + try { + Pair regionStore = regionManager.getRegionStorePairByRegionId(regionId); + TxnRegionStoreClient client = TxnRegionStoreClient.create(regionStore.first, regionStore.second, session); + // send request + client.batchRollback(backOffer, byteList, startTs); + } catch (final Exception e) { + result.setSuccess(false); + result.setRetry(e instanceof RegionException); // region errors are retryable, so the rollback can be retried + result.setError(e.getMessage()); + } + return result; + } + + /** + * Get the value of a key from TiKV + * @param key the key to read + * @return the value, or an empty array if the read fails + */ + public byte[] get(byte[] key) { + ByteString byteKey = ByteString.copyFrom(key); + BackOffer bo = ConcreteBackOffer.newGetBackOff(); + long version = 0; + ByteString value = null; + try { + Pair region = regionManager.getRegionStorePairByKey(byteKey); + TxnRegionStoreClient client = TxnRegionStoreClient.create(region.first, 
region.second, session); + version = getTimestamp().getVersion(); + value = client.get(bo, byteKey, version); + } catch (final TiKVException | StatusRuntimeException e) { + LOG.error("Get process error, key={}, version={}", new String(key), version); + } + + return value != null ? value.toByteArray() : new byte[0]; + } + + /** + * Put a new key-value pair to TiKV + * @param key + * @param value + * @return + */ + public boolean put(byte[] key, byte[] value) { + boolean putResult = false; + ByteString byteKey = ByteString.copyFrom(key); + ByteString byteValue = ByteString.copyFrom(value); + BackOffer bo = ConcreteBackOffer.newCustomBackOff(BackOffer.prewriteMaxBackoff); + List mutations = Lists.newArrayList( + Kvrpcpb.Mutation.newBuilder() + .setKey(byteKey).setValue(byteValue).setOp(Kvrpcpb.Op.Put) + .build() + ); + long lockTTL = 2000; + long startTS; + TiRegion region = regionManager.getRegionByKey(byteKey); + boolean prewrite = false; + while(true) { + try { + startTS = this.getTimestamp().getVersion(); + ClientRPCResult prewriteResp = this.prewrite(bo, mutations, key, lockTTL, startTS, region.getId()); + if(prewriteResp.isSuccess() || (!prewriteResp.isSuccess() && !prewriteResp.isRetry())) { + if(prewriteResp.isSuccess()) { + prewrite = true; + } + break; + } + LOG.error("Put process error, prewrite try next time, error={}", prewriteResp.getError()); + bo.doBackOff(BackOffFunction.BackOffFuncType.BoRegionMiss, new TiKVException(prewriteResp.getError())); + } catch (final TiKVException e) { + LOG.error("Put process error, 2pc prewrite failed,", e); + } + } + if(prewrite) { + long commitTs; + byte[][] keys = new byte[1][]; + keys[0] = key; + while(true) { + try { + commitTs = this.getTimestamp().getVersion(); + region = regionManager.getRegionByKey(byteKey); + ClientRPCResult commitResp = this.commit(bo, keys, startTS, commitTs, region.getId()); + if(commitResp.isSuccess() || (!commitResp.isSuccess() && !commitResp.isRetry())) { + if(commitResp.isSuccess()) { + putResult = true; + } + break; + } + LOG.error("Put process failed, commit try next time, error={}", commitResp.getError()); + bo.doBackOff(BackOffFunction.BackOffFuncType.BoRegionMiss, new TiKVException(commitResp.getError())); + } catch (final TiKVException e) { + LOG.error("Put process error, 2pc commit failed,", e); + } + } + } + return putResult; + } + + /** + * Delete a key value from TiKV + * @param key the key will be deleted + */ + public boolean delete(byte[] key) { + boolean putResult = false; + ByteString byteKey = ByteString.copyFrom(key); + BackOffer bo = ConcreteBackOffer.newCustomBackOff(BackOffer.prewriteMaxBackoff); + List mutations = Lists.newArrayList( + Kvrpcpb.Mutation.newBuilder() + .setKey(byteKey).setOp(Kvrpcpb.Op.Del) + .build() + ); + long lockTTL = 2000; + long startTS; + TiRegion region = regionManager.getRegionByKey(byteKey); + boolean prewrite = false; + while(true) { + try { + startTS = this.getTimestamp().getVersion(); + ClientRPCResult prewriteResp = this.prewrite(bo, mutations, key, lockTTL, startTS, region.getId()); + if(prewriteResp.isSuccess() || (!prewriteResp.isSuccess() && !prewriteResp.isRetry())) { + if(prewriteResp.isSuccess()) { + prewrite = true; + } + break; + } + LOG.error("Delete process error, prewrite try next time, error={}", prewriteResp.getError()); + bo.doBackOff(BackOffFunction.BackOffFuncType.BoRegionMiss, new TiKVException(prewriteResp.getError())); + } catch (final TiKVException e) { + LOG.error("Delete process error, 2pc prewrite failed,", e); + } + } + if(prewrite) { + 
long commitTs; + byte[][] keys = new byte[1][]; + keys[0] = key; + while(true) { + try { + commitTs = this.getTimestamp().getVersion(); + region = regionManager.getRegionByKey(byteKey); + ClientRPCResult commitResp = this.commit(bo, keys, startTS, commitTs, region.getId()); + if(commitResp.isSuccess() || (!commitResp.isSuccess() && !commitResp.isRetry())) { + if(commitResp.isSuccess()) { + putResult = true; + } + break; + } + LOG.error("Delete process failed, commit try next time, error={}", commitResp.getError()); + bo.doBackOff(BackOffFunction.BackOffFuncType.BoRegionMiss, new TiKVException(commitResp.getError())); + } catch (final TiKVException e) { + LOG.error("Delete process error, 2pc commit failed,", e); + } + } + } + return putResult; + } + + /** + * Scan key-value pairs from TiKV + * @param startKey start key + * @param limit max number of pairs to return + * @return list of key-value pairs, up to the limit + */ + public List> scan(byte[] startKey, int limit) { + ByteString byteKey = ByteString.copyFrom(startKey); + long version = getTimestamp().getVersion(); + ConcreteScanIterator iterator = new ConcreteScanIterator(byteKey, session, version); + List> result = new LinkedList<>(); + int count = 0; + while(iterator.hasNext() && count++ < limit) { + Kvrpcpb.KvPair pair = iterator.next(); + result.add(Pair.create(pair.getKey().toByteArray(), pair.getValue().toByteArray())); + } + return result; + } + + private BackOffer defaultBackOff() { + return ConcreteBackOffer.newCustomBackOff(1000); + } + + @Override + public void close() throws Exception { + session.close(); + } +} \ No newline at end of file diff --git a/src/main/java/org/tikv/txn/pool/SecondaryCommitTaskThreadPool.java b/src/main/java/org/tikv/txn/pool/SecondaryCommitTaskThreadPool.java new file mode 100644 index 00000000000..6da62b7d1a5 --- /dev/null +++ b/src/main/java/org/tikv/txn/pool/SecondaryCommitTaskThreadPool.java @@ -0,0 +1,41 @@ +package org.tikv.txn.pool; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; + +/** + * 2PC: Secondary commit thread pool + */ +public final class SecondaryCommitTaskThreadPool implements AutoCloseable { + private final static Logger LOG = LoggerFactory.getLogger(SecondaryCommitTaskThreadPool.class); + + private ExecutorService taskThreadsPool; + + public SecondaryCommitTaskThreadPool() { + this.taskThreadsPool = Executors.newWorkStealingPool(); + } + + public String submitSecondaryTask(Runnable task) { + try { + this.taskThreadsPool.submit(task); + return null; + } catch (Exception e) { + LOG.error("Submit secondary task failed", e); + return "Submit secondary task failed"; + } + } + + @Override + public void close() throws Exception { + if(taskThreadsPool != null) { + taskThreadsPool.shutdown(); // stop accepting new tasks, then wait for in-flight ones + if (!taskThreadsPool.awaitTermination(20, TimeUnit.SECONDS)) { + taskThreadsPool.shutdownNow(); // cancel currently executing tasks + } + } + } +} diff --git a/src/main/java/org/tikv/txn/type/BaseResult.java b/src/main/java/org/tikv/txn/type/BaseResult.java new file mode 100644 index 00000000000..f3ddd6bbbbd --- /dev/null +++ b/src/main/java/org/tikv/txn/type/BaseResult.java @@ -0,0 +1,18 @@ +package org.tikv.txn.type; + +public class BaseResult { + + protected String errorMsg; + + public boolean hasError() { + return errorMsg != null && !errorMsg.equals(""); + } + + public String getErrorMsg() { + return errorMsg; + } + + public void setErrorMsg(String errorMsg) { + this.errorMsg = errorMsg; + } +} diff --git 
a/src/main/java/org/tikv/txn/type/BatchKeys.java b/src/main/java/org/tikv/txn/type/BatchKeys.java new file mode 100644 index 00000000000..7acdeea0232 --- /dev/null +++ b/src/main/java/org/tikv/txn/type/BatchKeys.java @@ -0,0 +1,45 @@ +package org.tikv.txn.type; + +import org.tikv.common.region.TiRegion; + +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; + +public class BatchKeys { + private List keys; + + private Long regioId; + + public BatchKeys(){} + + public BatchKeys(Long regioId, List keysInput) { + Objects.nonNull(regioId); + Objects.nonNull(keysInput); + this.regioId = regioId; + this.keys = new ArrayList<>(); + this.keys.addAll(keysInput); + } + + public List getKeys() { + return keys; + } + + public void setKeys(List keys) { + this.keys = keys; + } + + public Long getRegioId() { + return regioId; + } + + public void setRegioId(Long regioId) { + this.regioId = regioId; + } + + public byte[][] getKeysArray() { + byte[][] result = new byte[keys.size()][]; + keys.toArray(result); + return result; + } +} diff --git a/src/main/java/org/tikv/txn/type/ClientRPCResult.java b/src/main/java/org/tikv/txn/type/ClientRPCResult.java new file mode 100644 index 00000000000..459dc36aa92 --- /dev/null +++ b/src/main/java/org/tikv/txn/type/ClientRPCResult.java @@ -0,0 +1,37 @@ +package org.tikv.txn.type; + +public class ClientRPCResult { + boolean success; + boolean retry; + String error; + + public ClientRPCResult(boolean success, boolean retry, String error) { + this.success = success; + this.retry = retry; + this.error = error; + } + + public boolean isSuccess() { + return success; + } + + public void setSuccess(boolean success) { + this.success = success; + } + + public boolean isRetry() { + return retry; + } + + public void setRetry(boolean retry) { + this.retry = retry; + } + + public String getError() { + return error; + } + + public void setError(String error) { + this.error = error; + } +} diff --git a/src/main/java/org/tikv/txn/type/GroupKeyResult.java b/src/main/java/org/tikv/txn/type/GroupKeyResult.java new file mode 100644 index 00000000000..3217860e76c --- /dev/null +++ b/src/main/java/org/tikv/txn/type/GroupKeyResult.java @@ -0,0 +1,33 @@ +package org.tikv.txn.type; + + +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +public class GroupKeyResult extends BaseResult{ + + private Map> groupsResult; + + private Long firstRegion; + + public GroupKeyResult() { + this.groupsResult = new HashMap<>(); + } + + public Map> getGroupsResult() { + return groupsResult; + } + + public void setGroupsResult(Map> groupsResult) { + this.groupsResult = groupsResult; + } + + public Long getFirstRegion() { + return firstRegion; + } + + public void setFirstRegion(Long firstRegion) { + this.firstRegion = firstRegion; + } +} diff --git a/src/main/java/org/tikv/txn/type/TwoPhaseCommitType.java b/src/main/java/org/tikv/txn/type/TwoPhaseCommitType.java new file mode 100644 index 00000000000..665d5b4243d --- /dev/null +++ b/src/main/java/org/tikv/txn/type/TwoPhaseCommitType.java @@ -0,0 +1,10 @@ +package org.tikv.txn.type; + +/** + * 2PC command types + */ +public enum TwoPhaseCommitType { + actionPrewrite, + actionCommit, + actionCleanup +} diff --git a/src/main/proto/analyze.proto b/src/main/proto/analyze.proto new file mode 100644 index 00000000000..cf4c09ea0c1 --- /dev/null +++ b/src/main/proto/analyze.proto @@ -0,0 +1,111 @@ +syntax = "proto2"; + +package tipb; + +option java_multiple_files = true; +option java_package = "com.pingcap.tidb.tipb"; + 
+import "schema.proto"; +import "gogoproto/gogo.proto"; + +option (gogoproto.marshaler_all) = true; +option (gogoproto.sizer_all) = true; +option (gogoproto.unmarshaler_all) = true; + +enum AnalyzeType { + TypeIndex = 0; + TypeColumn = 1; +} + +message AnalyzeReq { + optional AnalyzeType tp = 1 [(gogoproto.nullable) = false]; + optional uint64 start_ts = 2 [(gogoproto.nullable) = false]; + optional uint64 flags = 3 [(gogoproto.nullable) = false]; + optional int64 time_zone_offset = 4 [(gogoproto.nullable) = false]; + optional AnalyzeIndexReq idx_req = 5; + optional AnalyzeColumnsReq col_req = 6; +} + +message AnalyzeIndexReq { + // bucket_size is the max histograms bucket size. + optional int64 bucket_size = 1 [(gogoproto.nullable) = false]; + + // num_columns is the number of columns in the index. + optional int32 num_columns = 2 [(gogoproto.nullable) = false]; + + optional int32 cmsketch_depth = 3; + + optional int32 cmsketch_width = 4; +} + +message AnalyzeColumnsReq { + // bucket_size is the max histograms bucket size, we need this because when primary key is handle, + // the histogram will be directly built. + optional int64 bucket_size = 1 [(gogoproto.nullable) = false]; + + // sample_size is the max number of samples that will be collected. + optional int64 sample_size = 2 [(gogoproto.nullable) = false]; + + // sketch_size is the max sketch size. + optional int64 sketch_size = 3 [(gogoproto.nullable) = false]; + + // columns_info is the info of all the columns that needs to be analyzed. + repeated ColumnInfo columns_info = 4; + + optional int32 cmsketch_depth = 5; + + optional int32 cmsketch_width = 6; +} + +message AnalyzeColumnsResp { + // collectors is the sample collectors for columns. + repeated SampleCollector collectors = 1; + + // pk_hist is the histogram for primary key when it is the handle. + optional Histogram pk_hist = 2; +} + +message AnalyzeIndexResp { + optional Histogram hist = 1; + optional CMSketch cms = 2; +} + +// Bucket is an element of histogram. +message Bucket { + optional int64 count = 1 [(gogoproto.nullable) = false]; + optional bytes lower_bound = 2; + optional bytes upper_bound = 3; + optional int64 repeats = 4 [(gogoproto.nullable) = false]; +} + +message Histogram { + // ndv is the number of distinct values. + optional int64 ndv = 1 [(gogoproto.nullable) = false]; + + // buckets represents all the buckets. + repeated Bucket buckets = 2; +} + +// FMSketch is used to count distinct values for columns. +message FMSketch { + optional uint64 mask = 1 [(gogoproto.nullable) = false]; + repeated uint64 hashset = 2; +} + +// SampleCollector is used for collect samples and calculate the count and ndv of an column. 
+message SampleCollector { + repeated bytes samples = 1; + optional int64 null_count = 2 [(gogoproto.nullable) = false]; + optional int64 count = 3 [(gogoproto.nullable) = false]; + optional FMSketch fm_sketch = 4; + optional CMSketch cm_sketch = 5; + optional int64 total_size = 6; +} + +message CMSketchRow { + repeated uint32 counters = 1; +} + +message CMSketch { + repeated CMSketchRow rows = 1; +} diff --git a/src/main/proto/binlog/binlog.proto b/src/main/proto/binlog/binlog.proto new file mode 100644 index 00000000000..4c965015f5f --- /dev/null +++ b/src/main/proto/binlog/binlog.proto @@ -0,0 +1,83 @@ +syntax = "proto2"; + +package binlog; + +import "gogoproto/gogo.proto"; + +option (gogoproto.marshaler_all) = true; +option (gogoproto.sizer_all) = true; +option (gogoproto.unmarshaler_all) = true; + +enum MutationType { + Insert = 0; + Update = 1; + DeleteID = 2; // Obsolete field. + DeletePK = 3; // Obsolete field. + DeleteRow = 4; +} + +// TableMutation contains mutations in a table. +message TableMutation { + optional int64 table_id = 1 [(gogoproto.nullable) = false]; + + // The inserted row contains all column values. + repeated bytes inserted_rows = 2; + + // The updated row contains old values and new values of the row. + repeated bytes updated_rows = 3; + + // Obsolete field. + repeated int64 deleted_ids = 4; + + // Obsolete field. + repeated bytes deleted_pks = 5; + + // The row value of the deleted row. + repeated bytes deleted_rows = 6; + + // Used to apply table mutations in original sequence. + repeated MutationType sequence = 7; +} + +message PrewriteValue { + optional int64 schema_version = 1 [(gogoproto.nullable) = false]; + repeated TableMutation mutations = 2 [(gogoproto.nullable) = false]; +} + +enum BinlogType { + Prewrite = 0; // has start_ts, prewrite_key, prewrite_value. + Commit = 1; // has start_ts, commit_ts. + Rollback = 2; // has start_ts. + PreDDL = 3; // has ddl_query, ddl_job_id. + PostDDL = 4; // has ddl_job_id. +} + +// Binlog contains all the changes in a transaction, which can be used to reconstruct SQL statement, then export to +// other systems. +message Binlog { + optional BinlogType tp = 1 [(gogoproto.nullable) = false]; + + // start_ts is used in Prewrite, Commit and Rollback binlog Type. + // It is used for pairing prewrite log to commit log or rollback log. + optional int64 start_ts = 2 [(gogoproto.nullable) = false]; + + // commit_ts is used only in binlog type Commit. + optional int64 commit_ts = 3 [(gogoproto.nullable) = false]; + + // prewrite key is used only in Prewrite binlog type. + // It is the primary key of the transaction, is used to check that the transaction is + // commited or not if it failed to pair to commit log or rollback log within a time window. + optional bytes prewrite_key = 4; + + // prewrite_data is marshalled from PrewriteData type, + // we do not need to unmarshal prewrite data before the binlog have been successfully paired. + optional bytes prewrite_value = 5; + + // ddl_query is the original ddl statement query, used for PreDDL type. + optional bytes ddl_query = 6; + + // ddl_job_id is used for PreDDL and PostDDL binlog type. + // If PreDDL has matching PostDDL with the same job_id, we can execute the DDL right away, otherwise, + // we can use the job_id to check if the ddl statement has been successfully added to DDL job list. 
+ optional int64 ddl_job_id = 7 [(gogoproto.nullable) = false]; +} diff --git a/src/main/proto/binlog/cistern.proto b/src/main/proto/binlog/cistern.proto new file mode 100644 index 00000000000..fd8cffbf179 --- /dev/null +++ b/src/main/proto/binlog/cistern.proto @@ -0,0 +1,58 @@ +syntax = "proto3"; + +package binlog; + +import "gogoproto/gogo.proto"; + +option (gogoproto.marshaler_all) = true; +option (gogoproto.unmarshaler_all) = true; +option (gogoproto.sizer_all) = true; + +// Interfaces exported by CisternServer. +service Cistern { + // DumpBinlog dumps continuous binlog items in a stream from a given position + rpc DumpBinlog(DumpBinlogReq) returns (stream DumpBinlogResp) {} + + // DumpDDLJobs dumps all history DDL jobs before a specified commitTS + rpc DumpDDLJobs(DumpDDLJobsReq) returns (DumpDDLJobsResp) {} + + // Notify notifies all living cisterns that a new pump is coming + // the living cisterns can be queried from pd + rpc Notify(NotifyReq) returns (NotifyResp) {} +} + +message DumpBinlogReq { + // beginCommitTS speicifies the position from which begin to dump binlogs. + // note that actually the result of dump starts from the one next to beginCommitTS + // it should be zero in case of the first request. + int64 beginCommitTS = 1; +} + +message DumpBinlogResp { + // CommitTS specifies the commitTS of binlog + int64 commitTS = 1; + + // payloads is bytecodes encoded from binlog item + bytes payload = 2; + + // ddljob is json bytes marshaled from corresponding ddljob struct if payload is a DDL type of binlog + bytes ddljob = 3; +} + +message DumpDDLJobsReq { + // beginCommitTS is the start point of drainer processing binlog, DumpDDLJobs() returns + // all history DDL jobs before this position, then drainer will apply these DDL jobs + // in order of job ID to restore the whole schema info at that moment. + int64 beginCommitTS = 1; +} + +message DumpDDLJobsResp { + // ddljobs is an array of JSON encoded history DDL jobs + repeated bytes ddljobs = 1; +} + +message NotifyReq { +} + +message NotifyResp { +} diff --git a/src/main/proto/binlog/pump.proto b/src/main/proto/binlog/pump.proto new file mode 100644 index 00000000000..ba059a3681e --- /dev/null +++ b/src/main/proto/binlog/pump.proto @@ -0,0 +1,68 @@ +syntax = "proto3"; + +package binlog; + +import "gogoproto/gogo.proto"; + +option (gogoproto.marshaler_all) = true; +option (gogoproto.unmarshaler_all) = true; +option (gogoproto.sizer_all) = true; + +// Interfaces exported by Pump. +service Pump { + // Writes a binlog to the local file on the pump machine. + // A response with an empty errmsg is returned if the binlog is written successfully. + rpc WriteBinlog(WriteBinlogReq) returns (WriteBinlogResp) {} + + // Sends binlog stream from a given location. + rpc PullBinlogs(PullBinlogReq) returns (stream PullBinlogResp) {} +} + +message WriteBinlogReq { + // The identifier of tidb-cluster, which is given at tidb startup. + // Must specify the clusterID for each binlog to write. + uint64 clusterID = 1; + + // Payload bytes can be decoded back to binlog struct by the protobuf. + bytes payload = 2; +} + +message WriteBinlogResp { + // An empty errmsg returned means a successful write. + // Otherwise return the error description. + string errmsg = 1; +} + +message PullBinlogReq { + // Specifies which clusterID of binlog to pull. + uint64 clusterID = 1; + + // The position from which the binlog will be sent. 
+ Pos startFrom = 2 [(gogoproto.nullable) = false]; +} + +message PullBinlogResp { + // The binlog entity sent in the stream + Entity entity = 1 [(gogoproto.nullable) = false]; +} + +// Binlogs are stored in a number of sequential files in a directory. +// The Pos describes the position of a binlog. +message Pos { + // The suffix of the binlog file, like .000001, .000002 + uint64 suffix = 1; + + // The binlog offset in a file. + int64 offset = 2; +} + +message Entity { + // The position of the binlog entity. + Pos pos = 1 [(gogoproto.nullable) = false]; + + // The payload of the binlog entity. + bytes payload = 2; + + // Checksum of the binlog payload. + bytes checksum = 3; +} diff --git a/src/main/proto/checksum.proto b/src/main/proto/checksum.proto new file mode 100644 index 00000000000..b517eb58b06 --- /dev/null +++ b/src/main/proto/checksum.proto @@ -0,0 +1,33 @@ +syntax = "proto2"; + +package tipb; + +option java_multiple_files = true; +option java_package = "com.pingcap.tidb.tipb"; + +import "gogoproto/gogo.proto"; + +option (gogoproto.sizer_all) = true; +option (gogoproto.marshaler_all) = true; +option (gogoproto.unmarshaler_all) = true; + +enum ChecksumScanOn { + Table = 0; + Index = 1; +} + +enum ChecksumAlgorithm { + Crc64_Xor = 0; +} + +message ChecksumRequest { + optional uint64 start_ts = 1 [(gogoproto.nullable) = false]; + optional ChecksumScanOn scan_on = 2 [(gogoproto.nullable) = false]; + optional ChecksumAlgorithm algorithm = 3 [(gogoproto.nullable) = false]; +} + +message ChecksumResponse { + optional uint64 checksum = 1 [(gogoproto.nullable) = false]; + optional uint64 total_kvs = 2 [(gogoproto.nullable) = false]; + optional uint64 total_bytes = 3 [(gogoproto.nullable) = false]; +} \ No newline at end of file diff --git a/src/main/proto/coprocessor.proto b/src/main/proto/coprocessor.proto new file mode 100644 index 00000000000..fe1d5c33a7a --- /dev/null +++ b/src/main/proto/coprocessor.proto @@ -0,0 +1,34 @@ +syntax = "proto3"; +package coprocessor; + +import "errorpb.proto"; +import "kvrpcpb.proto"; +import "gogoproto/gogo.proto"; + +option (gogoproto.marshaler_all) = true; +option (gogoproto.sizer_all) = true; +option (gogoproto.unmarshaler_all) = true; + +option java_package = "org.tikv.kvproto"; + +// [start, end) +message KeyRange { + bytes start = 1; + bytes end = 2; +} + +message Request { + kvrpcpb.Context context = 1; + int64 tp = 2; + bytes data = 3; + repeated KeyRange ranges = 4; +} + +message Response { + bytes data = 1 [(gogoproto.customtype) = "github.com/pingcap/tipb/sharedbytes.SharedBytes", (gogoproto.nullable) = false]; + errorpb.Error region_error = 2; + kvrpcpb.LockInfo locked = 3; + string other_error = 4; + KeyRange range = 5; + kvrpcpb.ExecDetails exec_details = 6; +} diff --git a/src/main/proto/debugpb.proto b/src/main/proto/debugpb.proto new file mode 100644 index 00000000000..95fe5bc0b96 --- /dev/null +++ b/src/main/proto/debugpb.proto @@ -0,0 +1,228 @@ +syntax = "proto3"; +package debugpb; + +import "eraftpb.proto"; +import "kvrpcpb.proto"; +import "raft_serverpb.proto"; +import "gogoproto/gogo.proto"; + +option (gogoproto.sizer_all) = true; +option (gogoproto.marshaler_all) = true; +option (gogoproto.unmarshaler_all) = true; + +option java_package = "org.tikv.kvproto"; + +// Debug service for TiKV. +// +// Errors are defined as follows: +// - OK: Okay, we are good! +// - UNKNOWN: For unknown error. +// - INVALID_ARGUMENT: Something is wrong within the request.
+// - NOT_FOUND: The key or region is not found; depending on the context, the detailed +// reason can be found in the grpc message. +// Note: It bypasses the raft layer. +service Debug { + // Read the value of a key arbitrarily. + // Note: Server uses the key directly w/o any encoding. + rpc Get(GetRequest) returns (GetResponse) {} + + // Read raft info. + rpc RaftLog(RaftLogRequest) returns (RaftLogResponse) {} + rpc RegionInfo(RegionInfoRequest) returns (RegionInfoResponse) {} + + // Calculate the size of a region. + // Note: DO NOT CALL IT IN PRODUCTION, it's really expensive. + rpc RegionSize(RegionSizeRequest) returns (RegionSizeResponse) {} + + // Scan a specific range. + // Note: DO NOT CALL IT IN PRODUCTION, it's really expensive. + // Server uses keys directly w/o any encoding. + rpc ScanMvcc(ScanMvccRequest) returns (stream ScanMvccResponse) {} + + // Compact a column family in a specified range. + // Note: Server uses keys directly w/o any encoding. + rpc Compact(CompactRequest) returns (CompactResponse) {} + + // Inject a fail point. Currently, it's only used in tests. + // Note: DO NOT CALL IT IN PRODUCTION. + rpc InjectFailPoint(InjectFailPointRequest) returns (InjectFailPointResponse) {} + // Recover from a fail point. + rpc RecoverFailPoint(RecoverFailPointRequest) returns (RecoverFailPointResponse) {} + // List all fail points. + rpc ListFailPoints(ListFailPointsRequest) returns (ListFailPointsResponse) {} + + // Get metrics. + rpc GetMetrics(GetMetricsRequest) returns (GetMetricsResponse){} + + // Do a consistency check for a region. + rpc CheckRegionConsistency(RegionConsistencyCheckRequest) returns (RegionConsistencyCheckResponse) {} + + // Dynamically modify tikv's config. + rpc ModifyTikvConfig(ModifyTikvConfigRequest) returns (ModifyTikvConfigResponse) {} + + // Get region properties. + rpc GetRegionProperties(GetRegionPropertiesRequest) returns (GetRegionPropertiesResponse) {} +} + +enum DB { + INVALID = 0; + KV = 1; + RAFT = 2; +} + +enum MODULE { + UNUSED = 0; + KVDB = 1; + RAFTDB = 2; + READPOOL = 3; + SERVER = 4; + STORAGE = 5; + PD = 6; + METRIC = 7; + COPROCESSOR = 8; + SECURITY = 9; + IMPORT = 10; +} + +message GetRequest { + DB db = 1; + string cf = 2; + bytes key = 3; +} + +message GetResponse { + bytes value = 1; +} + +message RaftLogRequest { + uint64 region_id = 1; + uint64 log_index = 2; +} + +message RaftLogResponse { + eraftpb.Entry entry = 1; +} + +message RegionInfoRequest { + uint64 region_id = 1; +} + +message RegionInfoResponse { + raft_serverpb.RaftLocalState raft_local_state = 1; + raft_serverpb.RaftApplyState raft_apply_state = 2; + raft_serverpb.RegionLocalState region_local_state = 3; +} + +message RegionSizeRequest { + uint64 region_id = 1; + repeated string cfs = 2; +} + +message RegionSizeResponse { + message Entry { + string cf = 1; + uint64 size = 2; + } + + repeated Entry entries = 1; +} + +message ScanMvccRequest { + bytes from_key = 1; + bytes to_key = 2; + uint64 limit = 3; +} + +message ScanMvccResponse { + bytes key = 1; + kvrpcpb.MvccInfo info = 2; +} + +enum BottommostLevelCompaction { + // Skip bottommost level compaction + Skip = 0; + // Force bottommost level compaction + Force = 1; + // Compact the bottommost level if there is a compaction filter.
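+ // (i.e. the bottommost level is compacted only when a compaction filter is configured; + // otherwise it is skipped)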
+ IfHaveCompactionFilter = 2; +} + +message CompactRequest { + DB db = 1; + string cf = 2; + bytes from_key = 3; + bytes to_key = 4; + uint32 threads = 5; + BottommostLevelCompaction bottommost_level_compaction = 6; +} + +message CompactResponse { +} + +message InjectFailPointRequest { + string name = 1; + string actions = 2; +} + +message InjectFailPointResponse { +} + +message RecoverFailPointRequest { + string name = 1; +} + +message RecoverFailPointResponse { +} + +message ListFailPointsRequest { +} + +message ListFailPointsResponse { + message Entry { + string name = 1; + string actions = 2; + } + + repeated Entry entries = 1; +} + +message GetMetricsRequest { + bool all = 1; +} + +message GetMetricsResponse { + string prometheus = 1; + string rocksdb_kv = 2; + string rocksdb_raft = 3; + string jemalloc = 4; + uint64 store_id = 5; +} + +message RegionConsistencyCheckRequest { + uint64 region_id = 1; +} + +message RegionConsistencyCheckResponse { +} + +message ModifyTikvConfigRequest { + MODULE module = 1; + string config_name = 2; + string config_value = 3; +} + +message ModifyTikvConfigResponse { +} + +message Property { + string name = 1; + string value = 2; +} + +message GetRegionPropertiesRequest { + uint64 region_id = 1; +} + +message GetRegionPropertiesResponse { + repeated Property props = 1; +} diff --git a/src/main/proto/eraftpb.proto b/src/main/proto/eraftpb.proto new file mode 100644 index 00000000000..e7c25f50c6a --- /dev/null +++ b/src/main/proto/eraftpb.proto @@ -0,0 +1,101 @@ +syntax = "proto3"; +package eraftpb; + +enum EntryType { + EntryNormal = 0; + EntryConfChange = 1; +} + +// The entry is a type of change that needs to be applied. It contains two data fields. +// While the fields are built into the model, their usage is determined by the entry_type. +// +// For normal entries, the data field should contain the data change that should be applied. +// The context field can be used for any contextual data that might be relevant to the +// application of the data. +// +// For configuration changes, the data will contain the ConfChange message and the +// context will provide anything needed to assist the configuration change. The context +// is for the user to set and use in this case. +message Entry { + EntryType entry_type = 1; + uint64 term = 2; + uint64 index = 3; + bytes data = 4; + bytes context = 6; + + // Deprecated! It is kept for backward compatibility. + // TODO: remove it in the next major release.
+ bool sync_log = 5; +} + +message SnapshotMetadata { + ConfState conf_state = 1; + uint64 index = 2; + uint64 term = 3; +} + +message Snapshot { + bytes data = 1; + SnapshotMetadata metadata = 2; +} + +enum MessageType { + MsgHup = 0; + MsgBeat = 1; + MsgPropose = 2; + MsgAppend = 3; + MsgAppendResponse = 4; + MsgRequestVote = 5; + MsgRequestVoteResponse = 6; + MsgSnapshot = 7; + MsgHeartbeat = 8; + MsgHeartbeatResponse = 9; + MsgUnreachable = 10; + MsgSnapStatus = 11; + MsgCheckQuorum = 12; + MsgTransferLeader = 13; + MsgTimeoutNow = 14; + MsgReadIndex = 15; + MsgReadIndexResp = 16; + MsgRequestPreVote = 17; + MsgRequestPreVoteResponse = 18; +} + +message Message { + MessageType msg_type = 1; + uint64 to = 2; + uint64 from = 3; + uint64 term = 4; + uint64 log_term = 5; + uint64 index = 6; + repeated Entry entries = 7; + uint64 commit = 8; + Snapshot snapshot = 9; + bool reject = 10; + uint64 reject_hint = 11; + bytes context = 12; +} + +message HardState { + uint64 term = 1; + uint64 vote = 2; + uint64 commit = 3; +} + +message ConfState { + repeated uint64 nodes = 1; + repeated uint64 learners = 2; +} + +enum ConfChangeType { + AddNode = 0; + RemoveNode = 1; + AddLearnerNode = 2; +} + +message ConfChange { + uint64 id = 1; + ConfChangeType change_type = 2; + uint64 node_id = 3; + bytes context = 4; +} diff --git a/src/main/proto/errorpb.proto b/src/main/proto/errorpb.proto new file mode 100644 index 00000000000..3e18a77cee9 --- /dev/null +++ b/src/main/proto/errorpb.proto @@ -0,0 +1,61 @@ +syntax = "proto3"; +package errorpb; + +import "metapb.proto"; +import "gogoproto/gogo.proto"; + +option (gogoproto.marshaler_all) = true; +option (gogoproto.sizer_all) = true; +option (gogoproto.unmarshaler_all) = true; + +option java_package = "org.tikv.kvproto"; + +message NotLeader { + uint64 region_id = 1; + metapb.Peer leader = 2; +} + +message StoreNotMatch { + uint64 request_store_id = 1; + uint64 actual_store_id = 2; +} + +message RegionNotFound { + uint64 region_id = 1; +} + +message KeyNotInRegion { + bytes key = 1; + uint64 region_id = 2; + bytes start_key = 3; + bytes end_key = 4; +} + +message StaleEpoch { + repeated metapb.Region new_regions = 1; +} + +message ServerIsBusy { + string reason = 1; + uint64 backoff_ms = 2; +} + +message StaleCommand { +} + +message RaftEntryTooLarge { + uint64 region_id = 1; + uint64 entry_size = 2; +} + +message Error { + string message = 1; + NotLeader not_leader = 2; + RegionNotFound region_not_found = 3; + KeyNotInRegion key_not_in_region = 4; + StaleEpoch stale_epoch = 5; + ServerIsBusy server_is_busy = 6; + StaleCommand stale_command = 7; + StoreNotMatch store_not_match = 8; + RaftEntryTooLarge raft_entry_too_large = 9; +} diff --git a/src/main/proto/executor.proto b/src/main/proto/executor.proto new file mode 100644 index 00000000000..4ade5a39434 --- /dev/null +++ b/src/main/proto/executor.proto @@ -0,0 +1,80 @@ +syntax = "proto2"; + +package tipb; + +option java_multiple_files = true; +option java_package = "com.pingcap.tidb.tipb"; + +import "expression.proto"; +import "schema.proto"; +import "gogoproto/gogo.proto"; + +option (gogoproto.marshaler_all) = true; +option (gogoproto.sizer_all) = true; +option (gogoproto.unmarshaler_all) = true; + +enum ExecType { + TypeTableScan = 0; + TypeIndexScan = 1; + TypeSelection = 2; + TypeAggregation = 3; // TODO: Rename it to hash aggregation after stream aggregation is supported in TiKV. + TypeTopN = 4; + TypeLimit = 5; + TypeStreamAgg = 6; +} + +// It represents an Executor.
+message Executor { + optional ExecType tp = 1 [(gogoproto.nullable) = false]; + optional TableScan tbl_scan = 2; + optional IndexScan idx_scan = 3; + optional Selection selection = 4; + optional Aggregation aggregation = 5; + optional TopN topN = 6; + optional Limit limit = 7; + optional Aggregation stream_agg = 8; +} + +message TableScan { + optional int64 table_id = 1 [(gogoproto.nullable) = false]; + repeated ColumnInfo columns = 2; + optional bool desc = 3 [(gogoproto.nullable) = false]; +} + +message IndexScan { + optional int64 table_id = 1 [(gogoproto.nullable) = false]; + optional int64 index_id = 2 [(gogoproto.nullable) = false]; + repeated ColumnInfo columns = 3; + optional bool desc = 4 [(gogoproto.nullable) = false]; + optional bool unique = 5; // check whether it is a unique index. +} + +message Selection { + // Where conditions. + repeated Expr conditions = 1; +} + +message Projection { + // Projection expressions. + repeated Expr exprs = 1; +} + +message Aggregation { + // Group by clause. + repeated Expr group_by = 1; + // Aggregate functions. + repeated Expr agg_func = 2; + // If it is a stream aggregation. + optional bool streamed = 3 [(gogoproto.nullable) = false]; +} + +message TopN { + // Order by clause. + repeated ByItem order_by = 1; + optional uint64 limit = 2 [(gogoproto.nullable) = false]; +} + +message Limit { + // Limit the result to be returned. + optional uint64 limit = 1 [(gogoproto.nullable) = false]; +} diff --git a/src/main/proto/expression.proto b/src/main/proto/expression.proto new file mode 100644 index 00000000000..d71aa99b319 --- /dev/null +++ b/src/main/proto/expression.proto @@ -0,0 +1,613 @@ +syntax = "proto2"; + +package tipb; + +option java_multiple_files = true; +option java_package = "com.pingcap.tidb.tipb"; + +import "gogoproto/gogo.proto"; + +option (gogoproto.marshaler_all) = true; +option (gogoproto.sizer_all) = true; +option (gogoproto.unmarshaler_all) = true; + +message FieldType { + optional int32 tp = 1 [(gogoproto.nullable) = false]; + optional uint32 flag = 2 [(gogoproto.nullable) = false]; + optional int32 flen = 3 [(gogoproto.nullable) = false]; + optional int32 decimal = 4 [(gogoproto.nullable) = false]; + optional int32 collate = 5 [(gogoproto.nullable) = false]; + optional string charset = 6 [(gogoproto.nullable) = false]; +} + +enum ExprType { + /* Children count 0. */ + // Values are encoded bytes. + Null = 0; + Int64 = 1; + Uint64 = 2; + Float32 = 3; + Float64 = 4; + String = 5; + Bytes = 6; + + // Mysql specific types. + MysqlBit = 101; + MysqlDecimal = 102; + MysqlDuration = 103; + MysqlEnum = 104; + MysqlHex = 105; + MysqlSet = 106; + MysqlTime = 107; + MysqlJson = 108; + + // Encoded value list. + ValueList = 151; + + // Column reference. Value is an int64 column ID. + ColumnRef = 201; + + /* Mysql functions, children count is function specific. */ + // Aggregate functions.
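+ // An aggregate function is encoded as an Expr whose tp is one of the values below and whose + // children are its arguments; e.g. tp = Count with a single ColumnRef child encodes COUNT(col) + // (illustrative example; see the Expr message at the end of this file).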
+ Count = 3001; + Sum = 3002; + Avg = 3003; + Min = 3004; + Max = 3005; + First = 3006; + GroupConcat = 3007; + Agg_BitAnd = 3008; + Agg_BitOr = 3009; + Agg_BitXor = 3010; + Std = 3011; + Stddev = 3012; + StddevPop = 3013; + StddevSamp = 3014; + VarPop = 3015; + VarSamp = 3016; + Variance = 3017; + JsonArrayAgg = 3018; + JsonObjectAgg = 3019; + + /* Scalar Function */ + ScalarFunc = 10000; +} + +enum ScalarFuncSig { + /* Casting */ + CastIntAsInt = 0; + CastIntAsReal = 1; + CastIntAsString = 2; + CastIntAsDecimal = 3; + CastIntAsTime = 4; + CastIntAsDuration = 5; + CastIntAsJson = 6; + + CastRealAsInt = 10; + CastRealAsReal = 11; + CastRealAsString = 12; + CastRealAsDecimal = 13; + CastRealAsTime = 14; + CastRealAsDuration = 15; + CastRealAsJson = 16; + + CastDecimalAsInt = 20; + CastDecimalAsReal = 21; + CastDecimalAsString = 22; + CastDecimalAsDecimal = 23; + CastDecimalAsTime = 24; + CastDecimalAsDuration = 25; + CastDecimalAsJson = 26; + + CastStringAsInt = 30; + CastStringAsReal = 31; + CastStringAsString = 32; + CastStringAsDecimal = 33; + CastStringAsTime = 34; + CastStringAsDuration = 35; + CastStringAsJson = 36; + + CastTimeAsInt = 40; + CastTimeAsReal = 41; + CastTimeAsString = 42; + CastTimeAsDecimal = 43; + CastTimeAsTime = 44; + CastTimeAsDuration = 45; + CastTimeAsJson = 46; + + CastDurationAsInt = 50; + CastDurationAsReal = 51; + CastDurationAsString = 52; + CastDurationAsDecimal = 53; + CastDurationAsTime = 54; + CastDurationAsDuration = 55; + CastDurationAsJson = 56; + + CastJsonAsInt = 60; + CastJsonAsReal = 61; + CastJsonAsString = 62; + CastJsonAsDecimal = 63; + CastJsonAsTime = 64; + CastJsonAsDuration = 65; + CastJsonAsJson = 66; + + /*compare*/ + CoalesceInt = 4201; + CoalesceReal = 4202; + CoalesceDecimal = 4203; + CoalesceString = 4204; + CoalesceTime = 4205; + CoalesceDuration = 4206; + // unimplemented in tidb + CoalesceJson = 4207; + LTInt = 100; + LTReal = 101; + LTDecimal = 102; + LTString = 103; + LTTime = 104; + LTDuration = 105; + LTJson = 106; + LEInt = 110; + LEReal = 111; + LEDecimal = 112; + LEString = 113; + LETime = 114; + LEDuration = 115; + LEJson = 116; + GTInt = 120; + GTReal = 121; + GTDecimal = 122; + GTString = 123; + GTTime = 124; + GTDuration = 125; + GTJson = 126; + GreatestInt = 4215; + GreatestReal = 4216; + GreatestDecimal = 4217; + GreatestString = 4218; + GreatestTime = 4219; + LeastInt = 4220; + LeastReal = 4221; + LeastDecimal = 4222; + LeastString = 4223; + LeastTime = 4224; + IntervalInt = 4225; + IntervalReal = 4226; + GEInt = 130; + GEReal = 131; + GEDecimal = 132; + GEString = 133; + GETime = 134; + GEDuration = 135; + GEJson = 136; + EQInt = 140; + EQReal = 141; + EQDecimal = 142; + EQString = 143; + EQTime = 144; + EQDuration = 145; + EQJson = 146; + NEInt = 150; + NEReal = 151; + NEDecimal = 152; + NEString = 153; + NETime = 154; + NEDuration = 155; + NEJson = 156; + NullEQInt = 160; + NullEQReal = 161; + NullEQDecimal = 162; + NullEQString = 163; + NullEQTime = 164; + NullEQDuration = 165; + NullEQJson = 166; + + /*arithmetic*/ + PlusReal = 200; + PlusDecimal = 201; + PlusInt = 203; + MinusReal = 204; + MinusDecimal = 205; + MinusInt = 207; + MultiplyReal = 208; + MultiplyDecimal = 209; + MultiplyInt = 210; + DivideReal = 211; + DivideDecimal = 212; + IntDivideInt = 213; + IntDivideDecimal = 214; + ModReal = 215; + ModDecimal = 216; + ModInt = 217; + MultiplyIntUnsigned = 218; + + /*math*/ + AbsInt = 2101; + AbsUInt = 2102; + AbsReal = 2103; + AbsDecimal = 2104; + CeilIntToDec = 2105; + CeilIntToInt = 2106; + CeilDecToInt = 
2107; + CeilDecToDec = 2108; + CeilReal = 2109; + FloorIntToDec = 2110; + FloorIntToInt = 2111; + FloorDecToInt = 2112; + FloorDecToDec = 2113; + FloorReal = 2114; + RoundReal = 2121; + RoundInt = 2122; + RoundDec = 2123; + RoundWithFracReal = 2124; + RoundWithFracInt = 2125; + RoundWithFracDec = 2126; + Log1Arg = 2131; + Log2Args = 2132; + Log2 = 2133; + Log10 = 2134; + Rand = 2135; + RandWithSeed = 2136; + Pow = 2137; + Conv = 2138; + CRC32 = 2139; + Sign = 2140; + Sqrt = 2141; + Acos = 2142; + Asin = 2143; + Atan1Arg = 2144; + Atan2Args = 2145; + Cos = 2146; + Cot = 2147; + Degrees = 2148; + Exp = 2149; + PI = 2150; + Radians = 2151; + Sin = 2152; + Tan = 2153; + TruncateInt = 2154; + TruncateReal = 2155; + TruncateDecimal = 2156; + + /*op*/ + LogicalAnd = 3101; + LogicalOr = 3102; + LogicalXor = 3103; + UnaryNot = 3104; + UnaryMinusInt = 3108; + UnaryMinusReal = 3109; + UnaryMinusDecimal = 3110; + DecimalIsNull = 3111; + DurationIsNull = 3112; + RealIsNull = 3113; + StringIsNull = 3114; + TimeIsNull = 3115; + IntIsNull = 3116; + // unimplemented in tidb + JsonIsNull = 3117; + BitAndSig = 3118; + BitOrSig = 3119; + BitXorSig = 3120; + BitNegSig = 3121; + IntIsTrue = 3122; + RealIsTrue = 3123; + DecimalIsTrue = 3124; + IntIsFalse = 3125; + RealIsFalse = 3126; + DecimalIsFalse = 3127; + LeftShift = 3129; + RightShift = 3130; + + /*other*/ + BitCount = 3128; + GetParamString = 3131; + GetVar = 3132; + RowSig = 3133; + SetVar = 3134; + ValuesDecimal = 3135; + ValuesDuration = 3136; + ValuesInt = 3137; + ValuesJSON = 3138; + ValuesReal = 3139; + ValuesString = 3140; + ValuesTime = 3141; + InInt = 4001; + InReal = 4002; + InDecimal = 4003; + InString = 4004; + InTime = 4005; + InDuration = 4006; + InJson = 4007; + + /*control*/ + IfNullInt = 4101; + IfNullReal = 4102; + IfNullDecimal = 4103; + IfNullString = 4104; + IfNullTime = 4105; + IfNullDuration = 4106; + IfInt = 4107; + IfReal = 4108; + IfDecimal = 4109; + IfString = 4110; + IfTime = 4111; + IfDuration = 4112; + IfNullJson = 4113; + IfJson = 4114; + CaseWhenInt = 4208; + CaseWhenReal = 4209; + CaseWhenDecimal = 4210; + CaseWhenString = 4211; + CaseWhenTime = 4212; + CaseWhenDuration = 4213; + // unimplemented in tidb + CaseWhenJson = 4214; + + /* encryption */ + AesDecrypt = 4501; + AesEncrypt = 4502; + Compress = 4503; + MD5 = 4504; + Password = 4505; + RandomBytes = 4506; + SHA1 = 4507; + SHA2 = 4508; + Uncompress = 4509; + UncompressedLength = 4510; + + /*info*/ + Database = 4521; + FoundRows = 4522; + CurrentUser = 4523; + User = 4524; + ConnectionID = 4525; + LastInsertID = 4526; + LastInsertIDWithID = 4527; + Version = 4528; + TiDBVersion = 4529; + RowCount = 4530; + + /*miscellaneous*/ + Sleep = 4551; + Lock = 4552; + ReleaseLock = 4553; + DecimalAnyValue = 4554; + DurationAnyValue = 4555; + IntAnyValue = 4556; + JSONAnyValue = 4557; + RealAnyValue = 4558; + StringAnyValue = 4559; + TimeAnyValue = 4560; + InetAton = 4561; + InetNtoa = 4562; + Inet6Aton = 4563; + Inet6Ntoa = 4564; + IsIPv4 = 4565; + IsIPv4Compat = 4566; + IsIPv4Mapped = 4567; + IsIPv6 = 4568; + UUID = 4569; + + /*like*/ + LikeSig = 4310; + RegexpBinarySig = 4311; + RegexpSig = 4312; + + /*json*/ + JsonExtractSig = 5001; + JsonUnquoteSig = 5002; + JsonTypeSig = 5003; + JsonSetSig = 5004; + JsonInsertSig = 5005; + JsonReplaceSig = 5006; + JsonRemoveSig = 5007; + JsonMergeSig = 5008; + JsonObjectSig = 5009; + JsonArraySig = 5010; + JsonValidJsonSig = 5011; + JsonContainsSig = 5012; + JsonArrayAppendSig = 5013; + JsonArrayInsertSig = 5014; + JsonMergePatchSig = 5015; 
+ JsonMergePreserveSig = 5016; + JsonContainsPathSig = 5017; + JsonPrettySig = 5018; + JsonQuoteSig = 5019; + JsonSearchSig = 5020; + JsonStorageSizeSig = 5021; + JsonDepthSig = 5022; + JsonKeysSig = 5023; + JsonLengthSig = 5024; + JsonKeys2ArgsSig = 5025; + JsonValidStringSig = 5026; + + /*time*/ + DateFormatSig = 6001; + DateLiteral = 6002; + DateDiff = 6003; + NullTimeDiff = 6004; + TimeStringTimeDiff = 6005; + DurationStringTimeDiff = 6006; + DurationDurationTimeDiff = 6007; + StringTimeTimeDiff = 6008; + StringDurationTimeDiff = 6009; + StringStringTimeDiff = 6010; + TimeTimeTimeDiff = 6011; + Date = 6012; + Hour = 6013; + Minute = 6014; + Second = 6015; + MicroSecond = 6016; + Month = 6017; + MonthName = 6018; + NowWithArg = 6019; + NowWithoutArg = 6020; + DayName = 6021; + DayOfMonth = 6022; + DayOfWeek = 6023; + DayOfYear = 6024; + WeekWithMode = 6025; + WeekWithoutMode = 6026; + WeekDay = 6027; + WeekOfYear = 6028; + Year = 6029; + YearWeekWithMode = 6030; + YearWeekWithoutMode = 6031; + GetFormat = 6032; + SysDateWithFsp = 6033; + SysDateWithoutFsp = 6034; + CurrentDate = 6035; + CurrentTime0Arg = 6036; + CurrentTime1Arg = 6037; + Time = 6038; + TimeLiteral = 6039; + UTCDate = 6040; + UTCTimestampWithArg = 6041; + UTCTimestampWithoutArg = 6042; + AddDatetimeAndDuration = 6043; + AddDatetimeAndString = 6044; + AddTimeDateTimeNull = 6045; + AddStringAndDuration = 6046; + AddStringAndString = 6047; + AddTimeStringNull = 6048; + AddDurationAndDuration = 6049; + AddDurationAndString = 6050; + AddTimeDurationNull = 6051; + AddDateAndDuration = 6052; + AddDateAndString = 6053; + SubDatetimeAndDuration = 6054; + SubDatetimeAndString = 6055; + SubTimeDateTimeNull = 6056; + SubStringAndDuration = 6057; + SubStringAndString = 6058; + SubTimeStringNull = 6059; + SubDurationAndDuration = 6060; + SubDurationAndString = 6061; + SubTimeDurationNull = 6062; + SubDateAndDuration = 6063; + SubDateAndString = 6064; + UnixTimestampCurrent = 6065; + UnixTimestampInt = 6066; + UnixTimestampDec = 6067; + ConvertTz = 6068; + MakeDate = 6069; + MakeTime = 6070; + PeriodAdd = 6071; + PeriodDiff = 6072; + Quarter = 6073; + SecToTime = 6074; + TimeToSec = 6075; + TimestampAdd = 6076; + ToDays = 6077; + ToSeconds = 6078; + UTCTimeWithArg = 6079; + UTCTimeWithoutArg = 6080; + Timestamp1Arg = 6081; + Timestamp2Args = 6082; + TimestampLiteral = 6083; + LastDay = 6084; + StrToDateDate = 6085; + StrToDateDatetime = 6086; + StrToDateDuration = 6087; + FromUnixTime1Arg = 6088; + FromUnixTime2Arg = 6089; + ExtractDatetime = 6090; + ExtractDuration = 6091; + AddDateStringString = 6092; + AddDateStringInt = 6093; + AddDateStringDecimal = 6094; + AddDateIntString = 6095; + AddDateIntInt = 6096; + AddDateDatetimeString = 6097; + AddDateDatetimeInt = 6098; + SubDateStringString = 6099; + SubDateStringInt = 6100; + SubDateStringDecimal = 6101; + SubDateIntString = 6102; + SubDateIntInt = 6103; + SubDateDatetimeString = 6104; + SubDateDatetimeInt = 6105; + FromDays = 6106; + TimeFormat = 6107; + TimestampDiff = 6108; + + /* String functions */ + BitLength = 7001; + Bin = 7002; + ASCII = 7003; + Char = 7004; + CharLength = 7005; + Concat = 7006; + ConcatWS = 7007; + Convert = 7008; + Elt = 7009; + ExportSet3Arg = 7010; + ExportSet4Arg = 7011; + ExportSet5Arg = 7012; + FieldInt = 7013; + FieldReal = 7014; + FieldString = 7015; + FindInSet = 7016; + Format = 7017; + FormatWithLocale = 7018; + FromBase64 = 7019; + HexIntArg = 7020; + HexStrArg = 7021; + Insert = 7022; + InsertBinary = 7023; + Instr = 7024; + InstrBinary = 7025; 
+ LTrim = 7026; + Left = 7027; + LeftBinary = 7028; + Length = 7029; + Locate2Args = 7030; + Locate3Args = 7031; + LocateBinary2Args = 7032; + LocateBinary3Args = 7033; + Lower = 7034; + Lpad = 7035; + LpadBinary = 7036; + MakeSet = 7037; + OctInt = 7038; + OctString = 7039; + Ord = 7040; + Quote = 7041; + RTrim = 7042; + Repeat = 7043; + Replace = 7044; + Reverse = 7045; + ReverseBinary = 7046; + Right = 7047; + RightBinary = 7048; + Rpad = 7049; + RpadBinary = 7050; + Space = 7051; + Strcmp = 7052; + Substring2Args = 7053; + Substring3Args = 7054; + SubstringBinary2Args = 7055; + SubstringBinary3Args = 7056; + SubstringIndex = 7057; + ToBase64 = 7058; + Trim1Arg = 7059; + Trim2Args = 7060; + Trim3Args = 7061; + UnHex = 7062; + Upper = 7063; +} + +// Evaluators should implement evaluation functions for every expression type. +message Expr { + optional ExprType tp = 1 [(gogoproto.nullable) = false]; + optional bytes val = 2; + repeated Expr children = 3; + optional ScalarFuncSig sig = 4 [(gogoproto.nullable) = false]; + optional FieldType field_type = 5; +} + +// ByItem type for group by and order by. +message ByItem { + optional Expr expr = 1; + optional bool desc = 2 [(gogoproto.nullable) = false]; +} diff --git a/src/main/proto/gogoproto/gogo.proto b/src/main/proto/gogoproto/gogo.proto new file mode 100644 index 00000000000..bc8d889f161 --- /dev/null +++ b/src/main/proto/gogoproto/gogo.proto @@ -0,0 +1,136 @@ +// Protocol Buffers for Go with Gadgets +// +// Copyright (c) 2013, The GoGo Authors. All rights reserved. +// http://github.com/gogo/protobuf +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +syntax = "proto2"; +package gogoproto; + +import "google/protobuf/descriptor.proto"; + +option java_package = "com.google.protobuf"; +option java_outer_classname = "GoGoProtos"; +option go_package = "github.com/gogo/protobuf/gogoproto"; + +extend google.protobuf.EnumOptions { + optional bool goproto_enum_prefix = 62001; + optional bool goproto_enum_stringer = 62021; + optional bool enum_stringer = 62022; + optional string enum_customname = 62023; + optional bool enumdecl = 62024; +} + +extend google.protobuf.EnumValueOptions { + optional string enumvalue_customname = 66001; +} + +extend google.protobuf.FileOptions { + optional bool goproto_getters_all = 63001; + optional bool goproto_enum_prefix_all = 63002; + optional bool goproto_stringer_all = 63003; + optional bool verbose_equal_all = 63004; + optional bool face_all = 63005; + optional bool gostring_all = 63006; + optional bool populate_all = 63007; + optional bool stringer_all = 63008; + optional bool onlyone_all = 63009; + + optional bool equal_all = 63013; + optional bool description_all = 63014; + optional bool testgen_all = 63015; + optional bool benchgen_all = 63016; + optional bool marshaler_all = 63017; + optional bool unmarshaler_all = 63018; + optional bool stable_marshaler_all = 63019; + + optional bool sizer_all = 63020; + + optional bool goproto_enum_stringer_all = 63021; + optional bool enum_stringer_all = 63022; + + optional bool unsafe_marshaler_all = 63023; + optional bool unsafe_unmarshaler_all = 63024; + + optional bool goproto_extensions_map_all = 63025; + optional bool goproto_unrecognized_all = 63026; + optional bool gogoproto_import = 63027; + optional bool protosizer_all = 63028; + optional bool compare_all = 63029; + optional bool typedecl_all = 63030; + optional bool enumdecl_all = 63031; + + optional bool goproto_registration = 63032; + optional bool messagename_all = 63033; +} + +extend google.protobuf.MessageOptions { + optional bool goproto_getters = 64001; + optional bool goproto_stringer = 64003; + optional bool verbose_equal = 64004; + optional bool face = 64005; + optional bool gostring = 64006; + optional bool populate = 64007; + optional bool stringer = 67008; + optional bool onlyone = 64009; + + optional bool equal = 64013; + optional bool description = 64014; + optional bool testgen = 64015; + optional bool benchgen = 64016; + optional bool marshaler = 64017; + optional bool unmarshaler = 64018; + optional bool stable_marshaler = 64019; + + optional bool sizer = 64020; + + optional bool unsafe_marshaler = 64023; + optional bool unsafe_unmarshaler = 64024; + + optional bool goproto_extensions_map = 64025; + optional bool goproto_unrecognized = 64026; + + optional bool protosizer = 64028; + optional bool compare = 64029; + + optional bool typedecl = 64030; + + optional bool messagename = 64033; +} + +extend google.protobuf.FieldOptions { + optional bool nullable = 65001; + optional bool embed = 65002; + optional string customtype = 65003; + optional string customname = 65004; + optional string jsontag = 65005; + optional string moretags = 65006; + optional string casttype = 65007; + optional string castkey = 65008; + optional string castvalue = 65009; + + optional bool stdtime = 65010; + optional bool stdduration = 65011; +} diff --git a/src/main/proto/import_kvpb.proto b/src/main/proto/import_kvpb.proto new file mode 100644 index 00000000000..952c9ae7dc0 --- /dev/null +++ b/src/main/proto/import_kvpb.proto @@ -0,0 +1,133 @@ +syntax = "proto3"; + +package import_kvpb; + +import 
"import_sstpb.proto"; +import "gogoproto/gogo.proto"; + +option (gogoproto.sizer_all) = true; +option (gogoproto.marshaler_all) = true; +option (gogoproto.unmarshaler_all) = true; + +option java_package = "org.tikv.kvproto"; + +// ImportKV provides a service to import key-value pairs to TiKV. +// +// In order to import key-value pairs to TiKV, the user should: +// 1. Open an engine identified by an UUID. +// 2. Open write streams to write key-value batches to the opened engine. +// Different streams/clients can write to the same engine concurrently. +// 3. Close the engine after all write batches have been finished. An +// engine can only be closed when all write streams are closed. An +// engine can only be closed once, and it can not be opened again +// once it is closed. +// 4. Import the data in the engine to the target cluster. Note that +// the import process is not atomic, it requires the data to be +// idempotent on retry. An engine can only be imported after it is +// closed. An engine can be imported multiple times, but can not be +// imported concurrently. +// 5. Clean up the engine after it has been imported. Delete all data +// in the engine. An engine can not be cleaned up when it is +// writing or importing. +service ImportKV { + // Switch the target cluster to normal/import mode. + rpc SwitchMode(SwitchModeRequest) returns (SwitchModeResponse) {} + // Open an engine. + rpc OpenEngine(OpenEngineRequest) returns (OpenEngineResponse) {} + // Open a write stream to the engine. + rpc WriteEngine(stream WriteEngineRequest) returns (WriteEngineResponse) {} + // Close the engine. + rpc CloseEngine(CloseEngineRequest) returns (CloseEngineResponse) {} + // Import the engine to the target cluster. + rpc ImportEngine(ImportEngineRequest) returns (ImportEngineResponse) {} + // Clean up the engine. + rpc CleanupEngine(CleanupEngineRequest) returns (CleanupEngineResponse) {} + // Compact the target cluster for better performance. + rpc CompactCluster(CompactClusterRequest) returns (CompactClusterResponse) {} +} + +message SwitchModeRequest { + string pd_addr = 1; + import_sstpb.SwitchModeRequest request = 2; +} + +message SwitchModeResponse { +} + +message OpenEngineRequest { + bytes uuid = 1; +} + +message OpenEngineResponse { +} + +message WriteHead { + bytes uuid = 1; +} + +message Mutation { + enum OP { + Put = 0; + } + OP op = 1; + bytes key = 2; + bytes value = 3; +} + +message WriteBatch { + uint64 commit_ts = 1; + repeated Mutation mutations = 2; +} + +message WriteEngineRequest { + oneof chunk { + WriteHead head = 1; + WriteBatch batch = 2; + } +} + +message WriteEngineResponse { + Error error = 1; +} + +message CloseEngineRequest { + bytes uuid = 1; +} + +message CloseEngineResponse { + Error error = 1; +} + +message ImportEngineRequest { + bytes uuid = 1; + string pd_addr = 2; +} + +message ImportEngineResponse { +} + +message CleanupEngineRequest { + bytes uuid = 1; +} + +message CleanupEngineResponse { +} + +message CompactClusterRequest { + string pd_addr = 1; + import_sstpb.CompactRequest request = 2; +} + +message CompactClusterResponse { +} + +message Error { + message EngineNotFound { + bytes uuid = 1; + } + // This can happen if the client hasn't opened the engine, or the server + // restarts while the client is writing or closing. An unclosed engine will + // be removed on server restart, so the client should not continue but + // restart the previous job in that case. 
+ EngineNotFound engine_not_found = 1; +} diff --git a/src/main/proto/import_sstpb.proto b/src/main/proto/import_sstpb.proto new file mode 100644 index 00000000000..1e214186432 --- /dev/null +++ b/src/main/proto/import_sstpb.proto @@ -0,0 +1,97 @@ +syntax = "proto3"; + +package import_sstpb; + +import "metapb.proto"; +import "errorpb.proto"; +import "kvrpcpb.proto"; +import "gogoproto/gogo.proto"; + +option (gogoproto.sizer_all) = true; +option (gogoproto.marshaler_all) = true; +option (gogoproto.unmarshaler_all) = true; + +option java_package = "org.tikv.kvproto"; + +// ImportSST provides a service to import a generated SST file to a region in TiKV. +// +// In order to import an SST file to a region, the user should: +// 1. Retrieve the meta of the region according to the SST file's range. +// 2. Upload the SST file to the servers where the region's peers are located. +// 3. Issue an ingest request to the region's leader with the SST file's metadata. +// +// It's the user's responsibility to make sure that the SST file is uploaded to +// the servers where the region's peers are located, before issuing the ingest +// request to the region's leader. However, the region can be scheduled (so the +// location of the region's peers will be changed) or split/merged (so the range +// of the region will be changed), after the SST file is uploaded, but before +// the SST file is ingested. So, the region's epoch is provided in the SST +// file's metadata, to guarantee that the region's epoch is the same when the +// SST file is uploaded and when it is ingested later. +service ImportSST { + // Switch to normal/import mode. + rpc SwitchMode(SwitchModeRequest) returns (SwitchModeResponse) {} + // Upload an SST file to a server. + rpc Upload(stream UploadRequest) returns (UploadResponse) {} + // Ingest an uploaded SST file to a region. + rpc Ingest(IngestRequest) returns (IngestResponse) {} + // Compact the specific range for better performance. + rpc Compact(CompactRequest) returns (CompactResponse) {} +} + +enum SwitchMode { + Normal = 0; + Import = 1; +} + +message SwitchModeRequest { + SwitchMode mode = 1; +} + +message SwitchModeResponse { +} + +message Range { + bytes start = 1; + bytes end = 2; +} + +message SSTMeta { + bytes uuid = 1; + Range range = 2; + uint32 crc32 = 3; + uint64 length = 4; + string cf_name = 5; + uint64 region_id = 6; + metapb.RegionEpoch region_epoch = 7; +} + +message UploadRequest { + oneof chunk { + SSTMeta meta = 1; + bytes data = 2; + } +} + +message UploadResponse { +} + +message IngestRequest { + kvrpcpb.Context context = 1; + SSTMeta sst = 2; +} + +message IngestResponse { + errorpb.Error error = 1; +} + +message CompactRequest { + // Compact files in the range and above the output level. + // Compact all files if the range is not specified. + // Compact all files to the bottommost level if the output level is -1.
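+ // For example, a request with range unset and output_level = -1 compacts the whole key + // space to the bottommost level.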
+ Range range = 1; + int32 output_level = 2; +} + +message CompactResponse { +} diff --git a/src/main/proto/kvrpcpb.proto b/src/main/proto/kvrpcpb.proto new file mode 100644 index 00000000000..03f575f0dd5 --- /dev/null +++ b/src/main/proto/kvrpcpb.proto @@ -0,0 +1,439 @@ +syntax = "proto3"; +package kvrpcpb; + +import "metapb.proto"; +import "errorpb.proto"; +import "gogoproto/gogo.proto"; + +option (gogoproto.marshaler_all) = true; +option (gogoproto.sizer_all) = true; +option (gogoproto.unmarshaler_all) = true; + +option java_package = "org.tikv.kvproto"; + +message LockInfo { + bytes primary_lock = 1; + uint64 lock_version = 2; + bytes key = 3; + uint64 lock_ttl = 4; +} + +message KeyError { + LockInfo locked = 1; // Client should back off or clean up the lock, then retry. + string retryable = 2; // Client may restart the txn, e.g. on write conflict. + string abort = 3; // Client should abort the txn. + WriteConflict conflict = 4; // Write conflict is moved from retryable to here. +} + +message WriteConflict { + uint64 start_ts = 1; + uint64 conflict_ts = 2; + bytes key = 3; + bytes primary = 4; +} + +enum CommandPri { + Normal = 0; // Normal must be the default value + Low = 1; + High = 2; +} + +enum IsolationLevel { + SI = 0; // SI = snapshot isolation + RC = 1; // RC = read committed +} + +message Context { + reserved 4; + reserved "read_quorum"; + uint64 region_id = 1; + metapb.RegionEpoch region_epoch = 2; + metapb.Peer peer = 3; + uint64 term = 5; + CommandPri priority = 6; + IsolationLevel isolation_level = 7; + bool not_fill_cache = 8; + bool sync_log = 9; + bool handle_time = 10; // true means return handle time detail + bool scan_detail = 11; // true means return scan cf's detail +} + +message HandleTime { + int64 wait_ms = 1; // time in queue + int64 process_ms = 2; // process time without wait time. +} + +message ScanInfo { + int64 total = 1; // total count + int64 processed = 2; // processed count +} + +message ScanDetail { + ScanInfo write = 1; + ScanInfo lock = 2; + ScanInfo data = 3; +} + +message ExecDetails { + HandleTime handle_time = 1; // set when ctx.handle_time = true or a slow query is met + ScanDetail scan_detail = 2; // set when ctx.scan_detail = true or a slow query is met +} + +message GetRequest { + Context context = 1; + bytes key = 2; + uint64 version = 3; +} + +message GetResponse { + errorpb.Error region_error = 1; + KeyError error = 2; + bytes value = 3; +} + +message ScanRequest { + Context context = 1; + bytes start_key = 2; + uint32 limit = 3; + uint64 version = 4; + bool key_only = 5; + bool reverse = 6; + // For compatibility, when scanning forward, the range to scan is [start_key, end_key), where start_key < end_key; + // and when scanning backward, it scans [end_key, start_key) in descending order, where end_key < start_key.
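+ // Example: start_key = "a", end_key = "c", reverse = false returns keys in ["a", "c") in + // ascending order; start_key = "c", end_key = "a", reverse = true returns keys in ["a", "c") + // in descending order.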
+ bytes end_key = 7; +} + +message KvPair { + KeyError error = 1; + bytes key = 2; + bytes value = 3; +} + +message ScanResponse { + errorpb.Error region_error = 1; + repeated KvPair pairs = 2; +} + +enum Op { + Put = 0; + Del = 1; + Lock = 2; + Rollback = 3; +} + +message Mutation { + Op op = 1; + bytes key = 2; + bytes value = 3; +} + +message PrewriteRequest { + Context context = 1; + repeated Mutation mutations = 2; + // primary_lock_key + bytes primary_lock = 3; + uint64 start_version = 4; + uint64 lock_ttl = 5; + bool skip_constraint_check = 6; +} + +message PrewriteResponse { + errorpb.Error region_error = 1; + repeated KeyError errors = 2; +} + +message CommitRequest { + reserved 5; + reserved "binlog"; + Context context = 1; + uint64 start_version = 2; + repeated bytes keys = 3; + uint64 commit_version = 4; +} + +message CommitResponse { + errorpb.Error region_error = 1; + KeyError error = 2; +} + +message ImportRequest { + repeated Mutation mutations = 1; + uint64 commit_version = 2; +} + +message ImportResponse { + errorpb.Error region_error = 1; + string error = 2; +} + +message BatchRollbackRequest { + Context context = 1; + uint64 start_version = 2; + repeated bytes keys = 3; +} + +message BatchRollbackResponse { + errorpb.Error region_error = 1; + KeyError error = 2; +} + +message CleanupRequest { + Context context = 1; + bytes key = 2; + uint64 start_version = 3; +} + +message CleanupResponse { + errorpb.Error region_error = 1; + KeyError error = 2; + uint64 commit_version = 3; // set this if the key is already committed +} + +message BatchGetRequest { + Context context = 1; + repeated bytes keys = 2; + uint64 version = 3; +} + +message BatchGetResponse { + errorpb.Error region_error = 1; + repeated KvPair pairs = 2; +} + +message ScanLockRequest { + Context context = 1; + uint64 max_version = 2; + bytes start_key = 3; + uint32 limit = 4; +} + +message ScanLockResponse { + errorpb.Error region_error = 1; + KeyError error = 2; + repeated LockInfo locks = 3; +} + +message TxnInfo { + uint64 txn = 1; + uint64 status = 2; +} + +message ResolveLockRequest { + Context context = 1; + uint64 start_version = 2; + // If the txn is rolled back, do not set it. 
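+ // (a commit_version of 0 is therefore read as a rollback: the matched locks are rolled back + // rather than committed)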
+ uint64 commit_version = 3; + repeated TxnInfo txn_infos = 4; +} + +message ResolveLockResponse { + errorpb.Error region_error = 1; + KeyError error = 2; +} + +message GCRequest { + Context context = 1; + uint64 safe_point = 2; +} + +message GCResponse { + errorpb.Error region_error = 1; + KeyError error = 2; +} + +message RawGetRequest { + Context context = 1; + bytes key = 2; + string cf = 3; +} + +message RawGetResponse { + errorpb.Error region_error = 1; + string error = 2; + bytes value = 3; +} + +message RawPutRequest { + Context context = 1; + bytes key = 2; + bytes value = 3; + string cf = 4; +} + +message RawPutResponse { + errorpb.Error region_error = 1; + string error = 2; +} + +message RawBatchPutRequest { + Context context = 1; + repeated KvPair pairs = 2; + string cf = 3; +} + +message RawBatchPutResponse { + errorpb.Error region_error = 1; + string error = 2; +} + +message RawBatchGetRequest { + Context context = 1; + repeated bytes keys = 2; + string cf = 3; +} + +message RawBatchGetResponse { + errorpb.Error region_error = 1; + repeated KvPair pairs = 2; +} + +message RawDeleteRequest { + Context context = 1; + bytes key = 2; + string cf = 3; +} + +message RawDeleteResponse { + errorpb.Error region_error = 1; + string error = 2; +} + +message RawBatchDeleteRequest { + Context context = 1; + repeated bytes keys = 2; + string cf = 3; +} + +message RawBatchDeleteResponse { + errorpb.Error region_error = 1; + string error = 2; +} + +message DeleteRangeRequest { + Context context = 1; + bytes start_key = 2; + bytes end_key = 3; +} + +message DeleteRangeResponse { + errorpb.Error region_error = 1; + string error = 2; +} + +message RawDeleteRangeRequest { + Context context = 1; + bytes start_key = 2; + bytes end_key = 3; + string cf = 4; +} + +message RawDeleteRangeResponse { + errorpb.Error region_error = 1; + string error = 2; +} + +message RawScanRequest { + Context context = 1; + bytes start_key = 2; + uint32 limit = 3; + bool key_only = 4; + string cf = 5; + bool reverse = 6; + // For compatibility, when scanning forward, the range to scan is [start_key, end_key), where start_key < end_key; + // and when scanning backward, it scans [end_key, start_key) in descending order, where end_key < start_key. 
+ bytes end_key = 7; +} + +message RawScanResponse { + errorpb.Error region_error = 1; + repeated KvPair kvs = 2; +} + +message KeyRange { + bytes start_key = 1; + bytes end_key = 2; +} + +message RawBatchScanRequest { + Context context = 1; + repeated KeyRange ranges = 2; // scanning range + uint32 each_limit = 3; // max number of returning kv pairs for each scanning range + bool key_only = 4; + string cf = 5; + bool reverse = 6; +} + +message RawBatchScanResponse { + errorpb.Error region_error = 1; + repeated KvPair kvs = 2; +} + +message MvccWrite { + Op type = 1; + uint64 start_ts = 2; + uint64 commit_ts = 3; + bytes short_value = 4; +} + +message MvccValue { + uint64 start_ts = 1; + bytes value = 2; +} + +message MvccLock { + Op type = 1; + uint64 start_ts = 2; + bytes primary = 3; + bytes short_value = 4; +} + +message MvccInfo { + MvccLock lock = 1; + repeated MvccWrite writes = 2; + repeated MvccValue values = 3; +} + +message MvccGetByKeyRequest { + Context context = 1; + bytes key = 2; +} + +message MvccGetByKeyResponse { + errorpb.Error region_error = 1; + string error = 2; + MvccInfo info = 3; +} + +message MvccGetByStartTsRequest { + Context context = 1; + uint64 start_ts = 2; +} + +message MvccGetByStartTsResponse { + errorpb.Error region_error = 1; + string error = 2; + bytes key = 3; + MvccInfo info = 4; +} + +message SplitRegionRequest { + Context context = 1; + bytes split_key = 2; +} + +message SplitRegionResponse { + errorpb.Error region_error = 1; + metapb.Region left = 2; + metapb.Region right = 3; +} + +message UnsafeDestroyRangeRequest { + Context context = 1; + bytes start_key = 2; + bytes end_key = 3; +} + +message UnsafeDestroyRangeResponse { + errorpb.Error region_error = 1; + string error = 2; +} diff --git a/src/main/proto/metapb.proto b/src/main/proto/metapb.proto new file mode 100644 index 00000000000..7d56ebfd1f8 --- /dev/null +++ b/src/main/proto/metapb.proto @@ -0,0 +1,61 @@ +syntax = "proto3"; +package metapb; + +import "gogoproto/gogo.proto"; + +option (gogoproto.marshaler_all) = true; +option (gogoproto.sizer_all) = true; +option (gogoproto.unmarshaler_all) = true; + +option java_package = "org.tikv.kvproto"; + +message Cluster { + uint64 id = 1; + // max peer count for a region. + // pd will do the auto-balance if region peer count mismatches. + uint32 max_peer_count = 2; + // more attributes...... +} + +enum StoreState { + Up = 0; + Offline = 1; + Tombstone = 2; +} + +// Case insensitive key/value for replica constraints. +message StoreLabel { + string key = 1; + string value = 2; +} + +message Store { + uint64 id = 1; + string address = 2; + StoreState state = 3; + repeated StoreLabel labels = 4; + string version = 5; + // more attributes...... +} + +message RegionEpoch { + // Conf change version, auto increment when add or remove peer + uint64 conf_ver = 1; + // Region version, auto increment when split or merge + uint64 version = 2; +} + +message Region { + uint64 id = 1; + // Region key range [start_key, end_key). 
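+ // By convention an empty start_key or end_key is unbounded on that side, so a region with + // both empty covers the whole key space.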
+ bytes start_key = 2; + bytes end_key = 3; + RegionEpoch region_epoch = 4; + repeated Peer peers = 5; +} + +message Peer { + uint64 id = 1; + uint64 store_id = 2; + bool is_learner = 3; +} diff --git a/src/main/proto/pdpb.proto b/src/main/proto/pdpb.proto new file mode 100644 index 00000000000..2d839d84a88 --- /dev/null +++ b/src/main/proto/pdpb.proto @@ -0,0 +1,480 @@ +syntax = "proto3"; +package pdpb; + +import "metapb.proto"; +import "eraftpb.proto"; + +import "gogoproto/gogo.proto"; + +option (gogoproto.sizer_all) = true; +option (gogoproto.marshaler_all) = true; +option (gogoproto.unmarshaler_all) = true; + +option java_package = "org.tikv.kvproto"; + +service PD { + // GetMembers gets the member list of this cluster. It does not require + // the cluster_id in the request to match the id of this cluster. + rpc GetMembers(GetMembersRequest) returns (GetMembersResponse) {} + + rpc Tso(stream TsoRequest) returns (stream TsoResponse) {} + + rpc Bootstrap(BootstrapRequest) returns (BootstrapResponse) {} + + rpc IsBootstrapped(IsBootstrappedRequest) returns (IsBootstrappedResponse) {} + + rpc AllocID(AllocIDRequest) returns (AllocIDResponse) {} + + rpc GetStore(GetStoreRequest) returns (GetStoreResponse) {} + + rpc PutStore(PutStoreRequest) returns (PutStoreResponse) {} + + rpc GetAllStores(GetAllStoresRequest) returns (GetAllStoresResponse) {} + + rpc StoreHeartbeat(StoreHeartbeatRequest) returns (StoreHeartbeatResponse) {} + + rpc RegionHeartbeat(stream RegionHeartbeatRequest) returns (stream RegionHeartbeatResponse) {} + + rpc GetRegion(GetRegionRequest) returns (GetRegionResponse) {} + + rpc GetPrevRegion(GetRegionRequest) returns (GetRegionResponse) {} + + rpc GetRegionByID(GetRegionByIDRequest) returns (GetRegionResponse) {} + + rpc AskSplit(AskSplitRequest) returns (AskSplitResponse) { + // Use AskBatchSplit instead. + option deprecated = true; + } + + rpc ReportSplit(ReportSplitRequest) returns (ReportSplitResponse) { + // Use ReportBatchSplit instead. + option deprecated = true; + } + + rpc AskBatchSplit(AskBatchSplitRequest) returns (AskBatchSplitResponse) {} + + rpc ReportBatchSplit(ReportBatchSplitRequest) returns (ReportBatchSplitResponse) {} + + rpc GetClusterConfig(GetClusterConfigRequest) returns (GetClusterConfigResponse) {} + + rpc PutClusterConfig(PutClusterConfigRequest) returns (PutClusterConfigResponse) {} + + rpc ScatterRegion(ScatterRegionRequest) returns (ScatterRegionResponse) {} + + rpc GetGCSafePoint(GetGCSafePointRequest) returns (GetGCSafePointResponse) {} + + rpc UpdateGCSafePoint(UpdateGCSafePointRequest) returns (UpdateGCSafePointResponse) {} + + rpc SyncRegions(stream SyncRegionRequest) returns (stream SyncRegionResponse) {} +} + +message RequestHeader { + // cluster_id is the ID of the cluster the request is sent to. + uint64 cluster_id = 1; +} + +message ResponseHeader { + // cluster_id is the ID of the cluster which sent the response.
+ uint64 cluster_id = 1; + Error error = 2; +} + +enum ErrorType { + OK = 0; + UNKNOWN = 1; + NOT_BOOTSTRAPPED = 2; + STORE_TOMBSTONE = 3; + ALREADY_BOOTSTRAPPED = 4; + INCOMPATIBLE_VERSION = 5; +} + +message Error { + ErrorType type = 1; + string message = 2; +} + +message TsoRequest { + RequestHeader header = 1; + + uint32 count = 2; +} + +message Timestamp { + int64 physical = 1; + int64 logical = 2; +} + +message TsoResponse { + ResponseHeader header = 1; + + uint32 count = 2; + Timestamp timestamp = 3; +} + +message BootstrapRequest { + RequestHeader header = 1; + + metapb.Store store = 2; + metapb.Region region = 3; +} + +message BootstrapResponse { + ResponseHeader header = 1; +} + +message IsBootstrappedRequest { + RequestHeader header = 1; +} + +message IsBootstrappedResponse { + ResponseHeader header = 1; + + bool bootstrapped = 2; +} + +message AllocIDRequest { + RequestHeader header = 1; +} + +message AllocIDResponse { + ResponseHeader header = 1; + + uint64 id = 2; +} + +message GetStoreRequest { + RequestHeader header = 1; + + uint64 store_id = 2; +} + +message GetStoreResponse { + ResponseHeader header = 1; + + metapb.Store store = 2; +} + +message PutStoreRequest { + RequestHeader header = 1; + + metapb.Store store = 2; +} + +message PutStoreResponse { + ResponseHeader header = 1; +} + +message GetAllStoresRequest { + RequestHeader header = 1; +} + +message GetAllStoresResponse { + ResponseHeader header = 1; + + repeated metapb.Store stores = 2; +} + +message GetRegionRequest { + RequestHeader header = 1; + + bytes region_key = 2; +} + +message GetRegionResponse { + ResponseHeader header = 1; + + metapb.Region region = 2; + metapb.Peer leader = 3; +} + +message GetRegionByIDRequest { + RequestHeader header = 1; + + uint64 region_id = 2; +} + +// Use GetRegionResponse as the response of GetRegionByIDRequest. + +message GetClusterConfigRequest { + RequestHeader header = 1; +} + +message GetClusterConfigResponse { + ResponseHeader header = 1; + + metapb.Cluster cluster = 2; +} + +message PutClusterConfigRequest { + RequestHeader header = 1; + + metapb.Cluster cluster = 2; +} + +message PutClusterConfigResponse { + ResponseHeader header = 1; +} + +message Member { + // name is the name of the PD member. + string name = 1; + // member_id is the unique id of the PD member. + uint64 member_id = 2; + repeated string peer_urls = 3; + repeated string client_urls = 4; + int32 leader_priority = 5; +} + +message GetMembersRequest { + RequestHeader header = 1; +} + +message GetMembersResponse { + ResponseHeader header = 1; + + repeated Member members = 2; + Member leader = 3; + Member etcd_leader = 4; +} + +message PeerStats { + metapb.Peer peer = 1; + uint64 down_seconds = 2; +} + +message RegionHeartbeatRequest { + RequestHeader header = 1; + + metapb.Region region = 2; + // Leader Peer sending the heartbeat. + metapb.Peer leader = 3; + // Leader considers that these peers are down. + repeated PeerStats down_peers = 4; + // Pending peers are the peers that the leader can't consider as + // working followers. + repeated metapb.Peer pending_peers = 5; + // Bytes read/written during this period. + uint64 bytes_written = 6; + uint64 bytes_read = 7; + // Keys read/written during this period. + uint64 keys_written = 8; + uint64 keys_read = 9; + // Approximate region size. + uint64 approximate_size = 10; + reserved 11; + // Actually reported time interval + TimeInterval interval = 12; + // Approximate number of keys. 
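+ // Like approximate_size, this is an estimate rather than an exact count.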
+ uint64 approximate_keys = 13; +} + +message ChangePeer { + metapb.Peer peer = 1; + eraftpb.ConfChangeType change_type = 2; +} + +message TransferLeader { + metapb.Peer peer = 1; +} + +message Merge { + metapb.Region target = 1; +} + +message SplitRegion { + CheckPolicy policy = 1; +} + +enum CheckPolicy { + SCAN = 0; + APPROXIMATE = 1; +} + +message RegionHeartbeatResponse { + ResponseHeader header = 1; + + // Notice: Pd only allows handling a reported epoch >= current pd's. + // The leader peer reports region status with RegionHeartbeatRequest + // to pd regularly; pd will determine whether this region + // should do ChangePeer or not. + // E.g., max peer number is 3, region A, at first only peer 1 in A. + // 1. Pd region state -> Peers (1), ConfVer (1). + // 2. Leader peer 1 reports region state to pd, pd finds the + // peer number is < 3, so first changes its current region + // state -> Peers (1, 2), ConfVer (1), and returns ChangePeer Adding 2. + // 3. Leader does ChangePeer, then reports Peers (1, 2), ConfVer (2), + // pd updates its state -> Peers (1, 2), ConfVer (2). + // 4. The leader may report old Peers (1), ConfVer (1) to pd before the ConfChange + // finishes; pd still responds ChangePeer Adding 2. Of course, we must + // guarantee the second ChangePeer can't be applied in TiKV. + ChangePeer change_peer = 2; + // Pd can return transfer_leader to let TiKV do the leader transfer itself. + TransferLeader transfer_leader = 3; + // ID of the region + uint64 region_id = 4; + metapb.RegionEpoch region_epoch = 5; + // Leader of the region at the moment the corresponding request was made. + metapb.Peer target_peer = 6; + Merge merge = 7; + // PD sends split_region to let TiKV split a region into two regions. + SplitRegion split_region = 8; +} + +message AskSplitRequest { + RequestHeader header = 1; + + metapb.Region region = 2; +} + +message AskSplitResponse { + ResponseHeader header = 1; + + // We split the region into two; the first uses the origin + // parent region id, and the second uses the new_region_id. + // We must guarantee that the new_region_id is globally unique. + uint64 new_region_id = 2; + // The peer ids for the new split region. + repeated uint64 new_peer_ids = 3; +} + +message ReportSplitRequest { + RequestHeader header = 1; + + metapb.Region left = 2; + metapb.Region right = 3; +} + +message ReportSplitResponse { + ResponseHeader header = 1; +} + +message AskBatchSplitRequest { + RequestHeader header = 1; + + metapb.Region region = 2; + uint32 split_count = 3; +} + +message SplitID { + uint64 new_region_id = 1; + repeated uint64 new_peer_ids = 2; +} + +message AskBatchSplitResponse { + ResponseHeader header = 1; + + repeated SplitID ids = 2; +} + +message ReportBatchSplitRequest { + RequestHeader header = 1; + + repeated metapb.Region regions = 2; +} + +message ReportBatchSplitResponse { + ResponseHeader header = 1; +} + +message TimeInterval { + // The unix timestamp in seconds of the start of this period. + uint64 start_timestamp = 1; + // The unix timestamp in seconds of the end of this period. + uint64 end_timestamp = 2; +} + +message StoreStats { + uint64 store_id = 1; + // Capacity for the store. + uint64 capacity = 2; + // Available size for the store. + uint64 available = 3; + // Total region count in this store. + uint32 region_count = 4; + // Current sending snapshot count. + uint32 sending_snap_count = 5; + // Current receiving snapshot count. + uint32 receiving_snap_count = 6; + // When the store was started (unix timestamp in seconds).
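+ // e.g. 1550000000 for a store started in February 2019 (illustrative value).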
+ uint32 start_time = 7; + // How many regions are applying snapshots. + uint32 applying_snap_count = 8; + // If the store is busy. + bool is_busy = 9; + // Space actually used by the db. + uint64 used_size = 10; + // Bytes written for the store during this period. + uint64 bytes_written = 11; + // Keys written for the store during this period. + uint64 keys_written = 12; + // Bytes read for the store during this period. + uint64 bytes_read = 13; + // Keys read for the store during this period. + uint64 keys_read = 14; + // Actually reported time interval + TimeInterval interval = 15; +} + +message StoreHeartbeatRequest { + RequestHeader header = 1; + + StoreStats stats = 2; +} + +message StoreHeartbeatResponse { + ResponseHeader header = 1; +} + +message ScatterRegionRequest { + RequestHeader header = 1; + + uint64 region_id = 2; + + // PD will use this region information if it can't find the region. + // For example, the region has just split and hasn't reported to PD yet. + metapb.Region region = 3; + metapb.Peer leader = 4; +} + +message ScatterRegionResponse { + ResponseHeader header = 1; +} + +message GetGCSafePointRequest { + RequestHeader header = 1; +} + +message GetGCSafePointResponse { + ResponseHeader header = 1; + + uint64 safe_point = 2; +} + +message UpdateGCSafePointRequest { + RequestHeader header = 1; + + uint64 safe_point = 2; +} + +message UpdateGCSafePointResponse { + ResponseHeader header = 1; + + uint64 new_safe_point = 2; +} + +message SyncRegionRequest { + RequestHeader header = 1; + Member member = 2; + // The follower PD will use the start index to locate historical changes + // that require synchronization. + uint64 start_index = 3; +} + +message SyncRegionResponse { + ResponseHeader header = 1; + // The leader PD will send a response including + // the changed region records and the index of the first record.
+ repeated metapb.Region regions = 2; + uint64 start_index = 3; +} + diff --git a/src/main/proto/raft_cmdpb.proto b/src/main/proto/raft_cmdpb.proto new file mode 100644 index 00000000000..215b0e90ecf --- /dev/null +++ b/src/main/proto/raft_cmdpb.proto @@ -0,0 +1,296 @@ +syntax = "proto3"; +package raft_cmdpb; + +import "metapb.proto"; +import "errorpb.proto"; +import "eraftpb.proto"; +import "import_sstpb.proto"; + +option java_package = "org.tikv.kvproto"; + +message GetRequest { + string cf = 1; + bytes key = 2; +} + +message GetResponse { + bytes value = 1; +} + +message PutRequest { + string cf = 1; + bytes key = 2; + bytes value = 3; +} + +message PutResponse {} + +message DeleteRequest { + string cf = 1; + bytes key = 2; +} + +message DeleteResponse {} + +message DeleteRangeRequest { + string cf = 1; + bytes start_key = 2; + bytes end_key = 3; +} + +message DeleteRangeResponse {} + +message SnapRequest {} + +message SnapResponse { + metapb.Region region = 1; +} + +message PrewriteRequest { + bytes key = 1; + bytes value = 2; + bytes lock = 3; +} + +message PrewriteResponse {} + +message IngestSSTRequest { + import_sstpb.SSTMeta sst = 1; +} + +message IngestSSTResponse {} + +enum CmdType { + Invalid = 0; + Get = 1; + Put = 3; + Delete = 4; + Snap = 5; + Prewrite = 6; + DeleteRange = 7; + IngestSST = 8; +} + +message Request { + CmdType cmd_type = 1; + GetRequest get = 2; + PutRequest put = 4; + DeleteRequest delete = 5; + SnapRequest snap = 6; + PrewriteRequest prewrite = 7; + DeleteRangeRequest delete_range = 8; + IngestSSTRequest ingest_sst = 9; +} + +message Response { + CmdType cmd_type = 1; + GetResponse get = 2; + PutResponse put = 4; + DeleteResponse delete = 5; + SnapResponse snap = 6; + PrewriteResponse prewrite = 7; + DeleteRangeResponse delte_range = 8; + IngestSSTResponse ingest_sst = 9; +} + +message ChangePeerRequest { + // This can only be called in the internal RaftStore now. + eraftpb.ConfChangeType change_type = 1; + metapb.Peer peer = 2; +} + +message ChangePeerResponse { + metapb.Region region = 1; +} + +message SplitRequest { + // This can only be called in the internal RaftStore now. + // The split_key must be inside the region being split. + bytes split_key = 1; + // We split the region into two, first uses the origin + // parent region id, and the second uses the new_region_id. + // We must guarantee that the new_region_id is globally unique. + uint64 new_region_id = 2; + // The peer ids for the new split region. + repeated uint64 new_peer_ids = 3; + // If true, the right region derives the origin region_id, + // and the left region uses new_region_id. + // Will be ignored in batch split, use `BatchSplitRequest::right_derive` instead. + bool right_derive = 4 [deprecated=true]; +} + +message SplitResponse { + metapb.Region left = 1; + metapb.Region right = 2; +} + +message BatchSplitRequest { + repeated SplitRequest requests = 1; + // If true, the last region derives the origin region_id, + // and the other regions use new ids.
+ bool right_derive = 2; +} + +message BatchSplitResponse { + repeated metapb.Region regions = 1; +} + +message CompactLogRequest { + uint64 compact_index = 1; + uint64 compact_term = 2; +} + +message CompactLogResponse {} + +message TransferLeaderRequest { + metapb.Peer peer = 1; +} + +message TransferLeaderResponse {} + +message VerifyHashRequest { + uint64 index = 1; + bytes hash = 2; +} + +message VerifyHashResponse {} + +message PrepareMergeRequest { + uint64 min_index = 1; + metapb.Region target = 2; +} + +message PrepareMergeResponse {} + +message CommitMergeRequest { + metapb.Region source = 1; + uint64 commit = 2; + repeated eraftpb.Entry entries = 3; +} + +message CommitMergeResponse {} + +message RollbackMergeRequest { + uint64 commit = 1; +} + +message RollbackMergeResponse {} + +enum AdminCmdType { + InvalidAdmin = 0; + ChangePeer = 1; + // Use `BatchSplit` instead. + Split = 2 [deprecated=true]; + CompactLog = 3; + TransferLeader = 4; + ComputeHash = 5; + VerifyHash = 6; + PrepareMerge = 7; + CommitMerge = 8; + RollbackMerge = 9; + BatchSplit = 10; +} + +message AdminRequest { + AdminCmdType cmd_type = 1; + ChangePeerRequest change_peer = 2; + SplitRequest split = 3 [deprecated=true]; + CompactLogRequest compact_log = 4; + TransferLeaderRequest transfer_leader = 5; + VerifyHashRequest verify_hash = 6; + PrepareMergeRequest prepare_merge = 7; + CommitMergeRequest commit_merge = 8; + RollbackMergeRequest rollback_merge = 9; + BatchSplitRequest splits = 10; +} + +message AdminResponse { + AdminCmdType cmd_type = 1; + ChangePeerResponse change_peer = 2; + SplitResponse split = 3 [deprecated=true]; + CompactLogResponse compact_log = 4; + TransferLeaderResponse transfer_leader = 5; + VerifyHashResponse verify_hash = 6; + PrepareMergeResponse prepare_merge = 7; + CommitMergeResponse commit_merge = 8; + RollbackMergeResponse rollback_merge = 9; + BatchSplitResponse splits = 10; +} + +// For getting the leader of the region. +message RegionLeaderRequest {} + +message RegionLeaderResponse { + metapb.Peer leader = 1; +} + +// For getting more information about the region. +// We add some admin operations (ChangePeer, Split...) into the pb job list, +// then the pd server will peek the first one, handle it and then pop it from the job list. +// But sometimes, the pd server may crash before popping. When another pd server +// starts and finds the job is running but not finished, it will first check whether +// the raft server has already handled this job. +// E.g., for ChangePeer, if we add Peer10 into region1 and find region1 already has +// Peer10, we can consider this ChangePeer finished, and can pop this job from the job list +// directly. +message RegionDetailRequest {} + +message RegionDetailResponse { + metapb.Region region = 1; + metapb.Peer leader = 2; +} + + +enum StatusCmdType { + InvalidStatus = 0; + RegionLeader = 1; + RegionDetail = 2; +} + +message StatusRequest { + StatusCmdType cmd_type = 1; + RegionLeaderRequest region_leader = 2; + RegionDetailRequest region_detail = 3; +} + +message StatusResponse { + StatusCmdType cmd_type = 1; + RegionLeaderResponse region_leader = 2; + RegionDetailResponse region_detail = 3; +} + +message RaftRequestHeader { + uint64 region_id = 1; + metapb.Peer peer = 2; + // true for read linearization + bool read_quorum = 3; + // 16 bytes, used to distinguish requests.
+ bytes uuid = 4; + + metapb.RegionEpoch region_epoch = 5; + uint64 term = 6; + + bool sync_log = 7; +} + +message RaftResponseHeader { + errorpb.Error error = 1; + bytes uuid = 2; + uint64 current_term = 3; +} + +message RaftCmdRequest { + RaftRequestHeader header = 1; + // We can't enclose normal requests and an administrator request + // at the same time. + repeated Request requests = 2; + AdminRequest admin_request = 3; + StatusRequest status_request = 4; +} + +message RaftCmdResponse { + RaftResponseHeader header = 1; + repeated Response responses = 2; + AdminResponse admin_response = 3; + StatusResponse status_response = 4; +} diff --git a/src/main/proto/raft_serverpb.proto b/src/main/proto/raft_serverpb.proto new file mode 100644 index 00000000000..104d205a440 --- /dev/null +++ b/src/main/proto/raft_serverpb.proto @@ -0,0 +1,91 @@ +syntax = "proto3"; +package raft_serverpb; + +import "eraftpb.proto"; +import "metapb.proto"; + +option java_package = "org.tikv.kvproto"; + +message RaftMessage { + uint64 region_id = 1; + metapb.Peer from_peer = 2; + metapb.Peer to_peer = 3; + eraftpb.Message message = 4; + metapb.RegionEpoch region_epoch = 5; + // true means to_peer is a tombstone peer and it should remove itself. + bool is_tombstone = 6; + // Region key range [start_key, end_key). + bytes start_key = 7; + bytes end_key = 8; + // If it has a value, to_peer should be removed if merge is never going to complete. + metapb.Region merge_target = 9; +} + +message RaftTruncatedState { + uint64 index = 1; + uint64 term = 2; +} + +message SnapshotCFFile { + string cf = 1; + uint64 size = 2; + uint32 checksum = 3; +} + +message SnapshotMeta { + repeated SnapshotCFFile cf_files = 1; +} + +message SnapshotChunk { + RaftMessage message = 1; + bytes data = 2; +} + +message Done {} + +message KeyValue { + bytes key = 1; + bytes value = 2; +} + +message RaftSnapshotData { + metapb.Region region = 1; + uint64 file_size = 2; + repeated KeyValue data = 3; + uint64 version = 4; + SnapshotMeta meta = 5; +} + +message StoreIdent { + uint64 cluster_id = 1; + uint64 store_id = 2; +} + +message RaftLocalState { + eraftpb.HardState hard_state = 1; + uint64 last_index = 2; +} + +message RaftApplyState { + uint64 applied_index = 1; + RaftTruncatedState truncated_state = 2; +} + +enum PeerState { + Normal = 0; + Applying = 1; + Tombstone = 2; + Merging = 3; +} + +message MergeState { + uint64 min_index = 1; + metapb.Region target = 2; + uint64 commit = 3; +} + +message RegionLocalState { + PeerState state = 1; + metapb.Region region = 2; + MergeState merge_state = 3; +} diff --git a/src/main/proto/schema.proto b/src/main/proto/schema.proto new file mode 100644 index 00000000000..d7468417f15 --- /dev/null +++ b/src/main/proto/schema.proto @@ -0,0 +1,42 @@ +syntax = "proto2"; + +package tipb; + +option java_multiple_files = true; +option java_package = "com.pingcap.tidb.tipb"; + +import "gogoproto/gogo.proto"; + +option (gogoproto.marshaler_all) = true; +option (gogoproto.sizer_all) = true; +option (gogoproto.unmarshaler_all) = true; + +message TableInfo { + optional int64 table_id = 1 [(gogoproto.nullable) = false]; + repeated ColumnInfo columns = 2; +} + +message ColumnInfo { + optional int64 column_id = 1 [(gogoproto.nullable) = false]; + optional int32 tp = 2 [(gogoproto.nullable) = false]; // MySQL type.
+ optional int32 collation = 3 [(gogoproto.nullable) = false]; + optional int32 columnLen = 4 [(gogoproto.nullable) = false]; + optional int32 decimal = 5 [(gogoproto.nullable) = false]; + optional int32 flag = 6 [(gogoproto.nullable) = false]; + repeated string elems = 7; + optional bytes default_val = 8; // Encoded datum. + optional bool pk_handle = 21 [(gogoproto.nullable) = false]; // PK handle column value is the row handle. +} + +message IndexInfo { + optional int64 table_id = 1 [(gogoproto.nullable) = false]; + optional int64 index_id = 2 [(gogoproto.nullable) = false]; + repeated ColumnInfo columns = 3; + optional bool unique = 4 [(gogoproto.nullable) = false]; +} + +// KeyRange is the encoded index key range, low is closed, high is open. (low <= x < high) +message KeyRange { + optional bytes low = 1; + optional bytes high = 2; +} diff --git a/src/main/proto/select.proto b/src/main/proto/select.proto new file mode 100644 index 00000000000..435040a6efa --- /dev/null +++ b/src/main/proto/select.proto @@ -0,0 +1,113 @@ +syntax = "proto2"; + +package tipb; + +option java_multiple_files = true; +option java_package = "com.pingcap.tidb.tipb"; + +import "executor.proto"; +import "gogoproto/gogo.proto"; + +option (gogoproto.marshaler_all) = true; +option (gogoproto.sizer_all) = true; +option (gogoproto.unmarshaler_all) = true; + +// Values are all in text format. +message Row { + optional bytes handle = 1; + optional bytes data = 2; +} + +message Error { + optional int32 code = 1 [(gogoproto.nullable) = false]; + optional string msg = 2 [(gogoproto.nullable) = false]; +} + +// Response for SelectRequest. +message SelectResponse { + optional Error error = 1; + + // Result rows. + repeated Row rows = 2; + + // Use multiple chunks to reduce memory allocation and + // avoid allocating large contiguous memory. + repeated Chunk chunks = 3 [(gogoproto.nullable) = false]; + repeated Error warnings = 4; + repeated int64 output_counts = 5; + optional int64 warning_count = 6; + optional bytes row_batch_data = 7 [(gogoproto.customtype) = "github.com/pingcap/tipb/sharedbytes.SharedBytes", (gogoproto.nullable) = false]; +} + +// Chunk contains the data and metadata of multiple rows. +message Chunk { + // Data for all rows in the chunk. + optional bytes rows_data = 3 [(gogoproto.customtype) = "github.com/pingcap/tipb/sharedbytes.SharedBytes", (gogoproto.nullable) = false]; + + // Metadata for every row. + repeated RowMeta rows_meta = 4 [(gogoproto.nullable) = false]; +} + +// RowMeta contains the handle and length of a row. +message RowMeta { + optional int64 handle = 1 [(gogoproto.nullable) = false]; + optional int64 length = 2 [(gogoproto.nullable) = false]; +} + +// DAGRequest represents the request that will be handled in DAG mode. +message DAGRequest { + // Transaction start timestamp. + optional uint64 start_ts = 1 [(gogoproto.nullable) = false]; + + // It represents the pushed-down Executors. + repeated Executor executors = 2; + + // time zone offset in seconds + optional int64 time_zone_offset = 3 [(gogoproto.nullable) = false]; + + // flags are used to store flags that change the execution mode; it contains: + // ignore_truncate = 1 + // truncate error should be ignored if set. + // truncate_as_warning = 1 << 1 + // when ignore_truncate is not set, return a warning instead of an error if this flag is set. + // ... + // add more when needed. + optional uint64 flags = 4 [(gogoproto.nullable) = false]; + + // It represents which columns we should output.
+ repeated uint32 output_offsets = 5; + + // It represents whether we collect the detailed scan counts in each range. + optional bool collect_range_counts = 6; + + // It indicates the maximum number of warnings, + // which is the number of messages that SHOW WARNINGS displays. + optional uint64 max_warning_count = 7; + + // It indicates the encoding type of the response. + optional EncodeType encode_type = 8 [(gogoproto.nullable) = false]; + + // It indicates the sql_mode. + optional uint64 sql_mode = 9; + + // It indicates whether the sql mode is strict. + optional bool is_strict_sql_mode = 10; + + // An offset alone is not enough since daylight saving time is observed in some regions. + optional string time_zone_name = 11 [(gogoproto.nullable) = false]; +} + +enum EncodeType { + TypeDefault = 0; + TypeArrow = 1; +} + +message StreamResponse { + optional Error error = 1; + // Data for all rows + optional bytes data = 3 [(gogoproto.customtype) = "github.com/pingcap/tipb/sharedbytes.SharedBytes", (gogoproto.nullable) = false]; + repeated Error warnings = 4; + // output row count for each executor + repeated int64 output_counts = 5; + optional int64 warning_count = 6; +} diff --git a/src/main/proto/tikvpb.proto b/src/main/proto/tikvpb.proto new file mode 100644 index 00000000000..0e6cd7ec140 --- /dev/null +++ b/src/main/proto/tikvpb.proto @@ -0,0 +1,60 @@ +syntax = "proto3"; +package tikvpb; + +import "coprocessor.proto"; +import "kvrpcpb.proto"; +import "raft_serverpb.proto"; + +import "gogoproto/gogo.proto"; + +option (gogoproto.sizer_all) = true; +option (gogoproto.marshaler_all) = true; +option (gogoproto.unmarshaler_all) = true; + +option java_package = "org.tikv.kvproto"; + +// Serves as a distributed KV database. +service Tikv { + // KV commands with mvcc/txn supported. + rpc KvGet(kvrpcpb.GetRequest) returns (kvrpcpb.GetResponse) {} + rpc KvScan(kvrpcpb.ScanRequest) returns (kvrpcpb.ScanResponse) {} + rpc KvPrewrite(kvrpcpb.PrewriteRequest) returns (kvrpcpb.PrewriteResponse) {} + rpc KvCommit(kvrpcpb.CommitRequest) returns (kvrpcpb.CommitResponse) {} + rpc KvImport(kvrpcpb.ImportRequest) returns (kvrpcpb.ImportResponse) {} + rpc KvCleanup(kvrpcpb.CleanupRequest) returns (kvrpcpb.CleanupResponse) {} + rpc KvBatchGet(kvrpcpb.BatchGetRequest) returns (kvrpcpb.BatchGetResponse) {} + rpc KvBatchRollback(kvrpcpb.BatchRollbackRequest) returns (kvrpcpb.BatchRollbackResponse) {} + rpc KvScanLock(kvrpcpb.ScanLockRequest) returns (kvrpcpb.ScanLockResponse) {} + rpc KvResolveLock(kvrpcpb.ResolveLockRequest) returns (kvrpcpb.ResolveLockResponse) {} + rpc KvGC(kvrpcpb.GCRequest) returns (kvrpcpb.GCResponse) {} + rpc KvDeleteRange(kvrpcpb.DeleteRangeRequest) returns (kvrpcpb.DeleteRangeResponse) {} + + // RawKV commands.
+ rpc RawGet(kvrpcpb.RawGetRequest) returns (kvrpcpb.RawGetResponse) {} + rpc RawBatchGet(kvrpcpb.RawBatchGetRequest) returns (kvrpcpb.RawBatchGetResponse) {} + rpc RawPut(kvrpcpb.RawPutRequest) returns (kvrpcpb.RawPutResponse) {} + rpc RawBatchPut(kvrpcpb.RawBatchPutRequest) returns (kvrpcpb.RawBatchPutResponse) {} + rpc RawDelete(kvrpcpb.RawDeleteRequest) returns (kvrpcpb.RawDeleteResponse) {} + rpc RawBatchDelete(kvrpcpb.RawBatchDeleteRequest) returns (kvrpcpb.RawBatchDeleteResponse) {} + rpc RawScan(kvrpcpb.RawScanRequest) returns (kvrpcpb.RawScanResponse) {} + rpc RawDeleteRange(kvrpcpb.RawDeleteRangeRequest) returns (kvrpcpb.RawDeleteRangeResponse) {} + rpc RawBatchScan(kvrpcpb.RawBatchScanRequest) returns (kvrpcpb.RawBatchScanResponse) {} + + // Store commands (addressed to the whole TiKV store rather than a certain region). + rpc UnsafeDestroyRange(kvrpcpb.UnsafeDestroyRangeRequest) returns (kvrpcpb.UnsafeDestroyRangeResponse) {} + + // SQL push down commands. + rpc Coprocessor(coprocessor.Request) returns (coprocessor.Response) {} + rpc CoprocessorStream(coprocessor.Request) returns (stream coprocessor.Response) {} + + // Raft commands (tikv <-> tikv). + rpc Raft(stream raft_serverpb.RaftMessage) returns (raft_serverpb.Done) {} + rpc Snapshot(stream raft_serverpb.SnapshotChunk) returns (raft_serverpb.Done) {} + + // Region commands. + rpc SplitRegion (kvrpcpb.SplitRegionRequest) returns (kvrpcpb.SplitRegionResponse) {} + + // Transaction debugger commands. + rpc MvccGetByKey(kvrpcpb.MvccGetByKeyRequest) returns (kvrpcpb.MvccGetByKeyResponse) {} + rpc MvccGetByStartTs(kvrpcpb.MvccGetByStartTsRequest) returns (kvrpcpb.MvccGetByStartTsResponse) {} +} diff --git a/src/test/java/org/tikv/common/KVMockServer.java b/src/test/java/org/tikv/common/KVMockServer.java index 7c1ef2fa9ff..82bdf8c1960 100644 --- a/src/test/java/org/tikv/common/KVMockServer.java +++ b/src/test/java/org/tikv/common/KVMockServer.java @@ -101,15 +101,15 @@ private void verifyContext(Context context) throws Exception { @Override public void rawGet( - org.tikv.kvproto.Kvrpcpb.RawGetRequest request, - io.grpc.stub.StreamObserver responseObserver) { + Kvrpcpb.RawGetRequest request, + io.grpc.stub.StreamObserver responseObserver) { try { verifyContext(request.getContext()); ByteString key = request.getKey(); Kvrpcpb.RawGetResponse.Builder builder = Kvrpcpb.RawGetResponse.newBuilder(); Integer errorCode = errorMap.remove(key); - Errorpb.Error.Builder errBuilder = Errorpb.Error.newBuilder(); + Error.Builder errBuilder = Error.newBuilder(); if (errorCode != null) { setErrorInfo(errorCode, errBuilder); builder.setRegionError(errBuilder.build()); @@ -125,15 +125,15 @@ public void rawGet( /** */ public void rawPut( - org.tikv.kvproto.Kvrpcpb.RawPutRequest request, - io.grpc.stub.StreamObserver responseObserver) { + Kvrpcpb.RawPutRequest request, + io.grpc.stub.StreamObserver responseObserver) { try { verifyContext(request.getContext()); ByteString key = request.getKey(); Kvrpcpb.RawPutResponse.Builder builder = Kvrpcpb.RawPutResponse.newBuilder(); Integer errorCode = errorMap.remove(key); - Errorpb.Error.Builder errBuilder = Errorpb.Error.newBuilder(); + Error.Builder errBuilder = Error.newBuilder(); if (errorCode != null) { setErrorInfo(errorCode, errBuilder); builder.setRegionError(errBuilder.build()); @@ -146,19 +146,19 @@ public void rawPut( } } - private void setErrorInfo(int errorCode, Errorpb.Error.Builder errBuilder) { + private void setErrorInfo(int errorCode, Error.Builder errBuilder) { if (errorCode == NOT_LEADER) { - 
errBuilder.setNotLeader(Errorpb.NotLeader.getDefaultInstance()); + errBuilder.setNotLeader(NotLeader.getDefaultInstance()); } else if (errorCode == REGION_NOT_FOUND) { errBuilder.setRegionNotFound(Errorpb.RegionNotFound.getDefaultInstance()); } else if (errorCode == KEY_NOT_IN_REGION) { errBuilder.setKeyNotInRegion(Errorpb.KeyNotInRegion.getDefaultInstance()); } else if (errorCode == STALE_EPOCH) { - errBuilder.setStaleEpoch(Errorpb.StaleEpoch.getDefaultInstance()); + errBuilder.setStaleEpoch(StaleEpoch.getDefaultInstance()); } else if (errorCode == STALE_COMMAND) { errBuilder.setStaleCommand(Errorpb.StaleCommand.getDefaultInstance()); } else if (errorCode == SERVER_IS_BUSY) { - errBuilder.setServerIsBusy(Errorpb.ServerIsBusy.getDefaultInstance()); + errBuilder.setServerIsBusy(ServerIsBusy.getDefaultInstance()); } else if (errorCode == STORE_NOT_MATCH) { errBuilder.setStoreNotMatch(Errorpb.StoreNotMatch.getDefaultInstance()); } else if (errorCode == RAFT_ENTRY_TOO_LARGE) { @@ -168,15 +168,15 @@ private void setErrorInfo(int errorCode, Errorpb.Error.Builder errBuilder) { /** */ public void rawDelete( - org.tikv.kvproto.Kvrpcpb.RawDeleteRequest request, - io.grpc.stub.StreamObserver responseObserver) { + Kvrpcpb.RawDeleteRequest request, + io.grpc.stub.StreamObserver responseObserver) { try { verifyContext(request.getContext()); ByteString key = request.getKey(); Kvrpcpb.RawDeleteResponse.Builder builder = Kvrpcpb.RawDeleteResponse.newBuilder(); Integer errorCode = errorMap.remove(key); - Errorpb.Error.Builder errBuilder = Errorpb.Error.newBuilder(); + Error.Builder errBuilder = Error.newBuilder(); if (errorCode != null) { setErrorInfo(errorCode, errBuilder); builder.setRegionError(errBuilder.build()); @@ -190,8 +190,8 @@ public void rawDelete( @Override public void kvGet( - org.tikv.kvproto.Kvrpcpb.GetRequest request, - io.grpc.stub.StreamObserver responseObserver) { + Kvrpcpb.GetRequest request, + io.grpc.stub.StreamObserver responseObserver) { try { verifyContext(request.getContext()); if (request.getVersion() == 0) { @@ -222,8 +222,8 @@ public void kvGet( @Override public void kvScan( - org.tikv.kvproto.Kvrpcpb.ScanRequest request, - io.grpc.stub.StreamObserver responseObserver) { + Kvrpcpb.ScanRequest request, + io.grpc.stub.StreamObserver responseObserver) { try { verifyContext(request.getContext()); if (request.getVersion() == 0) { @@ -236,7 +236,7 @@ public void kvScan( Integer errorCode = errorMap.remove(key); if (errorCode != null) { if (errorCode == ABORT) { - errBuilder.setServerIsBusy(Errorpb.ServerIsBusy.getDefaultInstance()); + errBuilder.setServerIsBusy(ServerIsBusy.getDefaultInstance()); } builder.setRegionError(errBuilder.build()); } else { @@ -262,8 +262,8 @@ public void kvScan( @Override public void kvBatchGet( - org.tikv.kvproto.Kvrpcpb.BatchGetRequest request, - io.grpc.stub.StreamObserver responseObserver) { + Kvrpcpb.BatchGetRequest request, + io.grpc.stub.StreamObserver responseObserver) { try { verifyContext(request.getContext()); if (request.getVersion() == 0) { @@ -278,7 +278,7 @@ public void kvBatchGet( Integer errorCode = errorMap.remove(key); if (errorCode != null) { if (errorCode == ABORT) { - errBuilder.setServerIsBusy(Errorpb.ServerIsBusy.getDefaultInstance()); + errBuilder.setServerIsBusy(ServerIsBusy.getDefaultInstance()); } builder.setRegionError(errBuilder.build()); break; @@ -297,8 +297,8 @@ public void kvBatchGet( @Override public void coprocessor( - org.tikv.kvproto.Coprocessor.Request requestWrap, - io.grpc.stub.StreamObserver responseObserver) { + 
Coprocessor.Request requestWrap, + io.grpc.stub.StreamObserver responseObserver) { try { verifyContext(requestWrap.getContext()); @@ -311,8 +311,8 @@ public void coprocessor( Coprocessor.Response.Builder builderWrap = Coprocessor.Response.newBuilder(); SelectResponse.Builder builder = SelectResponse.newBuilder(); - org.tikv.kvproto.Errorpb.Error.Builder errBuilder = - org.tikv.kvproto.Errorpb.Error.newBuilder(); + Error.Builder errBuilder = + Error.newBuilder(); for (Coprocessor.KeyRange keyRange : keyRanges) { Integer errorCode = errorMap.remove(keyRange.getStart()); diff --git a/src/test/java/org/tikv/common/MockServerTest.java b/src/test/java/org/tikv/common/MockServerTest.java index d28227a832d..2d46100902e 100644 --- a/src/test/java/org/tikv/common/MockServerTest.java +++ b/src/test/java/org/tikv/common/MockServerTest.java @@ -1,10 +1,11 @@ package org.tikv.common; +import com.google.common.annotations.VisibleForTesting; import com.google.protobuf.ByteString; import org.junit.After; import org.junit.Before; -import org.tikv.common.TiConfiguration.KVMode; import org.tikv.common.region.TiRegion; +import org.tikv.kvproto.Coprocessor; import org.tikv.kvproto.Kvrpcpb; import org.tikv.kvproto.Metapb; import org.tikv.kvproto.Pdpb; @@ -39,8 +40,7 @@ public void setUp() throws Exception { .build(); region = - new TiRegion( - r, r.getPeers(0), Kvrpcpb.IsolationLevel.RC, Kvrpcpb.CommandPri.Low, KVMode.TXN); + new TiRegion(r, r.getPeers(0), Kvrpcpb.IsolationLevel.RC, Kvrpcpb.CommandPri.Low, "KV"); pdServer.addGetRegionResp(Pdpb.GetRegionResponse.newBuilder().setRegion(r).build()); server = new KVMockServer(); port = server.start(region); @@ -52,6 +52,10 @@ public void setUp() throws Exception { @After public void tearDown() throws Exception { server.stop(); - session.close(); + } + + @VisibleForTesting + protected static Coprocessor.KeyRange createByteStringRange(ByteString sKey, ByteString eKey) { + return Coprocessor.KeyRange.newBuilder().setStart(sKey).setEnd(eKey).build(); } } diff --git a/src/test/java/org/tikv/common/PDClientTest.java b/src/test/java/org/tikv/common/PDClientTest.java index d3701566b3e..d34147c758b 100644 --- a/src/test/java/org/tikv/common/PDClientTest.java +++ b/src/test/java/org/tikv/common/PDClientTest.java @@ -56,7 +56,6 @@ public void setup() throws IOException { @After public void tearDown() { - session.close(); server.stop(); } @@ -196,14 +195,14 @@ public void testGetStore() throws Exception { GrpcUtils.makeStore( storeId, testAddress, - Metapb.StoreState.Up, + StoreState.Up, GrpcUtils.makeStoreLabel("k1", "v1"), GrpcUtils.makeStoreLabel("k2", "v2")))); try (PDClient client = session.getPDClient()) { Store r = client.getStore(defaultBackOff(), 0); assertEquals(r.getId(), storeId); assertEquals(r.getAddress(), testAddress); - assertEquals(r.getState(), Metapb.StoreState.Up); + assertEquals(r.getState(), StoreState.Up); assertEquals(r.getLabels(0).getKey(), "k1"); assertEquals(r.getLabels(1).getKey(), "k2"); assertEquals(r.getLabels(0).getValue(), "v1"); @@ -212,7 +211,7 @@ public void testGetStore() throws Exception { server.addGetStoreResp( GrpcUtils.makeGetStoreResponse( server.getClusterId(), - GrpcUtils.makeStore(storeId, testAddress, Metapb.StoreState.Tombstone))); + GrpcUtils.makeStore(storeId, testAddress, StoreState.Tombstone))); assertEquals(StoreState.Tombstone, client.getStore(defaultBackOff(), 0).getState()); } } @@ -227,14 +226,14 @@ public void testGetStoreAsync() throws Exception { GrpcUtils.makeStore( storeId, testAddress, - Metapb.StoreState.Up, + 
StoreState.Up, GrpcUtils.makeStoreLabel("k1", "v1"), GrpcUtils.makeStoreLabel("k2", "v2")))); try (PDClient client = session.getPDClient()) { Store r = client.getStoreAsync(defaultBackOff(), 0).get(); assertEquals(r.getId(), storeId); assertEquals(r.getAddress(), testAddress); - assertEquals(r.getState(), Metapb.StoreState.Up); + assertEquals(r.getState(), StoreState.Up); assertEquals(r.getLabels(0).getKey(), "k1"); assertEquals(r.getLabels(1).getKey(), "k2"); assertEquals(r.getLabels(0).getValue(), "v1"); @@ -243,7 +242,7 @@ public void testGetStoreAsync() throws Exception { server.addGetStoreResp( GrpcUtils.makeGetStoreResponse( server.getClusterId(), - GrpcUtils.makeStore(storeId, testAddress, Metapb.StoreState.Tombstone))); + GrpcUtils.makeStore(storeId, testAddress, StoreState.Tombstone))); assertEquals( StoreState.Tombstone, client.getStoreAsync(defaultBackOff(), 0).get().getState()); } @@ -261,7 +260,7 @@ public void testRetryPolicy() throws Exception { server.addGetStoreResp(null); server.addGetStoreResp( GrpcUtils.makeGetStoreResponse( - server.getClusterId(), GrpcUtils.makeStore(storeId, "", Metapb.StoreState.Up))); + server.getClusterId(), GrpcUtils.makeStore(storeId, "", StoreState.Up))); try (PDClient client = session.getPDClient()) { Callable storeCallable = () -> client.getStore(ConcreteBackOffer.newCustomBackOff(5000), 0); @@ -283,7 +282,7 @@ public void testRetryPolicy() throws Exception { server.addGetStoreResp( GrpcUtils.makeGetStoreResponse( - server.getClusterId(), GrpcUtils.makeStore(storeId, "", Metapb.StoreState.Up))); + server.getClusterId(), GrpcUtils.makeStore(storeId, "", StoreState.Up))); try { client.getStore(defaultBackOff(), 0); } catch (GrpcException e) { diff --git a/src/test/java/org/tikv/common/PDMockServer.java b/src/test/java/org/tikv/common/PDMockServer.java index 81e9c64bd2d..9a0588054ba 100644 --- a/src/test/java/org/tikv/common/PDMockServer.java +++ b/src/test/java/org/tikv/common/PDMockServer.java @@ -37,8 +37,8 @@ public void addGetMemberResp(GetMembersResponse r) { getMembersResp.addLast(Optional.ofNullable(r)); } - private final Deque> getMembersResp = - new LinkedBlockingDeque>(); + private final Deque> getMembersResp = + new LinkedBlockingDeque>(); @Override public void getMembers(GetMembersRequest request, StreamObserver resp) { diff --git a/src/test/java/org/tikv/common/RegionManagerTest.java b/src/test/java/org/tikv/common/RegionManagerTest.java index e55bb36616c..39e786f93ff 100644 --- a/src/test/java/org/tikv/common/RegionManagerTest.java +++ b/src/test/java/org/tikv/common/RegionManagerTest.java @@ -34,7 +34,6 @@ public class RegionManagerTest { private static final long CLUSTER_ID = 1024; private static final String LOCAL_ADDR = "127.0.0.1"; private RegionManager mgr; - private TiSession session; @Before public void setup() throws IOException { @@ -48,14 +47,13 @@ public void setup() throws IOException { GrpcUtils.makeMember(2, "http://" + LOCAL_ADDR + ":" + (server.port + 2)))); TiConfiguration conf = TiConfiguration.createDefault("127.0.0.1:" + server.port); - session = TiSession.create(conf); - mgr = new RegionManager(session.getPDClient()); + TiSession session = TiSession.create(conf); + mgr = session.getRegionManager(); } @After public void tearDown() { server.stop(); - session.close(); } @Test @@ -88,7 +86,7 @@ public void getRegionByKey() throws Exception { try { mgr.getRegionByKey(searchKeyNotExists); fail(); - } catch (Exception ignored) { + } catch (Exception e) { } } @@ -118,7 +116,7 @@ public void getStoreByKey() throws 
Exception { GrpcUtils.makeStore( storeId, testAddress, - Metapb.StoreState.Up, + StoreState.Up, GrpcUtils.makeStoreLabel("k1", "v1"), GrpcUtils.makeStoreLabel("k2", "v2")))); Pair pair = mgr.getRegionStorePairByKey(searchKey); @@ -171,7 +169,7 @@ public void getStoreById() throws Exception { GrpcUtils.makeStore( storeId, testAddress, - Metapb.StoreState.Up, + StoreState.Up, GrpcUtils.makeStoreLabel("k1", "v1"), GrpcUtils.makeStoreLabel("k2", "v2")))); Store store = mgr.getStoreById(storeId); @@ -192,7 +190,7 @@ public void getStoreById() throws Exception { try { mgr.getStoreById(storeId); fail(); - } catch (Exception ignored) { + } catch (Exception e) { } } } diff --git a/src/test/java/org/tikv/common/RegionStoreClientTest.java b/src/test/java/org/tikv/common/RegionStoreClientTest.java index e6aad30e9f8..01acfc0ca02 100644 --- a/src/test/java/org/tikv/common/RegionStoreClientTest.java +++ b/src/test/java/org/tikv/common/RegionStoreClientTest.java @@ -21,9 +21,7 @@ import com.google.protobuf.ByteString; import java.util.List; import org.junit.Test; -import org.tikv.common.region.RegionManager; import org.tikv.common.region.RegionStoreClient; -import org.tikv.common.region.RegionStoreClient.RegionStoreClientBuilder; import org.tikv.common.util.BackOffer; import org.tikv.common.util.ConcreteBackOffer; import org.tikv.kvproto.Kvrpcpb; @@ -39,13 +37,7 @@ private RegionStoreClient createClient() { .setState(Metapb.StoreState.Up) .build(); - RegionStoreClientBuilder builder = - new RegionStoreClientBuilder( - session.getConf(), - session.getChannelFactory(), - new RegionManager(session.getPDClient())); - - return builder.build(region, store); + return RegionStoreClient.create(region, store, session); } @Test diff --git a/src/test/java/org/tikv/common/codec/CodecTest.java b/src/test/java/org/tikv/common/codec/CodecTest.java index ee474b69aa1..ac361d1e68a 100644 --- a/src/test/java/org/tikv/common/codec/CodecTest.java +++ b/src/test/java/org/tikv/common/codec/CodecTest.java @@ -158,7 +158,7 @@ private static byte[] toBytes(int[] arr) { @Test public void writeBytesTest() throws Exception { CodecDataOutput cdo = new CodecDataOutput(); - Codec.BytesCodec.writeBytes(cdo, "abcdefghijk".getBytes()); + BytesCodec.writeBytes(cdo, "abcdefghijk".getBytes()); byte[] result = cdo.toBytes(); byte[] expected = toBytes( @@ -168,19 +168,19 @@ public void writeBytesTest() throws Exception { assertArrayEquals(expected, result); cdo.reset(); - Codec.BytesCodec.writeBytes(cdo, "abcdefghijk".getBytes()); + BytesCodec.writeBytes(cdo, "abcdefghijk".getBytes()); result = BytesCodec.readBytes(new CodecDataInput(cdo.toBytes())); expected = toBytes(new int[] {97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107}); assertArrayEquals(expected, result); cdo.reset(); - Codec.BytesCodec.writeBytes(cdo, "fYfSp".getBytes()); + BytesCodec.writeBytes(cdo, "fYfSp".getBytes()); result = cdo.toBytes(); expected = toBytes(new int[] {102, 89, 102, 83, 112, 0, 0, 0, 252}); assertArrayEquals(expected, result); cdo.reset(); - Codec.BytesCodec.writeBytesRaw(cdo, "fYfSp".getBytes()); + BytesCodec.writeBytesRaw(cdo, "fYfSp".getBytes()); result = cdo.toBytes(); expected = toBytes(new int[] {102, 89, 102, 83, 112}); assertArrayEquals(expected, result); diff --git a/src/test/java/org/tikv/raw/RawKVClientTest.java b/src/test/java/org/tikv/raw/RawKVClientTest.java index 41eac01901f..50fb046ed96 100644 --- a/src/test/java/org/tikv/raw/RawKVClientTest.java +++ b/src/test/java/org/tikv/raw/RawKVClientTest.java @@ -1,17 +1,13 @@ package org.tikv.raw; 
import com.google.protobuf.ByteString; -import java.io.IOException; import java.util.*; import java.util.concurrent.*; import java.util.stream.Collectors; import org.apache.commons.lang3.RandomStringUtils; import org.apache.log4j.Logger; -import org.junit.After; import org.junit.Before; import org.junit.Test; -import org.tikv.common.TiConfiguration; -import org.tikv.common.TiSession; import org.tikv.common.codec.KeyUtils; import org.tikv.common.exception.TiKVException; import org.tikv.common.key.Key; @@ -19,7 +15,6 @@ import org.tikv.kvproto.Kvrpcpb; public class RawKVClientTest { - private static final String DEFAULT_PD_ADDRESS = "127.0.0.1:2379"; private static final String RAW_PREFIX = "raw_\\u0001_"; private static final int KEY_POOL_SIZE = 1000000; private static final int TEST_CASES = 10000; @@ -38,7 +33,6 @@ public class RawKVClientTest { private final ExecutorCompletionService completionService = new ExecutorCompletionService<>(executors); private static final Logger logger = Logger.getLogger(RawKVClientTest.class); - private TiSession session; static { orderedKeys = new ArrayList<>(); @@ -64,12 +58,11 @@ private static ByteString getRandomValue() { } @Before - public void setup() throws IOException { + public void setClient() { try { - session = TiSession.create(TiConfiguration.createDefault(DEFAULT_PD_ADDRESS)); initialized = false; if (client == null) { - client = session.createRawClient(); + client = RawKVClient.create(); } data = new TreeMap<>(bsc); initialized = true; @@ -78,11 +71,6 @@ public void setup() throws IOException { } } - @After - public void tearDown() { - session.close(); - } - @Test public void simpleTest() { if (!initialized) return; diff --git a/src/test/java/org/tikv/txn/LockResolverTest.java b/src/test/java/org/tikv/txn/LockResolverTest.java new file mode 100644 index 00000000000..0f736da43d1 --- /dev/null +++ b/src/test/java/org/tikv/txn/LockResolverTest.java @@ -0,0 +1,516 @@ +/* + * Copyright 2017 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +package org.tikv.txn; + +import static junit.framework.TestCase.*; +import static org.tikv.common.util.BackOffFunction.BackOffFuncType.BoTxnLock; + +import com.google.protobuf.ByteString; +import java.lang.reflect.Field; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.function.Supplier; +import org.junit.Before; +import org.junit.Test; +import org.tikv.common.PDClient; +import org.tikv.common.ReadOnlyPDClient; +import org.tikv.common.TiConfiguration; +import org.tikv.common.TiSession; +import org.tikv.common.exception.KeyException; +import org.tikv.common.exception.RegionException; +import org.tikv.common.meta.TiTimestamp; +import org.tikv.common.operation.KVErrorHandler; +import org.tikv.common.region.RegionStoreClient; +import org.tikv.common.region.TiRegion; +import org.tikv.common.util.BackOffer; +import org.tikv.common.util.ConcreteBackOffer; +import org.tikv.common.util.Pair; +import org.tikv.kvproto.Coprocessor; +import org.tikv.kvproto.Kvrpcpb.*; +import org.tikv.kvproto.Metapb.Store; +import org.tikv.kvproto.TikvGrpc; + +public class LockResolverTest { + private TiSession session; + private static final int DefaultTTL = 10; + private boolean init = false; + private BackOffer backOffer = ConcreteBackOffer.newCustomBackOff(1000); + private ReadOnlyPDClient pdClient; + + private void putKV(String key, String value, long startTS, long commitTS) { + Mutation m = + Mutation.newBuilder() + .setKey(ByteString.copyFromUtf8(key)) + .setOp(Op.Put) + .setValue(ByteString.copyFromUtf8(value)) + .build(); + + boolean res = prewrite(Arrays.asList(m), startTS, m); + assertTrue(res); + res = commit(startTS, commitTS, Arrays.asList(ByteString.copyFromUtf8(key))); + assertTrue(res); + } + + private boolean prewrite(List mutations, long startTS, Mutation primary) { + if (mutations.size() == 0) return true; + + for (Mutation m : mutations) { + Pair pair = session.getRegionManager().getRegionStorePairByKey(m.getKey()); + + RegionStoreClient client = RegionStoreClient.create(pair.first, pair.second, session); + + Supplier factory = + () -> + PrewriteRequest.newBuilder() + .addAllMutations(Arrays.asList(m)) + .setPrimaryLock(primary.getKey()) + .setStartVersion(startTS) + .setLockTtl(DefaultTTL) + .setContext(pair.first.getContext()) + .build(); + + KVErrorHandler handler = + new KVErrorHandler<>( + session.getRegionManager(), + client, + pair.first, + resp -> resp.hasRegionError() ? 
resp.getRegionError() : null); + + PrewriteResponse resp = + client.callWithRetry(backOffer, TikvGrpc.METHOD_KV_PREWRITE, factory, handler); + + if (resp.hasRegionError()) { + throw new RegionException(resp.getRegionError()); + } + + if (resp.getErrorsCount() == 0) { + continue; + } + + List locks = new ArrayList<>(); + for (KeyError err : resp.getErrorsList()) { + if (err.hasLocked()) { + Lock lock = new Lock(err.getLocked()); + locks.add(lock); + } else { + throw new KeyException(err); + } + } + + LockResolverClient resolver = null; + try { + Field field = RegionStoreClient.class.getDeclaredField("lockResolverClient"); + assert (field != null); + field.setAccessible(true); + resolver = (LockResolverClient) (field.get(client)); + } catch (Exception e) { + fail(); + } + + assertNotNull(resolver); + + if (!resolver.resolveLocks(backOffer, locks)) { + backOffer.doBackOff(BoTxnLock, new KeyException(resp.getErrorsList().get(0))); + } + + prewrite(Arrays.asList(m), startTS, primary); + } + + return true; + } + + private boolean lockKey( + String key, + String value, + String primaryKey, + String primaryValue, + boolean commitPrimary, + long startTs, + long commitTS) { + List mutations = new ArrayList<>(); + mutations.add( + Mutation.newBuilder() + .setKey(ByteString.copyFromUtf8(primaryKey)) + .setValue(ByteString.copyFromUtf8(primaryValue)) + .setOp(Op.Put) + .build()); + if (!key.equals(primaryKey)) { + mutations.add( + Mutation.newBuilder() + .setKey(ByteString.copyFromUtf8(key)) + .setValue(ByteString.copyFromUtf8(value)) + .setOp(Op.Put) + .build()); + } + if (!prewrite(mutations, startTs, mutations.get(0))) return false; + + if (commitPrimary) { + if (!key.equals(primaryKey)) { + if (!commit( + startTs, + commitTS, + Arrays.asList(ByteString.copyFromUtf8(primaryKey), ByteString.copyFromUtf8(key)))) { + return false; + } + } else { + if (!commit(startTs, commitTS, Arrays.asList(ByteString.copyFromUtf8(primaryKey)))) { + return false; + } + } + } + + return true; + } + + private boolean commit(long startTS, long commitTS, List keys) { + if (keys.size() == 0) return true; + + for (ByteString k : keys) { + Pair pair = session.getRegionManager().getRegionStorePairByKey(k); + + RegionStoreClient client = RegionStoreClient.create(pair.first, pair.second, session); + Supplier factory = + () -> + CommitRequest.newBuilder() + .setStartVersion(startTS) + .setCommitVersion(commitTS) + .addAllKeys(Arrays.asList(k)) + .setContext(pair.first.getContext()) + .build(); + + KVErrorHandler handler = + new KVErrorHandler<>( + session.getRegionManager(), + client, + pair.first, + resp -> resp.hasRegionError() ? 
resp.getRegionError() : null); + + CommitResponse resp = + client.callWithRetry(backOffer, TikvGrpc.METHOD_KV_COMMIT, factory, handler); + + if (resp.hasRegionError()) { + throw new RegionException(resp.getRegionError()); + } + + if (resp.hasError()) { + throw new KeyException(resp.getError()); + } + } + return true; + } + + private void putAlphabet() { + for (int i = 0; i < 26; i++) { + long startTs = pdClient.getTimestamp(backOffer).getVersion(); + long endTs = pdClient.getTimestamp(backOffer).getVersion(); + while (startTs == endTs) { + endTs = pdClient.getTimestamp(backOffer).getVersion(); + } + putKV(String.valueOf((char) ('a' + i)), String.valueOf((char) ('a' + i)), startTs, endTs); + } + for (int i = 0; i < 26; i++) { + Pair pair = + session + .getRegionManager() + .getRegionStorePairByKey(ByteString.copyFromUtf8(String.valueOf((char) ('a' + i)))); + RegionStoreClient client = RegionStoreClient.create(pair.first, pair.second, session); + ByteString v = + client.get( + backOffer, + ByteString.copyFromUtf8(String.valueOf((char) ('a' + i))), + pdClient.getTimestamp(backOffer).getVersion()); + assertEquals(v.toStringUtf8(), String.valueOf((char) ('a' + i))); + } + } + + private void prepareAlphabetLocks() { + TiTimestamp startTs = pdClient.getTimestamp(backOffer); + TiTimestamp endTs = pdClient.getTimestamp(backOffer); + while (startTs.getVersion() == endTs.getVersion()) { + endTs = pdClient.getTimestamp(backOffer); + } + putKV("c", "cc", startTs.getVersion(), endTs.getVersion()); + startTs = pdClient.getTimestamp(backOffer); + endTs = pdClient.getTimestamp(backOffer); + while (startTs.getVersion() == endTs.getVersion()) { + endTs = pdClient.getTimestamp(backOffer); + } + + assertTrue(lockKey("c", "c", "z1", "z1", true, startTs.getVersion(), endTs.getVersion())); + startTs = pdClient.getTimestamp(backOffer); + endTs = pdClient.getTimestamp(backOffer); + while (startTs.getVersion() == endTs.getVersion()) { + endTs = pdClient.getTimestamp(backOffer); + } + assertTrue(lockKey("d", "dd", "z2", "z2", false, startTs.getVersion(), endTs.getVersion())); + } + + private BackOffer defaultBackOff() { + return ConcreteBackOffer.newCustomBackOff(1000); + } + + private class RetryException extends RuntimeException { + public RetryException() {} + } + + @Before + public void setUp() throws Exception { + TiConfiguration conf = TiConfiguration.createDefault("127.0.0.1:2379"); + session = TiSession.create(conf); + try { + pdClient = PDClient.create(session); + init = true; + } catch (Exception e) { + init = false; + } + } + + @Test + public void getSITest() throws Exception { + if (!init) { + System.out.println("PD client not initialized. Test skipped"); + return; + } + session.getConf().setIsolationLevel(IsolationLevel.SI); + putAlphabet(); + prepareAlphabetLocks(); + + for (int i = 0; i < 26; i++) { + Pair pair = + session + .getRegionManager() + .getRegionStorePairByKey(ByteString.copyFromUtf8(String.valueOf((char) ('a' + i)))); + RegionStoreClient client = RegionStoreClient.create(pair.first, pair.second, session); + ByteString v = + client.get( + backOffer, + ByteString.copyFromUtf8(String.valueOf((char) ('a' + i))), + pdClient.getTimestamp(backOffer).getVersion()); + assertEquals(v.toStringUtf8(), String.valueOf((char) ('a' + i))); + } + + session.getConf().setIsolationLevel(IsolationLevel.RC); + } + + @Test + public void getRCTest() { + if (!init) { + System.out.println("PD client not initialized. 
Test skipped"); + return; + } + session.getConf().setIsolationLevel(IsolationLevel.RC); + putAlphabet(); + prepareAlphabetLocks(); + + for (int i = 0; i < 26; i++) { + Pair pair = + session + .getRegionManager() + .getRegionStorePairByKey(ByteString.copyFromUtf8(String.valueOf((char) ('a' + i)))); + RegionStoreClient client = RegionStoreClient.create(pair.first, pair.second, session); + ByteString v = + client.get( + backOffer, + ByteString.copyFromUtf8(String.valueOf((char) ('a' + i))), + pdClient.getTimestamp(backOffer).getVersion()); + assertEquals(v.toStringUtf8(), String.valueOf((char) ('a' + i))); + } + } + + @Test + public void cleanLockTest() { + if (!init) { + System.out.println("PD client not initialized. Test skipped"); + return; + } + session.getConf().setIsolationLevel(IsolationLevel.SI); + for (int i = 0; i < 26; i++) { + String k = String.valueOf((char) ('a' + i)); + TiTimestamp startTs = pdClient.getTimestamp(backOffer); + TiTimestamp endTs = pdClient.getTimestamp(backOffer); + lockKey(k, k, k, k, false, startTs.getVersion(), endTs.getVersion()); + } + + List mutations = new ArrayList<>(); + List keys = new ArrayList<>(); + for (int i = 0; i < 26; i++) { + String k = String.valueOf((char) ('a' + i)); + String v = String.valueOf((char) ('a' + i + 1)); + Mutation m = + Mutation.newBuilder() + .setKey(ByteString.copyFromUtf8(k)) + .setOp(Op.Put) + .setValue(ByteString.copyFromUtf8(v)) + .build(); + mutations.add(m); + keys.add(ByteString.copyFromUtf8(k)); + } + + TiTimestamp startTs = pdClient.getTimestamp(backOffer); + TiTimestamp endTs = pdClient.getTimestamp(backOffer); + + boolean res = prewrite(mutations, startTs.getVersion(), mutations.get(0)); + assertTrue(res); + res = commit(startTs.getVersion(), endTs.getVersion(), keys); + assertTrue(res); + + for (int i = 0; i < 26; i++) { + Pair pair = + session + .getRegionManager() + .getRegionStorePairByKey(ByteString.copyFromUtf8(String.valueOf((char) ('a' + i)))); + RegionStoreClient client = RegionStoreClient.create(pair.first, pair.second, session); + ByteString v = + client.get( + backOffer, + ByteString.copyFromUtf8(String.valueOf((char) ('a' + i))), + pdClient.getTimestamp(backOffer).getVersion()); + assertEquals(v.toStringUtf8(), String.valueOf((char) ('a' + i + 1))); + } + + session.getConf().setIsolationLevel(IsolationLevel.RC); + } + + @Test + public void txnStatusTest() { + if (!init) { + System.out.println("PD client not initialized. 
Test skipped"); + return; + } + session.getConf().setIsolationLevel(IsolationLevel.SI); + TiTimestamp startTs = pdClient.getTimestamp(backOffer); + TiTimestamp endTs = pdClient.getTimestamp(backOffer); + + putKV("a", "a", startTs.getVersion(), endTs.getVersion()); + Pair pair = + session + .getRegionManager() + .getRegionStorePairByKey(ByteString.copyFromUtf8(String.valueOf((char) ('a')))); + RegionStoreClient client = RegionStoreClient.create(pair.first, pair.second, session); + long status = + client.lockResolverClient.getTxnStatus( + backOffer, startTs.getVersion(), ByteString.copyFromUtf8(String.valueOf((char) ('a')))); + assertEquals(status, endTs.getVersion()); + + startTs = pdClient.getTimestamp(backOffer); + endTs = pdClient.getTimestamp(backOffer); + + lockKey("a", "a", "a", "a", true, startTs.getVersion(), endTs.getVersion()); + pair = + session + .getRegionManager() + .getRegionStorePairByKey(ByteString.copyFromUtf8(String.valueOf((char) ('a')))); + client = RegionStoreClient.create(pair.first, pair.second, session); + status = + client.lockResolverClient.getTxnStatus( + backOffer, startTs.getVersion(), ByteString.copyFromUtf8(String.valueOf((char) ('a')))); + assertEquals(status, endTs.getVersion()); + + startTs = pdClient.getTimestamp(backOffer); + endTs = pdClient.getTimestamp(backOffer); + + lockKey("a", "a", "a", "a", false, startTs.getVersion(), endTs.getVersion()); + pair = + session + .getRegionManager() + .getRegionStorePairByKey(ByteString.copyFromUtf8(String.valueOf((char) ('a')))); + client = RegionStoreClient.create(pair.first, pair.second, session); + status = + client.lockResolverClient.getTxnStatus( + backOffer, startTs.getVersion(), ByteString.copyFromUtf8(String.valueOf((char) ('a')))); + assertNotSame(status, endTs.getVersion()); + + session.getConf().setIsolationLevel(IsolationLevel.RC); + } + + @Test + public void SITest() { + if (!init) { + System.out.println("PD client not initialized. Test skipped"); + return; + } + session.getConf().setIsolationLevel(IsolationLevel.SI); + TiTimestamp startTs = pdClient.getTimestamp(backOffer); + TiTimestamp endTs = pdClient.getTimestamp(backOffer); + + putKV("a", "a", startTs.getVersion(), endTs.getVersion()); + + startTs = pdClient.getTimestamp(backOffer); + endTs = pdClient.getTimestamp(backOffer); + + lockKey("a", "aa", "a", "aa", false, startTs.getVersion(), endTs.getVersion()); + + Pair pair = + session + .getRegionManager() + .getRegionStorePairByKey(ByteString.copyFromUtf8(String.valueOf((char) ('a')))); + RegionStoreClient client = RegionStoreClient.create(pair.first, pair.second, session); + ByteString v = + client.get( + backOffer, + ByteString.copyFromUtf8(String.valueOf((char) ('a'))), + pdClient.getTimestamp(backOffer).getVersion()); + assertEquals(v.toStringUtf8(), String.valueOf((char) ('a'))); + + try { + commit(startTs.getVersion(), endTs.getVersion(), Arrays.asList(ByteString.copyFromUtf8("a"))); + fail(); + } catch (KeyException e) { + assertNotNull(e.getKeyErr().getRetryable()); + } + session.getConf().setIsolationLevel(IsolationLevel.RC); + } + + @Test + public void RCTest() { + if (!init) { + System.out.println("PD client not initialized. 
Test skipped"); + return; + } + session.getConf().setIsolationLevel(IsolationLevel.RC); + TiTimestamp startTs = pdClient.getTimestamp(backOffer); + TiTimestamp endTs = pdClient.getTimestamp(backOffer); + + putKV("a", "a", startTs.getVersion(), endTs.getVersion()); + + startTs = pdClient.getTimestamp(backOffer); + endTs = pdClient.getTimestamp(backOffer); + + lockKey("a", "aa", "a", "aa", false, startTs.getVersion(), endTs.getVersion()); + + Pair pair = + session + .getRegionManager() + .getRegionStorePairByKey(ByteString.copyFromUtf8(String.valueOf((char) ('a')))); + RegionStoreClient client = RegionStoreClient.create(pair.first, pair.second, session); + ByteString v = + client.get( + backOffer, + ByteString.copyFromUtf8(String.valueOf((char) ('a'))), + pdClient.getTimestamp(backOffer).getVersion()); + assertEquals(v.toStringUtf8(), String.valueOf((char) ('a'))); + + try { + commit(startTs.getVersion(), endTs.getVersion(), Arrays.asList(ByteString.copyFromUtf8("a"))); + } catch (KeyException e) { + fail(); + } + } + + private static Coprocessor.KeyRange createByteStringRange(ByteString sKey, ByteString eKey) { + return Coprocessor.KeyRange.newBuilder().setStart(sKey).setEnd(eKey).build(); + } +} diff --git a/src/test/java/org/tikv/txn/TxnKVClientTest.java b/src/test/java/org/tikv/txn/TxnKVClientTest.java new file mode 100644 index 00000000000..17f1f553648 --- /dev/null +++ b/src/test/java/org/tikv/txn/TxnKVClientTest.java @@ -0,0 +1,275 @@ +package org.tikv.txn; + +import com.google.common.collect.Lists; +import com.google.protobuf.ByteString; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.tikv.common.region.TiRegion; +import org.tikv.common.util.ConcreteBackOffer; +import org.tikv.common.util.Pair; +import org.tikv.kvproto.Kvrpcpb; + +import java.io.InputStream; +import java.security.SecureRandom; +import java.util.List; +import java.util.Properties; +import java.util.function.Function; + +public class TxnKVClientTest { + + TxnKVClient txnClient; + + final String configPath = "kv-config.properties"; + + Properties configPros; + + @Before + public void setClient() { + try { + InputStream is = getClass().getClassLoader().getResourceAsStream(configPath); + this.configPros = new Properties(); + this.configPros.load(is); + String pdAddress = this.configPros.getProperty("kv.pd.address"); + System.out.println("pdAddress is: " + pdAddress); + Assert.assertNotNull("pd address should not null", pdAddress); + txnClient = TxnKVClient.createClient(pdAddress); + } catch (Exception e) { + System.out.println("Cannot initialize txn client. 
Test skipped."); + } + } + + @Test + public void testScan() { + byte[] startKey = ByteString.copyFromUtf8("test").toByteArray(); + List> result = txnClient.scan(startKey, 10); + for(Pair item : result) { + String key = new String(item.first); + String value = new String(item.second); + System.out.println("Key=" + key + ", Value=" + value); + } + } + + @Test + public void testGet(){ + String key = "txn_test_set_0001"; + byte[] rawValue = txnClient.get(key.getBytes()); + String value = new String(rawValue); + System.out.println("Value=" + value); + } + + @Test + public void testPut() { + String key = "test_AAAAGAaJwbZnjgaPvypwZTiuMBFirzPf"; + String value = "put_value"; + boolean result = txnClient.put(key.getBytes(), value.getBytes()); + System.out.println("Put result=" + result); + } + + @Test + public void testTxnCommitSuccess() { + String key = "test_AAAAGAaJwbZnjgaPvypwZTiuMBFirzPf"; + String value = "put_value 2222"; + ITransaction txn = this.txnClient.begin(); + try { + txn.set(key.getBytes(), value.getBytes()); + txn.set("txn_test_set_0001".getBytes(), value.getBytes()); + boolean result = txn.commit(); + System.out.println("commit result=" + result); + } catch (Exception e) { + txn.rollback(); + } + } + + @Test + public void testTxnCommitWithConflict() { + ByteString key = ByteString.copyFromUtf8("test_AAAAGAaJwbZnjgaPvypwZTiuMBFirzPf_primary_001"); + ByteString value = ByteString.copyFromUtf8("put_value primary"); + ByteString key2 = ByteString.copyFromUtf8("txn_test_secondary_0001"); + ByteString value2 = ByteString.copyFromUtf8("put_value secondary"); + long startVersion = txnClient.getTimestamp().getVersion(); + ITransaction txn = this.txnClient.begin(); + txn.set(key.toByteArray(), value.toByteArray()); + txn.set(key2.toByteArray(), value2.toByteArray()); + try { + Kvrpcpb.Mutation mutation = Kvrpcpb.Mutation.newBuilder() + .setKey(key) + .setValue(value) + .setOp(Kvrpcpb.Op.Put) + .build(); + System.out.println("startVersion=" + startVersion); + TiRegion region = txnClient.getSession().getRegionManager().getRegionByKey(key); + txnClient.prewrite(ConcreteBackOffer.newCustomBackOff(2000), Lists.newArrayList(mutation), key.toByteArray(), + 1000, startVersion, region.getId()); + boolean result = txn.commit(); + System.out.println("commit result=" + result); + } catch (Exception e) { + txn.rollback(); + } + } + + /** + * func BackOff(attempts uint) int { + * upper := int(math.Min(float64(retryBackOffCap), float64(retryBackOffBase)*math.Pow(2.0, float64(attempts)))) + * sleep := time.Duration(rand.Intn(upper)) * time.Millisecond + * time.Sleep(sleep) + * return int(sleep) + * } + */ + int retryBackOffCap = 100; + int retryBackOffBase = 1; + int maxRetryCnt = 100; + SecureRandom random = new SecureRandom(); + + private int backoff(int attempts) { + int upper = (int)(Math.min(retryBackOffCap, retryBackOffBase * Math.pow(2.0, attempts))); + int sleep = random.nextInt(upper); + try { + Thread.sleep(sleep); + System.out.println("sleep " + sleep + " at attempts " + attempts); + } catch (InterruptedException e) { + e.printStackTrace(); + } + return sleep; + } + + /** + * + * @param function + * @return + */ + private boolean runTxnWithRetry(Function function) { + for(int i=0 ; i < maxRetryCnt; i++) { + ITransaction txn = txnClient.begin(); + Boolean result = function.apply(txn); + if(!result) { + txn.rollback(); + continue; + } + boolean commit = txn.commit(); + if(commit) { + return true; + } + backoff(i); + } + return false; + } + + @Test + public void testCommit_NoConflict_Success() { + 
+    ByteString key = ByteString.copyFromUtf8("test_AAAAGAaJwbZnjgaPvypwZTiuMBFirzPf_primary_001");
+    System.out.println("old value=" + new String(txnClient.get(key.toByteArray())));
+
+    boolean commit1 = runTxnWithRetry((txn) -> {
+      byte[] txn1Value = txn.get(key.toByteArray());
+      System.out.println("txn1 start ts=" + txn.getStartTS());
+      long txn1NewValue = Long.valueOf(new String(txn1Value)) + 1;
+      System.out.println("txn1 new value=" + txn1NewValue);
+      txn.set(key.toByteArray(), ByteString.copyFromUtf8(txn1NewValue + "").toByteArray());
+      // txn.lockKeys(Key.toRawKey(key.toByteArray()));
+      return true;
+    });
+    boolean commit2 = runTxnWithRetry((txn) -> {
+      byte[] txn2Value = txn.get(key.toByteArray());
+      System.out.println("txn2 start ts=" + txn.getStartTS());
+      long txn2NewValue = Long.valueOf(new String(txn2Value)) + 1;
+      System.out.println("txn2 new value=" + txn2NewValue);
+      txn.set(key.toByteArray(), ByteString.copyFromUtf8(txn2NewValue + "").toByteArray());
+      return true;
+    });
+
+    System.out.println("commit result1=" + commit1);
+    System.out.println("commit result2=" + commit2);
+
+    System.out.println("new value=" + new String(txnClient.get(key.toByteArray())));
+  }
+
+  @Test
+  public void testCommit_Retry() {
+    ByteString key = ByteString.copyFromUtf8("test_AAAAGAaJwbZnjgaPvypwZTiuMBFirzPf_primary_001");
+    System.out.println("old value=" + new String(txnClient.get(key.toByteArray())));
+
+    ITransaction txn1 = txnClient.begin((txn) -> {
+      byte[] txn1Value = txn.get(key.toByteArray());
+      System.out.println("txn1 start ts=" + txn.getStartTS());
+      long txn1NewValue = Long.valueOf(new String(txn1Value)) + 1;
+      System.out.println("txn1 new value=" + txn1NewValue);
+      txn.set(key.toByteArray(), ByteString.copyFromUtf8(txn1NewValue + "").toByteArray());
+      return true;
+    });
+
+    ITransaction txn2 = txnClient.begin((txn) -> {
+      byte[] txn2Value = txn.get(key.toByteArray());
+      System.out.println("txn2 start ts=" + txn.getStartTS());
+      long txn2NewValue = Long.valueOf(new String(txn2Value)) + 1;
+      System.out.println("txn2 new value=" + txn2NewValue);
+      txn.set(key.toByteArray(), ByteString.copyFromUtf8(txn2NewValue + "").toByteArray());
+      return true;
+    });
+    boolean commit1 = txn1.commit();
+    boolean commit2 = txn2.commit();
+
+    System.out.println("commit result1=" + commit1);
+    System.out.println("commit result2=" + commit2);
+
+    System.out.println("new value=" + new String(txnClient.get(key.toByteArray())));
+  }
+
+  @Test
+  public void testCommit_Conflict_RetrySuccess() {
+    ByteString key = ByteString.copyFromUtf8("test_AAAAGAaJwbZnjgaPvypwZTiuMBFirzPf_primary_001");
+    System.out.println("old value=" + new String(txnClient.get(key.toByteArray())));
+
+    ITransaction txn1 = txnClient.begin((txn) -> {
+      byte[] txn1Value = txn.get(key.toByteArray());
+      System.out.println("txn1 start ts=" + txn.getStartTS());
+      long txn1NewValue = Long.valueOf(new String(txn1Value)) + 1;
+      System.out.println("txn1 new value=" + txn1NewValue);
+      txn.set(key.toByteArray(), ByteString.copyFromUtf8(txn1NewValue + "").toByteArray());
+      return true;
+    });
+
+    ITransaction txn2 = txnClient.begin((txn) -> {
+      byte[] txn2Value = txn.get(key.toByteArray());
+      System.out.println("txn2 start ts=" + txn.getStartTS());
+      long txn2NewValue = Long.valueOf(new String(txn2Value)) + 1;
+      System.out.println("txn2 new value=" + txn2NewValue);
+      txn.set(key.toByteArray(), ByteString.copyFromUtf8(txn2NewValue + "").toByteArray());
+      return true;
+    });
+    // Commit in reverse order: txn2 first, then txn1, whose snapshot is now stale.
+    boolean commit2 = txn2.commit();
+    boolean commit1 = txn1.commit();
+
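+    // Since txn2 committed first, txn1's commit starts from a stale snapshot;
+    // with the replayable body passed to begin(), txn1 is expected to retry
+    // and eventually succeed (hence "RetrySuccess").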
System.out.println("commit result1=" + commit1); + System.out.println("commit result2=" + commit2); + + System.out.println("new value=" + new String(txnClient.get(key.toByteArray()))); + } + + @Test + public void testCommit_Fail() { + ByteString key = ByteString.copyFromUtf8("test_AAAAGAaJwbZnjgaPvypwZTiuMBFirzPf_primary_001"); + System.out.println("old value=" + new String(txnClient.get(key.toByteArray()))); + + ITransaction txn1 = txnClient.begin(); + byte[] txn2Value = txn1.get(key.toByteArray()); + System.out.println("txn2 start ts=" + txn1.getStartTS()); + long txn2NewValue = Long.valueOf(new String(txn2Value)) + 1; + System.out.println("txn2 new value=" + txn2NewValue); + txn1.set(key.toByteArray(), ByteString.copyFromUtf8(txn2NewValue + "").toByteArray()); + boolean commit1 = runTxnWithRetry((txn) -> { + byte[] txn1Value = txn.get(key.toByteArray()); + System.out.println("txn1 start ts=" + txn.getStartTS()); + long txn1NewValue = Long.valueOf(new String(txn1Value)) + 1; + System.out.println("txn1 new value=" + txn1NewValue); + txn.set(key.toByteArray(), ByteString.copyFromUtf8(txn1NewValue + "").toByteArray()); + return true; + }); + + boolean commit2 = txn1.commit(); + System.out.println("commit result1=" + commit1); + System.out.println("commit result2=" + commit2); + + System.out.println("new value=" + new String(txnClient.get(key.toByteArray()))); + } +}