From aff58ee63801de743d2277a151e8696dc92b7c5c Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Mon, 18 Nov 2019 18:26:48 -0800 Subject: [PATCH 01/28] add s3 input source for native batch ingestion --- .../druid/data/input/AbstractInputSource.java | 6 +- .../apache/druid/data/input/InputSource.java | 3 +- .../impl/InputEntityIteratingReader.java | 2 +- .../apache/druid/data/input/s3/S3Entity.java | 78 ++++++ .../druid/data/input/s3/S3InputSource.java | 259 ++++++++++++++++++ .../input/s3/S3InputSourceDruidModule.java | 49 ++++ .../firehose/s3/StaticS3FirehoseFactory.java | 75 +---- ...rg.apache.druid.initialization.DruidModule | 1 + .../data/input/s3/S3InputSourceTest.java | 204 ++++++++++++++ .../s3/StaticS3FirehoseFactoryTest.java | 91 +----- 10 files changed, 602 insertions(+), 166 deletions(-) create mode 100644 extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3Entity.java create mode 100644 extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSource.java create mode 100644 extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSourceDruidModule.java create mode 100644 extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java diff --git a/core/src/main/java/org/apache/druid/data/input/AbstractInputSource.java b/core/src/main/java/org/apache/druid/data/input/AbstractInputSource.java index 94ddcd6b79e7..d41db9c362c5 100644 --- a/core/src/main/java/org/apache/druid/data/input/AbstractInputSource.java +++ b/core/src/main/java/org/apache/druid/data/input/AbstractInputSource.java @@ -23,6 +23,7 @@ import javax.annotation.Nullable; import java.io.File; +import java.io.IOException; /** * Abstract class for {@link InputSource}. This class provides a default implementation of {@link #reader} with @@ -36,7 +37,7 @@ public InputSourceReader reader( InputRowSchema inputRowSchema, @Nullable InputFormat inputFormat, @Nullable File temporaryDirectory - ) + ) throws IOException { if (needsFormat()) { return formattableReader( @@ -53,12 +54,13 @@ protected InputSourceReader formattableReader( InputRowSchema inputRowSchema, InputFormat inputFormat, @Nullable File temporaryDirectory - ) + ) throws IOException { throw new UnsupportedOperationException("Implement this method properly if needsFormat() = true"); } protected InputSourceReader fixedFormatReader(InputRowSchema inputRowSchema, @Nullable File temporaryDirectory) + throws IOException { throw new UnsupportedOperationException("Implement this method properly if needsFormat() = false"); } diff --git a/core/src/main/java/org/apache/druid/data/input/InputSource.java b/core/src/main/java/org/apache/druid/data/input/InputSource.java index 6109c592604c..88e1afd56fa8 100644 --- a/core/src/main/java/org/apache/druid/data/input/InputSource.java +++ b/core/src/main/java/org/apache/druid/data/input/InputSource.java @@ -28,6 +28,7 @@ import javax.annotation.Nullable; import java.io.File; +import java.io.IOException; /** * InputSource abstracts the storage system where input data is stored. It creates an {@link InputSourceReader} @@ -77,5 +78,5 @@ InputSourceReader reader( InputRowSchema inputRowSchema, @Nullable InputFormat inputFormat, @Nullable File temporaryDirectory - ); + ) throws IOException; } diff --git a/core/src/main/java/org/apache/druid/data/input/impl/InputEntityIteratingReader.java b/core/src/main/java/org/apache/druid/data/input/impl/InputEntityIteratingReader.java index 385bc5f14591..1869756eaa13 100644 --- a/core/src/main/java/org/apache/druid/data/input/impl/InputEntityIteratingReader.java +++ b/core/src/main/java/org/apache/druid/data/input/impl/InputEntityIteratingReader.java @@ -46,7 +46,7 @@ public class InputEntityIteratingReader implements InputSourceReader private final Iterator sourceIterator; private final File temporaryDirectory; - InputEntityIteratingReader( + public InputEntityIteratingReader( InputRowSchema inputRowSchema, InputFormat inputFormat, Stream sourceStream, diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3Entity.java b/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3Entity.java new file mode 100644 index 000000000000..078051ab2c21 --- /dev/null +++ b/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3Entity.java @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.data.input.s3; + +import com.amazonaws.services.s3.model.AmazonS3Exception; +import com.amazonaws.services.s3.model.S3Object; +import com.google.common.base.Predicate; +import org.apache.druid.data.input.InputEntity; +import org.apache.druid.java.util.common.ISE; +import org.apache.druid.storage.s3.S3Utils; +import org.apache.druid.storage.s3.ServerSideEncryptingAmazonS3; +import org.apache.druid.utils.CompressionUtils; + +import javax.annotation.Nullable; +import java.io.IOException; +import java.io.InputStream; +import java.net.URI; + +public class S3Entity implements InputEntity +{ + private final ServerSideEncryptingAmazonS3 s3Client; + private final URI uri; + + public S3Entity(ServerSideEncryptingAmazonS3 s3Client, URI uri) + { + this.s3Client = s3Client; + this.uri = uri; + } + + @Nullable + @Override + public URI getUri() + { + return uri; + } + + @Override + public InputStream open() throws IOException + { + try { + // Get data of the given object and open an input stream + final String bucket = uri.getAuthority(); + final String key = S3Utils.extractS3Key(uri); + + final S3Object s3Object = s3Client.getObject(bucket, key); + if (s3Object == null) { + throw new ISE("Failed to get an s3 object for bucket[%s] and key[%s]", bucket, key); + } + return CompressionUtils.decompress(s3Object.getObjectContent(), uri.toString()); + } + catch (AmazonS3Exception e) { + throw new IOException(e); + } + } + + @Override + public Predicate getFetchRetryCondition() + { + return S3Utils.S3RETRY; + } +} diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSource.java b/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSource.java new file mode 100644 index 000000000000..d36eb1d2ec2c --- /dev/null +++ b/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSource.java @@ -0,0 +1,259 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.data.input.s3; + +import com.amazonaws.services.s3.model.AmazonS3Exception; +import com.amazonaws.services.s3.model.ObjectMetadata; +import com.amazonaws.services.s3.model.S3ObjectSummary; +import com.fasterxml.jackson.annotation.JacksonInject; +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Lists; +import org.apache.druid.data.input.AbstractInputSource; +import org.apache.druid.data.input.InputFormat; +import org.apache.druid.data.input.InputRowSchema; +import org.apache.druid.data.input.InputSourceReader; +import org.apache.druid.data.input.InputSplit; +import org.apache.druid.data.input.SplitHintSpec; +import org.apache.druid.data.input.impl.InputEntityIteratingReader; +import org.apache.druid.data.input.impl.SplittableInputSource; +import org.apache.druid.java.util.common.IAE; +import org.apache.druid.java.util.common.IOE; +import org.apache.druid.java.util.common.StringUtils; +import org.apache.druid.java.util.common.logger.Logger; +import org.apache.druid.storage.s3.S3Utils; +import org.apache.druid.storage.s3.ServerSideEncryptingAmazonS3; + +import javax.annotation.Nullable; +import java.io.File; +import java.io.IOException; +import java.net.URI; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Iterator; +import java.util.List; +import java.util.Objects; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +public class S3InputSource extends AbstractInputSource implements SplittableInputSource +{ + private static final Logger log = new Logger(S3InputSource.class); + private static final int MAX_LISTING_LENGTH = 1024; + + private final ServerSideEncryptingAmazonS3 s3Client; + private final List uris; + private final List prefixes; + private Collection cacheSplitUris = null; + + @JsonCreator + public S3InputSource( + @JacksonInject("s3Client") ServerSideEncryptingAmazonS3 s3Client, + @JsonProperty("uris") @Nullable List uris, + @JsonProperty("prefixes") @Nullable List prefixes + ) + { + this.s3Client = Preconditions.checkNotNull(s3Client, "s3Client"); + this.uris = uris == null ? new ArrayList<>() : uris; + this.prefixes = prefixes == null ? new ArrayList<>() : prefixes; + + if (!this.uris.isEmpty() && !this.prefixes.isEmpty()) { + throw new IAE("uris and prefixes cannot be used together"); + } + + if (this.uris.isEmpty() && this.prefixes.isEmpty()) { + throw new IAE("uris or prefixes must be specified"); + } + + for (final URI inputURI : this.uris) { + Preconditions.checkArgument("s3".equals(inputURI.getScheme()), "input uri scheme == s3 (%s)", inputURI); + } + + for (final URI inputURI : this.prefixes) { + Preconditions.checkArgument("s3".equals(inputURI.getScheme()), "input uri scheme == s3 (%s)", inputURI); + } + } + + @JsonProperty + public List getUris() + { + return uris; + } + + @JsonProperty("prefixes") + public List getPrefixes() + { + return prefixes; + } + + @Override + public Stream> createSplits(InputFormat inputFormat, @Nullable SplitHintSpec splitHintSpec) + throws IOException + { + if (cacheSplitUris == null) { + initalizeSplitUris(); + } + return cacheSplitUris.stream().map(InputSplit::new); + } + + @Override + public int getNumSplits(InputFormat inputFormat, @Nullable SplitHintSpec splitHintSpec) throws IOException + { + if (cacheSplitUris == null) { + initalizeSplitUris(); + } + return cacheSplitUris.size(); + } + + @Override + public SplittableInputSource withSplit(InputSplit split) + { + return new S3InputSource(s3Client, ImmutableList.of(split.get()), ImmutableList.of()); + } + + @Override + public boolean needsFormat() + { + return true; + } + + @Override + protected InputSourceReader formattableReader( + InputRowSchema inputRowSchema, + InputFormat inputFormat, + @Nullable File temporaryDirectory + ) throws IOException + { + return new InputEntityIteratingReader( + inputRowSchema, + inputFormat, + // formattableReader() is supposed to be called in each task that actually creates segments. + // The task should already have only one split in parallel indexing, + // while there's no need to make splits using splitHintSpec in sequential indexing. + createSplits(inputFormat, null).map(split -> new S3Entity(s3Client, split.get())), + temporaryDirectory + ); + } + + private void initalizeSplitUris() throws IOException + { + cacheSplitUris = !uris.isEmpty() ? uris : getUrisFromPrefix(s3Client, prefixes); + } + + public static Collection getUrisFromPrefix(ServerSideEncryptingAmazonS3 s3Client, List prefixes) + throws IOException + { + final List objects = new ArrayList<>(); + for (URI uri : prefixes) { + final String bucket = uri.getAuthority(); + final String prefix = S3Utils.extractS3Key(uri); + + try { + final Iterator objectSummaryIterator = S3Utils.objectSummaryIterator( + s3Client, + bucket, + prefix, + MAX_LISTING_LENGTH + ); + objects.addAll(Lists.newArrayList(objectSummaryIterator)); + } + catch (AmazonS3Exception outerException) { + log.error(outerException, "Exception while listing on %s", uri); + + if (outerException.getStatusCode() == 403) { + // The "Access Denied" means users might not have a proper permission for listing on the given uri. + // Usually this is not a problem, but the uris might be the full paths to input objects instead of prefixes. + // In this case, users should be able to get objects if they have a proper permission for GetObject. + + log.warn("Access denied for %s. Try to get the object from the uri without listing", uri); + try { + final ObjectMetadata objectMetadata = s3Client.getObjectMetadata(bucket, prefix); + + if (!S3Utils.isDirectoryPlaceholder(prefix, objectMetadata)) { + objects.add(S3Utils.getSingleObjectSummary(s3Client, bucket, prefix)); + } else { + throw new IOE( + "[%s] is a directory placeholder, " + + "but failed to get the object list under the directory due to permission", + uri + ); + } + } + catch (AmazonS3Exception innerException) { + throw new IOException(innerException); + } + } else { + throw new IOException(outerException); + } + } + } + return objects.stream().map(S3InputSource::toUri).collect(Collectors.toList()); + } + + /** + * Create an {@link URI} from the given {@link S3ObjectSummary}. The result URI is composed as below. + * + *
+   * {@code s3://{BUCKET_NAME}/{OBJECT_KEY}}
+   * 
+ */ + private static URI toUri(S3ObjectSummary object) + { + final String originalAuthority = object.getBucketName(); + final String originalPath = object.getKey(); + final String authority = originalAuthority.endsWith("/") ? + originalAuthority.substring(0, originalAuthority.length() - 1) : + originalAuthority; + final String path = originalPath.startsWith("/") ? originalPath.substring(1) : originalPath; + + return URI.create(StringUtils.format("s3://%s/%s", authority, path)); + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + S3InputSource that = (S3InputSource) o; + return Objects.equals(uris, that.uris) && + Objects.equals(prefixes, that.prefixes); + } + + @Override + public int hashCode() + { + return Objects.hash(uris, prefixes); + } + + @Override + public String toString() + { + return "S3InputSource{" + + "uris=" + uris + + ", prefixes=" + prefixes + + '}'; + } +} diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSourceDruidModule.java b/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSourceDruidModule.java new file mode 100644 index 000000000000..c445f7f29a80 --- /dev/null +++ b/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSourceDruidModule.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.data.input.s3; + +import com.fasterxml.jackson.databind.Module; +import com.fasterxml.jackson.databind.jsontype.NamedType; +import com.fasterxml.jackson.databind.module.SimpleModule; +import com.google.common.collect.ImmutableList; +import com.google.inject.Binder; +import org.apache.druid.initialization.DruidModule; + +import java.util.List; + +/** + * Druid module to wire up native batch support for S3 input + */ +public class S3InputSourceDruidModule implements DruidModule +{ + @Override + public List getJacksonModules() + { + return ImmutableList.of( + new SimpleModule().registerSubtypes(new NamedType(S3InputSource.class, "s3")) + ); + } + + @Override + public void configure(Binder binder) + { + + } +} diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/firehose/s3/StaticS3FirehoseFactory.java b/extensions-core/s3-extensions/src/main/java/org/apache/druid/firehose/s3/StaticS3FirehoseFactory.java index 1c693778eec0..4cd9223f8bc3 100644 --- a/extensions-core/s3-extensions/src/main/java/org/apache/druid/firehose/s3/StaticS3FirehoseFactory.java +++ b/extensions-core/s3-extensions/src/main/java/org/apache/druid/firehose/s3/StaticS3FirehoseFactory.java @@ -21,23 +21,19 @@ import com.amazonaws.services.s3.model.AmazonS3Exception; import com.amazonaws.services.s3.model.GetObjectRequest; -import com.amazonaws.services.s3.model.ObjectMetadata; import com.amazonaws.services.s3.model.S3Object; -import com.amazonaws.services.s3.model.S3ObjectSummary; import com.fasterxml.jackson.annotation.JacksonInject; import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; import com.google.common.base.Preconditions; import com.google.common.base.Predicate; -import com.google.common.collect.Lists; import org.apache.druid.data.input.FiniteFirehoseFactory; import org.apache.druid.data.input.InputSplit; import org.apache.druid.data.input.impl.StringInputRowParser; import org.apache.druid.data.input.impl.prefetch.PrefetchableTextFilesFirehoseFactory; +import org.apache.druid.data.input.s3.S3InputSource; import org.apache.druid.java.util.common.IAE; -import org.apache.druid.java.util.common.IOE; import org.apache.druid.java.util.common.ISE; -import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.java.util.common.logger.Logger; import org.apache.druid.storage.s3.S3Utils; import org.apache.druid.storage.s3.ServerSideEncryptingAmazonS3; @@ -49,10 +45,8 @@ import java.util.ArrayList; import java.util.Collection; import java.util.Collections; -import java.util.Iterator; import java.util.List; import java.util.Objects; -import java.util.stream.Collectors; /** * Builds firehoses that read from a predefined list of S3 objects and then dry up. @@ -115,56 +109,10 @@ public List getPrefixes() @Override protected Collection initObjects() throws IOException { - // Here, the returned s3 objects contain minimal information without data. - // Getting data is deferred until openObjectStream() is called for each object. if (!uris.isEmpty()) { return uris; } else { - final List objects = new ArrayList<>(); - for (URI uri : prefixes) { - final String bucket = uri.getAuthority(); - final String prefix = S3Utils.extractS3Key(uri); - - try { - final Iterator objectSummaryIterator = S3Utils.objectSummaryIterator( - s3Client, - bucket, - prefix, - MAX_LISTING_LENGTH - ); - objects.addAll(Lists.newArrayList(objectSummaryIterator)); - } - catch (AmazonS3Exception outerException) { - log.error(outerException, "Exception while listing on %s", uri); - - if (outerException.getStatusCode() == 403) { - // The "Access Denied" means users might not have a proper permission for listing on the given uri. - // Usually this is not a problem, but the uris might be the full paths to input objects instead of prefixes. - // In this case, users should be able to get objects if they have a proper permission for GetObject. - - log.warn("Access denied for %s. Try to get the object from the uri without listing", uri); - try { - final ObjectMetadata objectMetadata = s3Client.getObjectMetadata(bucket, prefix); - - if (!S3Utils.isDirectoryPlaceholder(prefix, objectMetadata)) { - objects.add(S3Utils.getSingleObjectSummary(s3Client, bucket, prefix)); - } else { - throw new IOE( - "[%s] is a directory placeholder, " - + "but failed to get the object list under the directory due to permission", - uri - ); - } - } - catch (AmazonS3Exception innerException) { - throw new IOException(innerException); - } - } else { - throw new IOException(outerException); - } - } - } - return objects.stream().map(StaticS3FirehoseFactory::toUri).collect(Collectors.toList()); + return S3InputSource.getUrisFromPrefix(s3Client, prefixes); } } @@ -273,23 +221,4 @@ public FiniteFirehoseFactory withSplit(InputSplit - * {@code s3://{BUCKET_NAME}/{OBJECT_KEY}} - * - */ - private static URI toUri(S3ObjectSummary object) - { - final String originalAuthority = object.getBucketName(); - final String originalPath = object.getKey(); - final String authority = originalAuthority.endsWith("/") ? - originalAuthority.substring(0, originalAuthority.length() - 1) : - originalAuthority; - final String path = originalPath.startsWith("/") ? originalPath.substring(1) : originalPath; - - return URI.create(StringUtils.format("s3://%s/%s", authority, path)); - } } diff --git a/extensions-core/s3-extensions/src/main/resources/META-INF/services/org.apache.druid.initialization.DruidModule b/extensions-core/s3-extensions/src/main/resources/META-INF/services/org.apache.druid.initialization.DruidModule index ccd137b82bd8..51968bbb97bf 100644 --- a/extensions-core/s3-extensions/src/main/resources/META-INF/services/org.apache.druid.initialization.DruidModule +++ b/extensions-core/s3-extensions/src/main/resources/META-INF/services/org.apache.druid.initialization.DruidModule @@ -15,3 +15,4 @@ org.apache.druid.storage.s3.S3StorageDruidModule org.apache.druid.firehose.s3.S3FirehoseDruidModule +org.apache.druid.data.input.s3.S3InputSourceDruidModule diff --git a/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java b/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java new file mode 100644 index 000000000000..7adcb932defd --- /dev/null +++ b/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java @@ -0,0 +1,204 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.data.input.s3; + +import com.amazonaws.services.s3.AmazonS3; +import com.amazonaws.services.s3.AmazonS3Client; +import com.amazonaws.services.s3.model.ListObjectsV2Request; +import com.amazonaws.services.s3.model.ListObjectsV2Result; +import com.amazonaws.services.s3.model.S3ObjectSummary; +import com.fasterxml.jackson.core.JsonParser; +import com.fasterxml.jackson.databind.DeserializationContext; +import com.fasterxml.jackson.databind.Module; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.deser.std.StdDeserializer; +import com.fasterxml.jackson.databind.module.SimpleModule; +import com.fasterxml.jackson.module.guice.ObjectMapperModule; +import com.google.common.collect.ImmutableList; +import com.google.inject.Binder; +import com.google.inject.Guice; +import com.google.inject.Injector; +import com.google.inject.Provides; +import org.apache.druid.data.input.InputSplit; +import org.apache.druid.data.input.impl.JsonInputFormat; +import org.apache.druid.initialization.DruidModule; +import org.apache.druid.java.util.common.parsers.JSONPathSpec; +import org.apache.druid.storage.s3.NoopServerSideEncryption; +import org.apache.druid.storage.s3.S3Utils; +import org.apache.druid.storage.s3.ServerSideEncryptingAmazonS3; +import org.easymock.EasyMock; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import java.io.IOException; +import java.net.URI; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +public class S3InputSourceTest +{ + private static final AmazonS3Client S3_CLIENT = EasyMock.createNiceMock(AmazonS3Client.class); + private static final ServerSideEncryptingAmazonS3 SERVICE = new ServerSideEncryptingAmazonS3( + S3_CLIENT, + new NoopServerSideEncryption() + ); + + @Test + public void testSerde() throws Exception + { + final ObjectMapper mapper = createS3ObjectMapper(); + + final List uris = Arrays.asList( + new URI("s3://foo/bar/file.gz"), + new URI("s3://bar/foo/file2.gz") + ); + + final List prefixes = Arrays.asList( + new URI("s3://foo/bar"), + new URI("s3://bar/foo") + ); + + final S3InputSource withUris = new S3InputSource(SERVICE, uris,null); + final S3InputSource serdeWithUris = mapper.readValue(mapper.writeValueAsString(withUris), S3InputSource.class); + Assert.assertEquals(withUris, serdeWithUris); + + final S3InputSource withPrefixes = new S3InputSource(SERVICE, null, prefixes); + final S3InputSource serdeWithPrefixes = mapper.readValue(mapper.writeValueAsString(withPrefixes), S3InputSource.class); + Assert.assertEquals(withPrefixes, serdeWithPrefixes); + } + + @Test + public void testWithUrisSplit() throws IOException + { + final List uris = Arrays.asList( + URI.create("s3://foo/bar/file.gz"), + URI.create("s3://bar/foo/file2.gz") + ); + + S3InputSource inputSource = new S3InputSource(SERVICE, uris, null); + + Stream> splits = inputSource.createSplits( + new JsonInputFormat(JSONPathSpec.DEFAULT, null), + null + ); + + Assert.assertEquals(uris, splits.map(InputSplit::get).collect(Collectors.toList())); + } + + @Test + public void testWithPrefixesSplit() throws IOException + { + final List prefixes = Arrays.asList( + URI.create("s3://foo/bar"), + URI.create("s3://bar/foo") + ); + + final List expectedUris = Arrays.asList( + URI.create("s3://foo/bar/file.gz"), + URI.create("s3://bar/foo/file2.gz") + ); + addExpectedPrefixObjects(SERVICE, URI.create("s3://foo/bar"), ImmutableList.of(prefixes.get(0))); + addExpectedPrefixObjects(SERVICE, URI.create("s3://bar/foo"), ImmutableList.of(prefixes.get(1))); + EasyMock.replay(S3_CLIENT); + + S3InputSource inputSource = new S3InputSource(SERVICE, null, prefixes); + + Stream> splits = inputSource.createSplits( + new JsonInputFormat(JSONPathSpec.DEFAULT, null), + null + ); + + Assert.assertEquals(expectedUris, splits.map(InputSplit::get).collect(Collectors.toList())); + } + + private static void addExpectedPrefixObjects(ServerSideEncryptingAmazonS3 service, URI prefix, List uris) + { + final String s3Bucket = prefix.getAuthority(); + final ListObjectsV2Result result = new ListObjectsV2Result(); + result.setBucketName(s3Bucket); + result.setKeyCount(1); + for (URI uri : uris) { + final String key = S3Utils.extractS3Key(uri); + final S3ObjectSummary objectSummary = new S3ObjectSummary(); + objectSummary.setBucketName(s3Bucket); + objectSummary.setKey(key); + result.getObjectSummaries().add(objectSummary); + } + EasyMock.expect(service.listObjectsV2(EasyMock.anyObject(ListObjectsV2Request.class))).andReturn(result).once(); + } + + public static ObjectMapper createS3ObjectMapper() + { + DruidModule baseModule = new TestS3Module(); + final Injector injector = Guice.createInjector( + new ObjectMapperModule(), + baseModule + ); + final ObjectMapper baseMapper = injector.getInstance(ObjectMapper.class); + + baseModule.getJacksonModules().forEach(baseMapper::registerModule); + return baseMapper; + } + + public static class TestS3Module implements DruidModule + { + @Override + public List getJacksonModules() + { + // Deserializer is need for AmazonS3Client even though it is injected. + // See https://github.com/FasterXML/jackson-databind/issues/962. + return ImmutableList.of(new SimpleModule().addDeserializer(AmazonS3.class, new ItemDeserializer())); + } + + @Override + public void configure(Binder binder) + { + + } + + @Provides + public ServerSideEncryptingAmazonS3 getAmazonS3Client() + { + return SERVICE; + } + } + + public static class ItemDeserializer extends StdDeserializer + { + public ItemDeserializer() + { + this(null); + } + + public ItemDeserializer(Class vc) + { + super(vc); + } + + @Override + public AmazonS3 deserialize(JsonParser jp, DeserializationContext ctxt) + { + throw new UnsupportedOperationException(); + } + } +} \ No newline at end of file diff --git a/extensions-core/s3-extensions/src/test/java/org/apache/druid/firehose/s3/StaticS3FirehoseFactoryTest.java b/extensions-core/s3-extensions/src/test/java/org/apache/druid/firehose/s3/StaticS3FirehoseFactoryTest.java index 5f89860fbee3..c809bf37040d 100644 --- a/extensions-core/s3-extensions/src/test/java/org/apache/druid/firehose/s3/StaticS3FirehoseFactoryTest.java +++ b/extensions-core/s3-extensions/src/test/java/org/apache/druid/firehose/s3/StaticS3FirehoseFactoryTest.java @@ -19,28 +19,12 @@ package org.apache.druid.firehose.s3; -import com.amazonaws.services.s3.AmazonS3; import com.amazonaws.services.s3.AmazonS3Client; -import com.amazonaws.services.s3.model.ListObjectsV2Request; -import com.amazonaws.services.s3.model.ListObjectsV2Result; -import com.amazonaws.services.s3.model.S3ObjectSummary; -import com.fasterxml.jackson.core.JsonParser; -import com.fasterxml.jackson.databind.DeserializationContext; -import com.fasterxml.jackson.databind.Module; import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.databind.deser.std.StdDeserializer; -import com.fasterxml.jackson.databind.module.SimpleModule; -import com.fasterxml.jackson.module.guice.ObjectMapperModule; -import com.google.common.collect.ImmutableList; -import com.google.inject.Binder; -import com.google.inject.Guice; -import com.google.inject.Injector; -import com.google.inject.Provides; import org.apache.druid.data.input.FiniteFirehoseFactory; import org.apache.druid.data.input.impl.StringInputRowParser; -import org.apache.druid.initialization.DruidModule; +import org.apache.druid.data.input.s3.S3InputSourceTest; import org.apache.druid.storage.s3.NoopServerSideEncryption; -import org.apache.druid.storage.s3.S3Utils; import org.apache.druid.storage.s3.ServerSideEncryptingAmazonS3; import org.easymock.EasyMock; import org.junit.Assert; @@ -66,7 +50,7 @@ public class StaticS3FirehoseFactoryTest @Test public void testSerde() throws Exception { - final ObjectMapper mapper = createObjectMapper(new TestS3Module()); + final ObjectMapper mapper = S3InputSourceTest.createS3ObjectMapper(); final List uris = Arrays.asList( new URI("s3://foo/bar/file.gz"), @@ -101,9 +85,6 @@ public void testWithSplit() throws IOException ); uris.sort(Comparator.comparing(URI::toString)); - uris.forEach(StaticS3FirehoseFactoryTest::addExpectedObjject); - EasyMock.replay(S3_CLIENT); - final StaticS3FirehoseFactory factory = new StaticS3FirehoseFactory( SERVICE, uris, @@ -131,72 +112,4 @@ public void testWithSplit() throws IOException Assert.assertEquals(uris.get(i), subFactoryUris.get(0)); } } - - private static void addExpectedObjject(URI uri) - { - final String s3Bucket = uri.getAuthority(); - final String key = S3Utils.extractS3Key(uri); - final S3ObjectSummary objectSummary = new S3ObjectSummary(); - objectSummary.setBucketName(s3Bucket); - objectSummary.setKey(key); - final ListObjectsV2Result result = new ListObjectsV2Result(); - result.setBucketName(s3Bucket); - result.setKeyCount(1); - result.getObjectSummaries().add(objectSummary); - EasyMock.expect(SERVICE.listObjectsV2(EasyMock.anyObject(ListObjectsV2Request.class))).andReturn(result); - } - - private static ObjectMapper createObjectMapper(DruidModule baseModule) - { - final Injector injector = Guice.createInjector( - new ObjectMapperModule(), - baseModule - ); - final ObjectMapper baseMapper = injector.getInstance(ObjectMapper.class); - - baseModule.getJacksonModules().forEach(baseMapper::registerModule); - return baseMapper; - } - - private static class TestS3Module implements DruidModule - { - @Override - public List getJacksonModules() - { - // Deserializer is need for AmazonS3Client even though it is injected. - // See https://github.com/FasterXML/jackson-databind/issues/962. - return ImmutableList.of(new SimpleModule().addDeserializer(AmazonS3.class, new ItemDeserializer())); - } - - @Override - public void configure(Binder binder) - { - - } - - @Provides - public ServerSideEncryptingAmazonS3 getAmazonS3Client() - { - return SERVICE; - } - } - - public static class ItemDeserializer extends StdDeserializer - { - public ItemDeserializer() - { - this(null); - } - - public ItemDeserializer(Class vc) - { - super(vc); - } - - @Override - public AmazonS3 deserialize(JsonParser jp, DeserializationContext ctxt) - { - throw new UnsupportedOperationException(); - } - } } From 8d14aecfa87b5fcc4da185f32f2e1f73fce54a96 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Mon, 18 Nov 2019 19:01:16 -0800 Subject: [PATCH 02/28] add docs --- docs/development/extensions-core/s3.md | 48 ++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/docs/development/extensions-core/s3.md b/docs/development/extensions-core/s3.md index d145f07d38ad..5cee9de23c4f 100644 --- a/docs/development/extensions-core/s3.md +++ b/docs/development/extensions-core/s3.md @@ -98,6 +98,54 @@ You can enable [server-side encryption](https://docs.aws.amazon.com/AmazonS3/lat - kms: [Server-side encryption with AWS KMS–Managed Keys](https://docs.aws.amazon.com/AmazonS3/latest/dev/UsingKMSEncryption.html) - custom: [Server-side encryption with Customer-Provided Encryption Keys](https://docs.aws.amazon.com/AmazonS3/latest/dev/ServerSideEncryptionCustomerKeys.html) + + + +## S3 batch ingestion input source + +This extension also provides an input source for Druid native batch ingestion to support reading objects directly from S3. Objects can be specified either via a list of S3 URI strings or a list of S3 location prefixes, which will attempt to list the contents and ingest all objects contained in the locations. The S3 input source is splittable and can be used by [native parallel index tasks](../../ingestion/native-batch.md#parallel-task), where each worker task of `index_parallel` will read a single object. + +Sample spec: + +```json +... + "ioConfig": { + "type": "index_parallel", + "inputSource": { + "type": "s3", + "uris": ["s3://foo/bar/file.json", "s3://bar/foo/file2.json"] + }, + "inputFormat": { + "type": "json" + }, + ... + }, +... +``` + +```json +... + "ioConfig": { + "type": "index_parallel", + "inputSource": { + "type": "s3", + "prefixes": ["s3://foo/bar", "s3://bar/foo"] + }, + "inputFormat": { + "type": "json" + }, + ... + }, +... +``` + +|property|description|default|required?| +|--------|-----------|-------|---------| +|type|This should be `s3`.|N/A|yes| +|uris|JSON array of URIs where s3 files to be ingested are located.|N/A|`uris` or `prefixes` must be set| +|prefixes|JSON array of URI prefixes for the locations of s3 files to be ingested.|N/A|`uris` or `prefixes` must be set| + + ## StaticS3Firehose From ef4cc318d107dfb473c6b1eeabb8f041fac4060a Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Tue, 19 Nov 2019 01:17:08 -0800 Subject: [PATCH 03/28] fixes --- .../org/apache/druid/data/input/AbstractInputSource.java | 1 - .../org/apache/druid/data/input/s3/S3InputSourceTest.java | 5 ++--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/core/src/main/java/org/apache/druid/data/input/AbstractInputSource.java b/core/src/main/java/org/apache/druid/data/input/AbstractInputSource.java index d41db9c362c5..34dfe1c4abaa 100644 --- a/core/src/main/java/org/apache/druid/data/input/AbstractInputSource.java +++ b/core/src/main/java/org/apache/druid/data/input/AbstractInputSource.java @@ -60,7 +60,6 @@ protected InputSourceReader formattableReader( } protected InputSourceReader fixedFormatReader(InputRowSchema inputRowSchema, @Nullable File temporaryDirectory) - throws IOException { throw new UnsupportedOperationException("Implement this method properly if needsFormat() = false"); } diff --git a/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java b/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java index 7adcb932defd..9959b7f4bab5 100644 --- a/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java +++ b/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java @@ -45,7 +45,6 @@ import org.apache.druid.storage.s3.ServerSideEncryptingAmazonS3; import org.easymock.EasyMock; import org.junit.Assert; -import org.junit.Before; import org.junit.Test; import java.io.IOException; @@ -117,8 +116,8 @@ public void testWithPrefixesSplit() throws IOException URI.create("s3://foo/bar/file.gz"), URI.create("s3://bar/foo/file2.gz") ); - addExpectedPrefixObjects(SERVICE, URI.create("s3://foo/bar"), ImmutableList.of(prefixes.get(0))); - addExpectedPrefixObjects(SERVICE, URI.create("s3://bar/foo"), ImmutableList.of(prefixes.get(1))); + addExpectedPrefixObjects(SERVICE, URI.create("s3://foo/bar"), ImmutableList.of(expectedUris.get(0))); + addExpectedPrefixObjects(SERVICE, URI.create("s3://bar/foo"), ImmutableList.of(expectedUris.get(1))); EasyMock.replay(S3_CLIENT); S3InputSource inputSource = new S3InputSource(SERVICE, null, prefixes); From 11cb7cc584a39fa800c86b0fafd732a6f3342683 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Tue, 19 Nov 2019 03:38:57 -0800 Subject: [PATCH 04/28] checkstyle --- .../org/apache/druid/data/input/s3/S3InputSourceTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java b/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java index 9959b7f4bab5..7e63f462513a 100644 --- a/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java +++ b/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java @@ -77,7 +77,7 @@ public void testSerde() throws Exception new URI("s3://bar/foo") ); - final S3InputSource withUris = new S3InputSource(SERVICE, uris,null); + final S3InputSource withUris = new S3InputSource(SERVICE, uris, null); final S3InputSource serdeWithUris = mapper.readValue(mapper.writeValueAsString(withUris), S3InputSource.class); Assert.assertEquals(withUris, serdeWithUris); @@ -200,4 +200,4 @@ public AmazonS3 deserialize(JsonParser jp, DeserializationContext ctxt) throw new UnsupportedOperationException(); } } -} \ No newline at end of file +} From 56228073b652796d86b5834e3a297911b12116bc Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Tue, 19 Nov 2019 19:48:05 -0800 Subject: [PATCH 05/28] lazy splits --- .../StaticAzureBlobStoreFirehoseFactory.java | 2 +- .../StaticCloudFilesFirehoseFactory.java | 2 +- .../apache/druid/data/input/s3/S3Entity.java | 4 +- .../druid/data/input/s3/S3InputSource.java | 209 ++++++++++++------ .../firehose/s3/StaticS3FirehoseFactory.java | 54 ++++- 5 files changed, 193 insertions(+), 78 deletions(-) diff --git a/extensions-contrib/azure-extensions/src/main/java/org/apache/druid/firehose/azure/StaticAzureBlobStoreFirehoseFactory.java b/extensions-contrib/azure-extensions/src/main/java/org/apache/druid/firehose/azure/StaticAzureBlobStoreFirehoseFactory.java index fa4c74e1ba4a..d27970adea8e 100644 --- a/extensions-contrib/azure-extensions/src/main/java/org/apache/druid/firehose/azure/StaticAzureBlobStoreFirehoseFactory.java +++ b/extensions-contrib/azure-extensions/src/main/java/org/apache/druid/firehose/azure/StaticAzureBlobStoreFirehoseFactory.java @@ -50,7 +50,7 @@ public class StaticAzureBlobStoreFirehoseFactory extends PrefetchableTextFilesFi @JsonCreator public StaticAzureBlobStoreFirehoseFactory( - @JacksonInject("azureStorage") AzureStorage azureStorage, + @JacksonInject AzureStorage azureStorage, @JsonProperty("blobs") List blobs, @JsonProperty("maxCacheCapacityBytes") Long maxCacheCapacityBytes, @JsonProperty("maxFetchCapacityBytes") Long maxFetchCapacityBytes, diff --git a/extensions-contrib/cloudfiles-extensions/src/main/java/org/apache/druid/firehose/cloudfiles/StaticCloudFilesFirehoseFactory.java b/extensions-contrib/cloudfiles-extensions/src/main/java/org/apache/druid/firehose/cloudfiles/StaticCloudFilesFirehoseFactory.java index fbab98a384ee..f0de9f7e98de 100644 --- a/extensions-contrib/cloudfiles-extensions/src/main/java/org/apache/druid/firehose/cloudfiles/StaticCloudFilesFirehoseFactory.java +++ b/extensions-contrib/cloudfiles-extensions/src/main/java/org/apache/druid/firehose/cloudfiles/StaticCloudFilesFirehoseFactory.java @@ -50,7 +50,7 @@ public class StaticCloudFilesFirehoseFactory extends PrefetchableTextFilesFireho @JsonCreator public StaticCloudFilesFirehoseFactory( - @JacksonInject("objectApi") CloudFilesApi cloudFilesApi, + @JacksonInject CloudFilesApi cloudFilesApi, @JsonProperty("blobs") List blobs, @JsonProperty("maxCacheCapacityBytes") Long maxCacheCapacityBytes, @JsonProperty("maxFetchCapacityBytes") Long maxFetchCapacityBytes, diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3Entity.java b/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3Entity.java index 078051ab2c21..a62eda3599d3 100644 --- a/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3Entity.java +++ b/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3Entity.java @@ -28,7 +28,6 @@ import org.apache.druid.storage.s3.ServerSideEncryptingAmazonS3; import org.apache.druid.utils.CompressionUtils; -import javax.annotation.Nullable; import java.io.IOException; import java.io.InputStream; import java.net.URI; @@ -38,13 +37,12 @@ public class S3Entity implements InputEntity private final ServerSideEncryptingAmazonS3 s3Client; private final URI uri; - public S3Entity(ServerSideEncryptingAmazonS3 s3Client, URI uri) + S3Entity(ServerSideEncryptingAmazonS3 s3Client, URI uri) { this.s3Client = s3Client; this.uri = uri; } - @Nullable @Override public URI getUri() { diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSource.java b/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSource.java index d36eb1d2ec2c..dbc54e213a94 100644 --- a/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSource.java +++ b/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSource.java @@ -20,6 +20,8 @@ package org.apache.druid.data.input.s3; import com.amazonaws.services.s3.model.AmazonS3Exception; +import com.amazonaws.services.s3.model.ListObjectsV2Request; +import com.amazonaws.services.s3.model.ListObjectsV2Result; import com.amazonaws.services.s3.model.ObjectMetadata; import com.amazonaws.services.s3.model.S3ObjectSummary; import com.fasterxml.jackson.annotation.JacksonInject; @@ -27,7 +29,7 @@ import com.fasterxml.jackson.annotation.JsonProperty; import com.google.common.base.Preconditions; import com.google.common.collect.ImmutableList; -import com.google.common.collect.Lists; +import com.google.common.collect.Iterators; import org.apache.druid.data.input.AbstractInputSource; import org.apache.druid.data.input.InputFormat; import org.apache.druid.data.input.InputRowSchema; @@ -37,7 +39,8 @@ import org.apache.druid.data.input.impl.InputEntityIteratingReader; import org.apache.druid.data.input.impl.SplittableInputSource; import org.apache.druid.java.util.common.IAE; -import org.apache.druid.java.util.common.IOE; +import org.apache.druid.java.util.common.ISE; +import org.apache.druid.java.util.common.RE; import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.java.util.common.logger.Logger; import org.apache.druid.storage.s3.S3Utils; @@ -45,15 +48,14 @@ import javax.annotation.Nullable; import java.io.File; -import java.io.IOException; import java.net.URI; import java.util.ArrayList; -import java.util.Collection; import java.util.Iterator; import java.util.List; +import java.util.NoSuchElementException; import java.util.Objects; -import java.util.stream.Collectors; import java.util.stream.Stream; +import java.util.stream.StreamSupport; public class S3InputSource extends AbstractInputSource implements SplittableInputSource { @@ -63,11 +65,10 @@ public class S3InputSource extends AbstractInputSource implements SplittableInpu private final ServerSideEncryptingAmazonS3 s3Client; private final List uris; private final List prefixes; - private Collection cacheSplitUris = null; @JsonCreator public S3InputSource( - @JacksonInject("s3Client") ServerSideEncryptingAmazonS3 s3Client, + @JacksonInject ServerSideEncryptingAmazonS3 s3Client, @JsonProperty("uris") @Nullable List uris, @JsonProperty("prefixes") @Nullable List prefixes ) @@ -107,21 +108,22 @@ public List getPrefixes() @Override public Stream> createSplits(InputFormat inputFormat, @Nullable SplitHintSpec splitHintSpec) - throws IOException { - if (cacheSplitUris == null) { - initalizeSplitUris(); + if (!uris.isEmpty()) { + return uris.stream().map(InputSplit::new); } - return cacheSplitUris.stream().map(InputSplit::new); + + return StreamSupport.stream(getIterableObjectsFromPrefixes().spliterator(), false); } @Override - public int getNumSplits(InputFormat inputFormat, @Nullable SplitHintSpec splitHintSpec) throws IOException + public int getNumSplits(InputFormat inputFormat, @Nullable SplitHintSpec splitHintSpec) { - if (cacheSplitUris == null) { - initalizeSplitUris(); + if (!uris.isEmpty()) { + return uris.size(); } - return cacheSplitUris.size(); + + return (int) StreamSupport.stream(getIterableObjectsFromPrefixes().spliterator(), false).count(); } @Override @@ -141,7 +143,7 @@ protected InputSourceReader formattableReader( InputRowSchema inputRowSchema, InputFormat inputFormat, @Nullable File temporaryDirectory - ) throws IOException + ) { return new InputEntityIteratingReader( inputRowSchema, @@ -154,60 +156,6 @@ protected InputSourceReader formattableReader( ); } - private void initalizeSplitUris() throws IOException - { - cacheSplitUris = !uris.isEmpty() ? uris : getUrisFromPrefix(s3Client, prefixes); - } - - public static Collection getUrisFromPrefix(ServerSideEncryptingAmazonS3 s3Client, List prefixes) - throws IOException - { - final List objects = new ArrayList<>(); - for (URI uri : prefixes) { - final String bucket = uri.getAuthority(); - final String prefix = S3Utils.extractS3Key(uri); - - try { - final Iterator objectSummaryIterator = S3Utils.objectSummaryIterator( - s3Client, - bucket, - prefix, - MAX_LISTING_LENGTH - ); - objects.addAll(Lists.newArrayList(objectSummaryIterator)); - } - catch (AmazonS3Exception outerException) { - log.error(outerException, "Exception while listing on %s", uri); - - if (outerException.getStatusCode() == 403) { - // The "Access Denied" means users might not have a proper permission for listing on the given uri. - // Usually this is not a problem, but the uris might be the full paths to input objects instead of prefixes. - // In this case, users should be able to get objects if they have a proper permission for GetObject. - - log.warn("Access denied for %s. Try to get the object from the uri without listing", uri); - try { - final ObjectMetadata objectMetadata = s3Client.getObjectMetadata(bucket, prefix); - - if (!S3Utils.isDirectoryPlaceholder(prefix, objectMetadata)) { - objects.add(S3Utils.getSingleObjectSummary(s3Client, bucket, prefix)); - } else { - throw new IOE( - "[%s] is a directory placeholder, " - + "but failed to get the object list under the directory due to permission", - uri - ); - } - } - catch (AmazonS3Exception innerException) { - throw new IOException(innerException); - } - } else { - throw new IOException(outerException); - } - } - } - return objects.stream().map(S3InputSource::toUri).collect(Collectors.toList()); - } /** * Create an {@link URI} from the given {@link S3ObjectSummary}. The result URI is composed as below. @@ -216,7 +164,7 @@ public static Collection getUrisFromPrefix(ServerSideEncryptingAmazonS3 s3C * {@code s3://{BUCKET_NAME}/{OBJECT_KEY}} * */ - private static URI toUri(S3ObjectSummary object) + public static URI toUri(S3ObjectSummary object) { final String originalAuthority = object.getBucketName(); final String originalPath = object.getKey(); @@ -256,4 +204,123 @@ public String toString() ", prefixes=" + prefixes + '}'; } + + private Iterable> getIterableObjectsFromPrefixes() + { + return () -> objectFetchingIterator(s3Client, prefixes.iterator()); + } + + private static Iterator> objectFetchingIterator( + final ServerSideEncryptingAmazonS3 s3Client, + final Iterator prefixes + ) + { + return new Iterator>() + { + private ListObjectsV2Request request; + private ListObjectsV2Result result; + private URI currentUri; + private String currentBucket; + private String currentPrefix; + private Iterator objectSummaryIterator; + + { + prepareNextRequest(); + fetchNextBatch(); + } + + private void prepareNextRequest() + { + currentUri = prefixes.next(); + currentBucket = currentUri.getAuthority(); + currentPrefix = S3Utils.extractS3Key(currentUri); + + request = new ListObjectsV2Request() + .withBucketName(currentBucket) + .withPrefix(currentPrefix) + .withMaxKeys(S3InputSource.MAX_LISTING_LENGTH); + } + + private void fetchNextBatch() + { + try { + result = S3Utils.retryS3Operation(() -> s3Client.listObjectsV2(request)); + objectSummaryIterator = result.getObjectSummaries().iterator(); + request.setContinuationToken(result.getContinuationToken()); + } + catch (AmazonS3Exception outerException) { + log.error(outerException, "Exception while listing on %s", currentUri); + + if (outerException.getStatusCode() == 403) { + // The "Access Denied" means users might not have a proper permission for listing on the given uri. + // Usually this is not a problem, but the uris might be the full paths to input objects instead of prefixes. + // In this case, users should be able to get objects if they have a proper permission for GetObject. + + log.warn("Access denied for %s. Try to get the object from the uri without listing", currentUri); + try { + final ObjectMetadata objectMetadata = + S3Utils.retryS3Operation(() -> s3Client.getObjectMetadata(currentBucket, currentPrefix)); + + if (!S3Utils.isDirectoryPlaceholder(currentPrefix, objectMetadata)) { + + objectSummaryIterator = Iterators.singletonIterator( + S3Utils.getSingleObjectSummary(s3Client, currentBucket, currentPrefix) + ); + } else { + throw new RE( + "[%s] is a directory placeholder, " + + "but failed to get the object list under the directory due to permission", + currentUri + ); + } + } + catch (Exception innerException) { + throw new RuntimeException(innerException); + } + } else { + throw new RuntimeException(outerException); + } + } + catch (Exception ex) { + throw new RuntimeException(ex); + } + } + + @Override + public boolean hasNext() + { + return objectSummaryIterator.hasNext() || result.isTruncated() || prefixes.hasNext(); + } + + @Override + public InputSplit next() + { + if (!hasNext()) { + throw new NoSuchElementException(); + } + + if (objectSummaryIterator.hasNext()) { + return new InputSplit<>(toUri(objectSummaryIterator.next())); + } + + if (result.isTruncated()) { + fetchNextBatch(); + } else if (prefixes.hasNext()) { + prepareNextRequest(); + fetchNextBatch(); + } + + if (!objectSummaryIterator.hasNext()) { + throw new ISE( + "Failed to further iterate on bucket[%s] and prefix[%s]. The last continuationToken was [%s]", + currentBucket, + currentPrefix, + result.getContinuationToken() + ); + } + + return new InputSplit<>(toUri(objectSummaryIterator.next())); + } + }; + } } diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/firehose/s3/StaticS3FirehoseFactory.java b/extensions-core/s3-extensions/src/main/java/org/apache/druid/firehose/s3/StaticS3FirehoseFactory.java index 4cd9223f8bc3..9a25c29d31ef 100644 --- a/extensions-core/s3-extensions/src/main/java/org/apache/druid/firehose/s3/StaticS3FirehoseFactory.java +++ b/extensions-core/s3-extensions/src/main/java/org/apache/druid/firehose/s3/StaticS3FirehoseFactory.java @@ -21,18 +21,22 @@ import com.amazonaws.services.s3.model.AmazonS3Exception; import com.amazonaws.services.s3.model.GetObjectRequest; +import com.amazonaws.services.s3.model.ObjectMetadata; import com.amazonaws.services.s3.model.S3Object; +import com.amazonaws.services.s3.model.S3ObjectSummary; import com.fasterxml.jackson.annotation.JacksonInject; import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; import com.google.common.base.Preconditions; import com.google.common.base.Predicate; +import com.google.common.collect.Lists; import org.apache.druid.data.input.FiniteFirehoseFactory; import org.apache.druid.data.input.InputSplit; import org.apache.druid.data.input.impl.StringInputRowParser; import org.apache.druid.data.input.impl.prefetch.PrefetchableTextFilesFirehoseFactory; import org.apache.druid.data.input.s3.S3InputSource; import org.apache.druid.java.util.common.IAE; +import org.apache.druid.java.util.common.IOE; import org.apache.druid.java.util.common.ISE; import org.apache.druid.java.util.common.logger.Logger; import org.apache.druid.storage.s3.S3Utils; @@ -45,8 +49,10 @@ import java.util.ArrayList; import java.util.Collection; import java.util.Collections; +import java.util.Iterator; import java.util.List; import java.util.Objects; +import java.util.stream.Collectors; /** * Builds firehoses that read from a predefined list of S3 objects and then dry up. @@ -62,7 +68,7 @@ public class StaticS3FirehoseFactory extends PrefetchableTextFilesFirehoseFactor @JsonCreator public StaticS3FirehoseFactory( - @JacksonInject("s3Client") ServerSideEncryptingAmazonS3 s3Client, + @JacksonInject ServerSideEncryptingAmazonS3 s3Client, @JsonProperty("uris") List uris, @JsonProperty("prefixes") List prefixes, @JsonProperty("maxCacheCapacityBytes") Long maxCacheCapacityBytes, @@ -112,7 +118,51 @@ protected Collection initObjects() throws IOException if (!uris.isEmpty()) { return uris; } else { - return S3InputSource.getUrisFromPrefix(s3Client, prefixes); + final List objects = new ArrayList<>(); + for (URI uri : prefixes) { + final String bucket = uri.getAuthority(); + final String prefix = S3Utils.extractS3Key(uri); + + try { + final Iterator objectSummaryIterator = S3Utils.objectSummaryIterator( + s3Client, + bucket, + prefix, + MAX_LISTING_LENGTH + ); + objects.addAll(Lists.newArrayList(objectSummaryIterator)); + } + catch (AmazonS3Exception outerException) { + log.error(outerException, "Exception while listing on %s", uri); + + if (outerException.getStatusCode() == 403) { + // The "Access Denied" means users might not have a proper permission for listing on the given uri. + // Usually this is not a problem, but the uris might be the full paths to input objects instead of prefixes. + // In this case, users should be able to get objects if they have a proper permission for GetObject. + + log.warn("Access denied for %s. Try to get the object from the uri without listing", uri); + try { + final ObjectMetadata objectMetadata = s3Client.getObjectMetadata(bucket, prefix); + + if (!S3Utils.isDirectoryPlaceholder(prefix, objectMetadata)) { + objects.add(S3Utils.getSingleObjectSummary(s3Client, bucket, prefix)); + } else { + throw new IOE( + "[%s] is a directory placeholder, " + + "but failed to get the object list under the directory due to permission", + uri + ); + } + } + catch (AmazonS3Exception innerException) { + throw new IOException(innerException); + } + } else { + throw new IOException(outerException); + } + } + } + return objects.stream().map(S3InputSource::toUri).collect(Collectors.toList()); } } From f7bb70e18d637fe84457cd66f08696035237d0e7 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Wed, 20 Nov 2019 01:46:45 -0800 Subject: [PATCH 06/28] fixes and hella tests --- .../druid/data/input/AbstractInputSource.java | 5 +- .../druid/data/input/s3/S3InputSource.java | 15 +- .../data/input/s3/S3InputSourceTest.java | 175 ++++++++++++++---- 3 files changed, 151 insertions(+), 44 deletions(-) diff --git a/core/src/main/java/org/apache/druid/data/input/AbstractInputSource.java b/core/src/main/java/org/apache/druid/data/input/AbstractInputSource.java index 34dfe1c4abaa..94ddcd6b79e7 100644 --- a/core/src/main/java/org/apache/druid/data/input/AbstractInputSource.java +++ b/core/src/main/java/org/apache/druid/data/input/AbstractInputSource.java @@ -23,7 +23,6 @@ import javax.annotation.Nullable; import java.io.File; -import java.io.IOException; /** * Abstract class for {@link InputSource}. This class provides a default implementation of {@link #reader} with @@ -37,7 +36,7 @@ public InputSourceReader reader( InputRowSchema inputRowSchema, @Nullable InputFormat inputFormat, @Nullable File temporaryDirectory - ) throws IOException + ) { if (needsFormat()) { return formattableReader( @@ -54,7 +53,7 @@ protected InputSourceReader formattableReader( InputRowSchema inputRowSchema, InputFormat inputFormat, @Nullable File temporaryDirectory - ) throws IOException + ) { throw new UnsupportedOperationException("Implement this method properly if needsFormat() = true"); } diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSource.java b/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSource.java index dbc54e213a94..9c8859df5dc4 100644 --- a/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSource.java +++ b/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSource.java @@ -210,6 +210,12 @@ private Iterable> getIterableObjectsFromPrefixes() return () -> objectFetchingIterator(s3Client, prefixes.iterator()); } + /** + * iterator which fetches batches of {@link #MAX_LISTING_LENGTH} objects given a set of prefixes, using + * {@link ServerSideEncryptingAmazonS3#listObjectsV2}, with a fallback to + * {@link ServerSideEncryptingAmazonS3#getObjectMetadata} to check if the 'prefix' is an object in the event the + * list objects call responds with a 403 http status code + */ private static Iterator> objectFetchingIterator( final ServerSideEncryptingAmazonS3 s3Client, final Iterator prefixes @@ -262,10 +268,11 @@ private void fetchNextBatch() S3Utils.retryS3Operation(() -> s3Client.getObjectMetadata(currentBucket, currentPrefix)); if (!S3Utils.isDirectoryPlaceholder(currentPrefix, objectMetadata)) { - - objectSummaryIterator = Iterators.singletonIterator( - S3Utils.getSingleObjectSummary(s3Client, currentBucket, currentPrefix) - ); + // it's not a directory, so just generate an object summary + S3ObjectSummary fabricated = new S3ObjectSummary(); + fabricated.setBucketName(currentBucket); + fabricated.setKey(currentPrefix); + objectSummaryIterator = Iterators.singletonIterator(fabricated); } else { throw new RE( "[%s] is a directory placeholder, " diff --git a/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java b/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java index 7e63f462513a..6283270daec7 100644 --- a/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java +++ b/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java @@ -21,8 +21,14 @@ import com.amazonaws.services.s3.AmazonS3; import com.amazonaws.services.s3.AmazonS3Client; +import com.amazonaws.services.s3.Headers; +import com.amazonaws.services.s3.model.AmazonS3Exception; +import com.amazonaws.services.s3.model.GetObjectMetadataRequest; +import com.amazonaws.services.s3.model.GetObjectRequest; import com.amazonaws.services.s3.model.ListObjectsV2Request; import com.amazonaws.services.s3.model.ListObjectsV2Result; +import com.amazonaws.services.s3.model.ObjectMetadata; +import com.amazonaws.services.s3.model.S3Object; import com.amazonaws.services.s3.model.S3ObjectSummary; import com.fasterxml.jackson.core.JsonParser; import com.fasterxml.jackson.databind.DeserializationContext; @@ -36,17 +42,28 @@ import com.google.inject.Guice; import com.google.inject.Injector; import com.google.inject.Provides; +import org.apache.druid.data.input.InputRow; +import org.apache.druid.data.input.InputRowSchema; +import org.apache.druid.data.input.InputSourceReader; import org.apache.druid.data.input.InputSplit; +import org.apache.druid.data.input.impl.CsvInputFormat; +import org.apache.druid.data.input.impl.DimensionsSpec; import org.apache.druid.data.input.impl.JsonInputFormat; +import org.apache.druid.data.input.impl.TimestampSpec; import org.apache.druid.initialization.DruidModule; +import org.apache.druid.java.util.common.DateTimes; +import org.apache.druid.java.util.common.StringUtils; +import org.apache.druid.java.util.common.parsers.CloseableIterator; import org.apache.druid.java.util.common.parsers.JSONPathSpec; import org.apache.druid.storage.s3.NoopServerSideEncryption; import org.apache.druid.storage.s3.S3Utils; import org.apache.druid.storage.s3.ServerSideEncryptingAmazonS3; import org.easymock.EasyMock; +import org.joda.time.DateTime; import org.junit.Assert; import org.junit.Test; +import java.io.ByteArrayInputStream; import java.io.IOException; import java.net.URI; import java.util.Arrays; @@ -62,75 +79,132 @@ public class S3InputSourceTest new NoopServerSideEncryption() ); + private static final List EXPECTED_URIS = Arrays.asList( + URI.create("s3://foo/bar/file.csv"), + URI.create("s3://bar/foo/file2.csv") + ); + + private static final List PREFIXES = Arrays.asList( + URI.create("s3://foo/bar"), + URI.create("s3://bar/foo") + ); + + private static final DateTime NOW = DateTimes.nowUtc(); + private static final byte[] CONTENT = StringUtils.toUtf8(StringUtils.format("%d,hello,world", NOW.getMillis())); + @Test - public void testSerde() throws Exception + public void testSerdeWithUris() throws Exception { final ObjectMapper mapper = createS3ObjectMapper(); - final List uris = Arrays.asList( - new URI("s3://foo/bar/file.gz"), - new URI("s3://bar/foo/file2.gz") - ); - - final List prefixes = Arrays.asList( - new URI("s3://foo/bar"), - new URI("s3://bar/foo") - ); - - final S3InputSource withUris = new S3InputSource(SERVICE, uris, null); + final S3InputSource withUris = new S3InputSource(SERVICE, EXPECTED_URIS, null); final S3InputSource serdeWithUris = mapper.readValue(mapper.writeValueAsString(withUris), S3InputSource.class); Assert.assertEquals(withUris, serdeWithUris); + } - final S3InputSource withPrefixes = new S3InputSource(SERVICE, null, prefixes); - final S3InputSource serdeWithPrefixes = mapper.readValue(mapper.writeValueAsString(withPrefixes), S3InputSource.class); + @Test + public void testSerdeWithPrefixes() throws Exception + { + final ObjectMapper mapper = createS3ObjectMapper(); + + final S3InputSource withPrefixes = new S3InputSource(SERVICE, null, PREFIXES); + final S3InputSource serdeWithPrefixes = + mapper.readValue(mapper.writeValueAsString(withPrefixes), S3InputSource.class); Assert.assertEquals(withPrefixes, serdeWithPrefixes); } @Test - public void testWithUrisSplit() throws IOException + public void testWithUrisSplit() { - final List uris = Arrays.asList( - URI.create("s3://foo/bar/file.gz"), - URI.create("s3://bar/foo/file2.gz") + S3InputSource inputSource = new S3InputSource(SERVICE, EXPECTED_URIS, null); + + Stream> splits = inputSource.createSplits( + new JsonInputFormat(JSONPathSpec.DEFAULT, null), + null ); - S3InputSource inputSource = new S3InputSource(SERVICE, uris, null); + Assert.assertEquals(EXPECTED_URIS, splits.map(InputSplit::get).collect(Collectors.toList())); + } + + @Test + public void testWithPrefixesSplit() + { + EasyMock.reset(S3_CLIENT); + addExpectedPrefixObjects(PREFIXES.get(0), ImmutableList.of(EXPECTED_URIS.get(0))); + addExpectedPrefixObjects(PREFIXES.get(1), ImmutableList.of(EXPECTED_URIS.get(1))); + EasyMock.replay(S3_CLIENT); + + S3InputSource inputSource = new S3InputSource(SERVICE, null, PREFIXES); Stream> splits = inputSource.createSplits( new JsonInputFormat(JSONPathSpec.DEFAULT, null), null ); - Assert.assertEquals(uris, splits.map(InputSplit::get).collect(Collectors.toList())); + Assert.assertEquals(EXPECTED_URIS, splits.map(InputSplit::get).collect(Collectors.toList())); } @Test - public void testWithPrefixesSplit() throws IOException + public void testWithPrefixesWhereOneIsUrisAndNoListPermissionSplit() { - final List prefixes = Arrays.asList( - URI.create("s3://foo/bar"), - URI.create("s3://bar/foo") + EasyMock.reset(S3_CLIENT); + addExpectedPrefixObjects(PREFIXES.get(0), ImmutableList.of(EXPECTED_URIS.get(0))); + addExpectedNonPrefixObjectsWithNoListPermission(EXPECTED_URIS.get(1)); + EasyMock.replay(S3_CLIENT); + + S3InputSource inputSource = new S3InputSource( + SERVICE, + null, + ImmutableList.of(PREFIXES.get(0), EXPECTED_URIS.get(1)) ); - final List expectedUris = Arrays.asList( - URI.create("s3://foo/bar/file.gz"), - URI.create("s3://bar/foo/file2.gz") + Stream> splits = inputSource.createSplits( + new JsonInputFormat(JSONPathSpec.DEFAULT, null), + null ); - addExpectedPrefixObjects(SERVICE, URI.create("s3://foo/bar"), ImmutableList.of(expectedUris.get(0))); - addExpectedPrefixObjects(SERVICE, URI.create("s3://bar/foo"), ImmutableList.of(expectedUris.get(1))); + + Assert.assertEquals(EXPECTED_URIS, splits.map(InputSplit::get).collect(Collectors.toList())); + } + + @Test + public void testReader() throws IOException + { + EasyMock.reset(S3_CLIENT); + addExpectedPrefixObjects(PREFIXES.get(0), ImmutableList.of(EXPECTED_URIS.get(0))); + addExpectedNonPrefixObjectsWithNoListPermission(EXPECTED_URIS.get(1)); + addExpectedGetObjectMock(EXPECTED_URIS.get(0)); + addExpectedGetObjectMock(EXPECTED_URIS.get(1)); EasyMock.replay(S3_CLIENT); - S3InputSource inputSource = new S3InputSource(SERVICE, null, prefixes); + S3InputSource inputSource = new S3InputSource( + SERVICE, + null, + ImmutableList.of(PREFIXES.get(0), EXPECTED_URIS.get(1)) + ); - Stream> splits = inputSource.createSplits( - new JsonInputFormat(JSONPathSpec.DEFAULT, null), + InputRowSchema someSchema = new InputRowSchema( + new TimestampSpec("time", "auto", null), + new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("dim1", "dim2"))), + ImmutableList.of("count") + ); + + InputSourceReader reader = inputSource.reader( + someSchema, + new CsvInputFormat(ImmutableList.of("time", "dim1", "dim2"), "|", false, 0), null ); - Assert.assertEquals(expectedUris, splits.map(InputSplit::get).collect(Collectors.toList())); + CloseableIterator iterator = reader.read(); + + while (iterator.hasNext()) { + InputRow nextRow = iterator.next(); + Assert.assertEquals(NOW, nextRow.getTimestamp()); + Assert.assertEquals("hello", nextRow.getDimension("dim1").get(0)); + Assert.assertEquals("world", nextRow.getDimension("dim2").get(0)); + } } - private static void addExpectedPrefixObjects(ServerSideEncryptingAmazonS3 service, URI prefix, List uris) + private static void addExpectedPrefixObjects(URI prefix, List uris) { final String s3Bucket = prefix.getAuthority(); final ListObjectsV2Result result = new ListObjectsV2Result(); @@ -143,7 +217,34 @@ private static void addExpectedPrefixObjects(ServerSideEncryptingAmazonS3 servic objectSummary.setKey(key); result.getObjectSummaries().add(objectSummary); } - EasyMock.expect(service.listObjectsV2(EasyMock.anyObject(ListObjectsV2Request.class))).andReturn(result).once(); + EasyMock.expect(S3_CLIENT.listObjectsV2(EasyMock.anyObject(ListObjectsV2Request.class))).andReturn(result).once(); + } + + private static void addExpectedNonPrefixObjectsWithNoListPermission(URI uri) + { + AmazonS3Exception boom = new AmazonS3Exception("oh dang, you can't list that bucket friend"); + boom.setStatusCode(403); + EasyMock.expect(S3_CLIENT.listObjectsV2(EasyMock.anyObject(ListObjectsV2Request.class))).andThrow(boom).once(); + + ObjectMetadata metadata = new ObjectMetadata(); + metadata.setContentLength(CONTENT.length); + metadata.setContentEncoding("text/csv"); + metadata.setHeader(Headers.ETAG, "some-totally-real-etag-base64-hash-i-guess"); + EasyMock.expect(S3_CLIENT.getObjectMetadata(EasyMock.anyObject(GetObjectMetadataRequest.class))) + .andReturn(metadata) + .once(); + } + + private static void addExpectedGetObjectMock(URI uri) + { + final String s3Bucket = uri.getAuthority(); + final String key = S3Utils.extractS3Key(uri); + + S3Object someObject = new S3Object(); + someObject.setBucketName(s3Bucket); + someObject.setKey(key); + someObject.setObjectContent(new ByteArrayInputStream(CONTENT)); + EasyMock.expect(S3_CLIENT.getObject(EasyMock.anyObject(GetObjectRequest.class))).andReturn(someObject).once(); } public static ObjectMapper createS3ObjectMapper() @@ -184,12 +285,12 @@ public ServerSideEncryptingAmazonS3 getAmazonS3Client() public static class ItemDeserializer extends StdDeserializer { - public ItemDeserializer() + ItemDeserializer() { this(null); } - public ItemDeserializer(Class vc) + ItemDeserializer(Class vc) { super(vc); } From a4f6ae9ae2f81381d865e87ec5e1219d275f299c Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Wed, 20 Nov 2019 05:00:55 -0800 Subject: [PATCH 07/28] fix it --- extensions-core/s3-extensions/pom.xml | 14 +++++++++++++- .../druid/data/input/s3/S3InputSourceTest.java | 3 ++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/extensions-core/s3-extensions/pom.xml b/extensions-core/s3-extensions/pom.xml index 8854400b9a12..25e3c04d6d94 100644 --- a/extensions-core/s3-extensions/pom.xml +++ b/extensions-core/s3-extensions/pom.xml @@ -113,8 +113,20 @@ aws-java-sdk-s3 provided - + + joda-time + joda-time + 2.10.5 + provided + + + org.apache.druid + druid-core + ${project.parent.version} + test-jar + test + org.apache.druid druid-server diff --git a/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java b/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java index 6283270daec7..e792d2ce520f 100644 --- a/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java +++ b/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java @@ -58,6 +58,7 @@ import org.apache.druid.storage.s3.NoopServerSideEncryption; import org.apache.druid.storage.s3.S3Utils; import org.apache.druid.storage.s3.ServerSideEncryptingAmazonS3; +import org.apache.druid.testing.InitializedNullHandlingTest; import org.easymock.EasyMock; import org.joda.time.DateTime; import org.junit.Assert; @@ -71,7 +72,7 @@ import java.util.stream.Collectors; import java.util.stream.Stream; -public class S3InputSourceTest +public class S3InputSourceTest extends InitializedNullHandlingTest { private static final AmazonS3Client S3_CLIENT = EasyMock.createNiceMock(AmazonS3Client.class); private static final ServerSideEncryptingAmazonS3 SERVICE = new ServerSideEncryptingAmazonS3( From d23a1317a19222e50aff3c081e8e8112868c6ddb Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Wed, 20 Nov 2019 12:48:00 -0800 Subject: [PATCH 08/28] re-use better iterator --- .../druid/data/input/s3/S3InputSource.java | 161 +----------------- .../firehose/s3/StaticS3FirehoseFactory.java | 6 +- .../s3/S3TimestampVersionedDataFinder.java | 58 +++---- .../org/apache/druid/storage/s3/S3Utils.java | 112 ++++++++++-- 4 files changed, 131 insertions(+), 206 deletions(-) diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSource.java b/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSource.java index 9c8859df5dc4..530e32f17a93 100644 --- a/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSource.java +++ b/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSource.java @@ -19,17 +19,12 @@ package org.apache.druid.data.input.s3; -import com.amazonaws.services.s3.model.AmazonS3Exception; -import com.amazonaws.services.s3.model.ListObjectsV2Request; -import com.amazonaws.services.s3.model.ListObjectsV2Result; -import com.amazonaws.services.s3.model.ObjectMetadata; import com.amazonaws.services.s3.model.S3ObjectSummary; import com.fasterxml.jackson.annotation.JacksonInject; import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; import com.google.common.base.Preconditions; import com.google.common.collect.ImmutableList; -import com.google.common.collect.Iterators; import org.apache.druid.data.input.AbstractInputSource; import org.apache.druid.data.input.InputFormat; import org.apache.druid.data.input.InputRowSchema; @@ -39,10 +34,6 @@ import org.apache.druid.data.input.impl.InputEntityIteratingReader; import org.apache.druid.data.input.impl.SplittableInputSource; import org.apache.druid.java.util.common.IAE; -import org.apache.druid.java.util.common.ISE; -import org.apache.druid.java.util.common.RE; -import org.apache.druid.java.util.common.StringUtils; -import org.apache.druid.java.util.common.logger.Logger; import org.apache.druid.storage.s3.S3Utils; import org.apache.druid.storage.s3.ServerSideEncryptingAmazonS3; @@ -50,16 +41,13 @@ import java.io.File; import java.net.URI; import java.util.ArrayList; -import java.util.Iterator; import java.util.List; -import java.util.NoSuchElementException; import java.util.Objects; import java.util.stream.Stream; import java.util.stream.StreamSupport; public class S3InputSource extends AbstractInputSource implements SplittableInputSource { - private static final Logger log = new Logger(S3InputSource.class); private static final int MAX_LISTING_LENGTH = 1024; private final ServerSideEncryptingAmazonS3 s3Client; @@ -113,7 +101,9 @@ public Stream> createSplits(InputFormat inputFormat, @Nullable S return uris.stream().map(InputSplit::new); } - return StreamSupport.stream(getIterableObjectsFromPrefixes().spliterator(), false); + return StreamSupport.stream(getIterableObjectsFromPrefixes().spliterator(), false) + .map(S3Utils::summaryToUri) + .map(InputSplit::new); } @Override @@ -156,26 +146,6 @@ protected InputSourceReader formattableReader( ); } - - /** - * Create an {@link URI} from the given {@link S3ObjectSummary}. The result URI is composed as below. - * - *
-   * {@code s3://{BUCKET_NAME}/{OBJECT_KEY}}
-   * 
- */ - public static URI toUri(S3ObjectSummary object) - { - final String originalAuthority = object.getBucketName(); - final String originalPath = object.getKey(); - final String authority = originalAuthority.endsWith("/") ? - originalAuthority.substring(0, originalAuthority.length() - 1) : - originalAuthority; - final String path = originalPath.startsWith("/") ? originalPath.substring(1) : originalPath; - - return URI.create(StringUtils.format("s3://%s/%s", authority, path)); - } - @Override public boolean equals(Object o) { @@ -205,129 +175,8 @@ public String toString() '}'; } - private Iterable> getIterableObjectsFromPrefixes() + private Iterable getIterableObjectsFromPrefixes() { - return () -> objectFetchingIterator(s3Client, prefixes.iterator()); - } - - /** - * iterator which fetches batches of {@link #MAX_LISTING_LENGTH} objects given a set of prefixes, using - * {@link ServerSideEncryptingAmazonS3#listObjectsV2}, with a fallback to - * {@link ServerSideEncryptingAmazonS3#getObjectMetadata} to check if the 'prefix' is an object in the event the - * list objects call responds with a 403 http status code - */ - private static Iterator> objectFetchingIterator( - final ServerSideEncryptingAmazonS3 s3Client, - final Iterator prefixes - ) - { - return new Iterator>() - { - private ListObjectsV2Request request; - private ListObjectsV2Result result; - private URI currentUri; - private String currentBucket; - private String currentPrefix; - private Iterator objectSummaryIterator; - - { - prepareNextRequest(); - fetchNextBatch(); - } - - private void prepareNextRequest() - { - currentUri = prefixes.next(); - currentBucket = currentUri.getAuthority(); - currentPrefix = S3Utils.extractS3Key(currentUri); - - request = new ListObjectsV2Request() - .withBucketName(currentBucket) - .withPrefix(currentPrefix) - .withMaxKeys(S3InputSource.MAX_LISTING_LENGTH); - } - - private void fetchNextBatch() - { - try { - result = S3Utils.retryS3Operation(() -> s3Client.listObjectsV2(request)); - objectSummaryIterator = result.getObjectSummaries().iterator(); - request.setContinuationToken(result.getContinuationToken()); - } - catch (AmazonS3Exception outerException) { - log.error(outerException, "Exception while listing on %s", currentUri); - - if (outerException.getStatusCode() == 403) { - // The "Access Denied" means users might not have a proper permission for listing on the given uri. - // Usually this is not a problem, but the uris might be the full paths to input objects instead of prefixes. - // In this case, users should be able to get objects if they have a proper permission for GetObject. - - log.warn("Access denied for %s. Try to get the object from the uri without listing", currentUri); - try { - final ObjectMetadata objectMetadata = - S3Utils.retryS3Operation(() -> s3Client.getObjectMetadata(currentBucket, currentPrefix)); - - if (!S3Utils.isDirectoryPlaceholder(currentPrefix, objectMetadata)) { - // it's not a directory, so just generate an object summary - S3ObjectSummary fabricated = new S3ObjectSummary(); - fabricated.setBucketName(currentBucket); - fabricated.setKey(currentPrefix); - objectSummaryIterator = Iterators.singletonIterator(fabricated); - } else { - throw new RE( - "[%s] is a directory placeholder, " - + "but failed to get the object list under the directory due to permission", - currentUri - ); - } - } - catch (Exception innerException) { - throw new RuntimeException(innerException); - } - } else { - throw new RuntimeException(outerException); - } - } - catch (Exception ex) { - throw new RuntimeException(ex); - } - } - - @Override - public boolean hasNext() - { - return objectSummaryIterator.hasNext() || result.isTruncated() || prefixes.hasNext(); - } - - @Override - public InputSplit next() - { - if (!hasNext()) { - throw new NoSuchElementException(); - } - - if (objectSummaryIterator.hasNext()) { - return new InputSplit<>(toUri(objectSummaryIterator.next())); - } - - if (result.isTruncated()) { - fetchNextBatch(); - } else if (prefixes.hasNext()) { - prepareNextRequest(); - fetchNextBatch(); - } - - if (!objectSummaryIterator.hasNext()) { - throw new ISE( - "Failed to further iterate on bucket[%s] and prefix[%s]. The last continuationToken was [%s]", - currentBucket, - currentPrefix, - result.getContinuationToken() - ); - } - - return new InputSplit<>(toUri(objectSummaryIterator.next())); - } - }; + return () -> S3Utils.lazyFetchingObjectSummariesIterator(s3Client, prefixes.iterator(), MAX_LISTING_LENGTH); } } diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/firehose/s3/StaticS3FirehoseFactory.java b/extensions-core/s3-extensions/src/main/java/org/apache/druid/firehose/s3/StaticS3FirehoseFactory.java index 9a25c29d31ef..e202876f8baf 100644 --- a/extensions-core/s3-extensions/src/main/java/org/apache/druid/firehose/s3/StaticS3FirehoseFactory.java +++ b/extensions-core/s3-extensions/src/main/java/org/apache/druid/firehose/s3/StaticS3FirehoseFactory.java @@ -34,7 +34,6 @@ import org.apache.druid.data.input.InputSplit; import org.apache.druid.data.input.impl.StringInputRowParser; import org.apache.druid.data.input.impl.prefetch.PrefetchableTextFilesFirehoseFactory; -import org.apache.druid.data.input.s3.S3InputSource; import org.apache.druid.java.util.common.IAE; import org.apache.druid.java.util.common.IOE; import org.apache.druid.java.util.common.ISE; @@ -126,8 +125,7 @@ protected Collection initObjects() throws IOException try { final Iterator objectSummaryIterator = S3Utils.objectSummaryIterator( s3Client, - bucket, - prefix, + uri, MAX_LISTING_LENGTH ); objects.addAll(Lists.newArrayList(objectSummaryIterator)); @@ -162,7 +160,7 @@ protected Collection initObjects() throws IOException } } } - return objects.stream().map(S3InputSource::toUri).collect(Collectors.toList()); + return objects.stream().map(S3Utils::summaryToUri).collect(Collectors.toList()); } } diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3TimestampVersionedDataFinder.java b/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3TimestampVersionedDataFinder.java index 5ef94c0ced8d..429d64beb97e 100644 --- a/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3TimestampVersionedDataFinder.java +++ b/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3TimestampVersionedDataFinder.java @@ -22,7 +22,6 @@ import com.amazonaws.services.s3.model.S3ObjectSummary; import com.google.inject.Inject; import org.apache.druid.data.SearchableVersionedDataFinder; -import org.apache.druid.java.util.common.RetryUtils; import org.apache.druid.java.util.common.StringUtils; import javax.annotation.Nullable; @@ -57,39 +56,32 @@ public S3TimestampVersionedDataFinder(ServerSideEncryptingAmazonS3 s3Client) public URI getLatestVersion(final URI uri, final @Nullable Pattern pattern) { try { - return RetryUtils.retry( - () -> { - final S3Coords coords = new S3Coords(checkURI(uri)); - long mostRecent = Long.MIN_VALUE; - URI latest = null; - final Iterator objectSummaryIterator = S3Utils.objectSummaryIterator( - s3Client, - coords.bucket, - coords.path, - MAX_LISTING_KEYS - ); - while (objectSummaryIterator.hasNext()) { - final S3ObjectSummary objectSummary = objectSummaryIterator.next(); - String keyString = objectSummary.getKey().substring(coords.path.length()); - if (keyString.startsWith("/")) { - keyString = keyString.substring(1); - } - if (pattern != null && !pattern.matcher(keyString).matches()) { - continue; - } - final long latestModified = objectSummary.getLastModified().getTime(); - if (latestModified >= mostRecent) { - mostRecent = latestModified; - latest = new URI( - StringUtils.format("s3://%s/%s", objectSummary.getBucketName(), objectSummary.getKey()) - ); - } - } - return latest; - }, - shouldRetryPredicate(), - DEFAULT_RETRY_COUNT + final S3Coords coords = new S3Coords(checkURI(uri)); + long mostRecent = Long.MIN_VALUE; + URI latest = null; + final Iterator objectSummaryIterator = S3Utils.objectSummaryIterator( + s3Client, + uri, + MAX_LISTING_KEYS ); + while (objectSummaryIterator.hasNext()) { + final S3ObjectSummary objectSummary = objectSummaryIterator.next(); + String keyString = objectSummary.getKey().substring(coords.path.length()); + if (keyString.startsWith("/")) { + keyString = keyString.substring(1); + } + if (pattern != null && !pattern.matcher(keyString).matches()) { + continue; + } + final long latestModified = objectSummary.getLastModified().getTime(); + if (latestModified >= mostRecent) { + mostRecent = latestModified; + latest = new URI( + StringUtils.format("s3://%s/%s", objectSummary.getBucketName(), objectSummary.getKey()) + ); + } + } + return latest; } catch (Exception e) { throw new RuntimeException(e); diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3Utils.java b/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3Utils.java index 71ceba67446f..114001a5a89e 100644 --- a/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3Utils.java +++ b/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3Utils.java @@ -32,9 +32,12 @@ import com.amazonaws.services.s3.model.S3ObjectSummary; import com.google.common.base.Joiner; import com.google.common.base.Predicate; +import com.google.common.collect.Iterators; import org.apache.druid.java.util.common.ISE; +import org.apache.druid.java.util.common.RE; import org.apache.druid.java.util.common.RetryUtils; import org.apache.druid.java.util.common.RetryUtils.Task; +import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.java.util.common.logger.Logger; import java.io.File; @@ -56,7 +59,8 @@ static boolean isServiceExceptionRecoverable(AmazonServiceException ex) { final boolean isIOException = ex.getCause() instanceof IOException; final boolean isTimeout = "RequestTimeout".equals(ex.getErrorCode()); - return isIOException || isTimeout; + final boolean badStatusCode = ex.getStatusCode() == 400 || ex.getStatusCode() == 403 || ex.getStatusCode() == 404; + return !badStatusCode && (isIOException || isTimeout); } public static final Predicate S3RETRY = new Predicate() @@ -107,36 +111,95 @@ static boolean isObjectInBucketIgnoringPermission( public static Iterator objectSummaryIterator( final ServerSideEncryptingAmazonS3 s3Client, - final String bucket, - final String prefix, + final URI prefix, final int numMaxKeys ) { - final ListObjectsV2Request request = new ListObjectsV2Request() - .withBucketName(bucket) - .withPrefix(prefix) - .withMaxKeys(numMaxKeys); + return lazyFetchingObjectSummariesIterator(s3Client, Iterators.singletonIterator(prefix), numMaxKeys); + } + public static Iterator lazyFetchingObjectSummariesIterator( + final ServerSideEncryptingAmazonS3 s3Client, + final Iterator prefixes, + final int maxListingLength + ) + { return new Iterator() { + private ListObjectsV2Request request; private ListObjectsV2Result result; + private URI currentUri; + private String currentBucket; + private String currentPrefix; private Iterator objectSummaryIterator; { + prepareNextRequest(); fetchNextBatch(); } + private void prepareNextRequest() + { + currentUri = prefixes.next(); + currentBucket = currentUri.getAuthority(); + currentPrefix = S3Utils.extractS3Key(currentUri); + + request = new ListObjectsV2Request() + .withBucketName(currentBucket) + .withPrefix(currentPrefix) + .withMaxKeys(maxListingLength); + } + private void fetchNextBatch() { - result = s3Client.listObjectsV2(request); - objectSummaryIterator = result.getObjectSummaries().iterator(); - request.setContinuationToken(result.getContinuationToken()); + try { + result = S3Utils.retryS3Operation(() -> s3Client.listObjectsV2(request)); + objectSummaryIterator = result.getObjectSummaries().iterator(); + request.setContinuationToken(result.getContinuationToken()); + } + catch (AmazonS3Exception outerException) { + log.error(outerException, "Exception while listing on %s", currentUri); + + if (outerException.getStatusCode() == 403) { + // The "Access Denied" means users might not have a proper permission for listing on the given uri. + // Usually this is not a problem, but the uris might be the full paths to input objects instead of prefixes. + // In this case, users should be able to get objects if they have a proper permission for GetObject. + + log.warn("Access denied for %s. Try to get the object from the uri without listing", currentUri); + try { + final ObjectMetadata objectMetadata = + S3Utils.retryS3Operation(() -> s3Client.getObjectMetadata(currentBucket, currentPrefix)); + + if (!S3Utils.isDirectoryPlaceholder(currentPrefix, objectMetadata)) { + // it's not a directory, so just generate an object summary + S3ObjectSummary fabricated = new S3ObjectSummary(); + fabricated.setBucketName(currentBucket); + fabricated.setKey(currentPrefix); + objectSummaryIterator = Iterators.singletonIterator(fabricated); + } else { + throw new RE( + "[%s] is a directory placeholder, " + + "but failed to get the object list under the directory due to permission", + currentUri + ); + } + } + catch (Exception innerException) { + throw new RuntimeException(innerException); + } + } else { + throw new RuntimeException(outerException); + } + } + catch (Exception ex) { + throw new RuntimeException(ex); + } } @Override public boolean hasNext() { - return objectSummaryIterator.hasNext() || result.isTruncated(); + return objectSummaryIterator.hasNext() || result.isTruncated() || prefixes.hasNext(); } @Override @@ -152,13 +215,16 @@ public S3ObjectSummary next() if (result.isTruncated()) { fetchNextBatch(); + } else if (prefixes.hasNext()) { + prepareNextRequest(); + fetchNextBatch(); } if (!objectSummaryIterator.hasNext()) { throw new ISE( "Failed to further iterate on bucket[%s] and prefix[%s]. The last continuationToken was [%s]", - bucket, - prefix, + currentBucket, + currentPrefix, result.getContinuationToken() ); } @@ -168,6 +234,26 @@ public S3ObjectSummary next() }; } + + /** + * Create an {@link URI} from the given {@link S3ObjectSummary}. The result URI is composed as below. + * + *
+   * {@code s3://{BUCKET_NAME}/{OBJECT_KEY}}
+   * 
+ */ + public static URI summaryToUri(S3ObjectSummary object) + { + final String originalAuthority = object.getBucketName(); + final String originalPath = object.getKey(); + final String authority = originalAuthority.endsWith("/") ? + originalAuthority.substring(0, originalAuthority.length() - 1) : + originalAuthority; + final String path = originalPath.startsWith("/") ? originalPath.substring(1) : originalPath; + + return URI.create(StringUtils.format("s3://%s/%s", authority, path)); + } + public static String constructSegmentPath(String baseKey, String storageDir) { return JOINER.join( From 765a6f41ae739e7e24bbd8b977d225ed136d7904 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Wed, 20 Nov 2019 13:24:41 -0800 Subject: [PATCH 09/28] use key --- .../druid/data/input/google/GoogleCloudStorageEntity.java | 2 +- .../src/main/java/org/apache/druid/data/input/s3/S3Entity.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/extensions-core/google-extensions/src/main/java/org/apache/druid/data/input/google/GoogleCloudStorageEntity.java b/extensions-core/google-extensions/src/main/java/org/apache/druid/data/input/google/GoogleCloudStorageEntity.java index 5a3256eb374f..248cf839e0d4 100644 --- a/extensions-core/google-extensions/src/main/java/org/apache/druid/data/input/google/GoogleCloudStorageEntity.java +++ b/extensions-core/google-extensions/src/main/java/org/apache/druid/data/input/google/GoogleCloudStorageEntity.java @@ -56,7 +56,7 @@ public InputStream open() throws IOException final String bucket = uri.getAuthority(); final String key = GoogleUtils.extractGoogleCloudStorageObjectKey(uri); final GoogleByteSource byteSource = new GoogleByteSource(storage, bucket, key); - return CompressionUtils.decompress(byteSource.openStream(), uri.getPath()); + return CompressionUtils.decompress(byteSource.openStream(), key); } @Override diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3Entity.java b/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3Entity.java index a62eda3599d3..b3801af4aadf 100644 --- a/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3Entity.java +++ b/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3Entity.java @@ -61,7 +61,7 @@ public InputStream open() throws IOException if (s3Object == null) { throw new ISE("Failed to get an s3 object for bucket[%s] and key[%s]", bucket, key); } - return CompressionUtils.decompress(s3Object.getObjectContent(), uri.toString()); + return CompressionUtils.decompress(s3Object.getObjectContent(), key); } catch (AmazonS3Exception e) { throw new IOException(e); From 23d63ec7c80af6c29fc8f7d23761f89456b6eee6 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Wed, 20 Nov 2019 15:16:28 -0800 Subject: [PATCH 10/28] javadoc and checkstyle --- .../org/apache/druid/storage/s3/S3Utils.java | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3Utils.java b/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3Utils.java index 114001a5a89e..822cc4037048 100644 --- a/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3Utils.java +++ b/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3Utils.java @@ -59,7 +59,7 @@ static boolean isServiceExceptionRecoverable(AmazonServiceException ex) { final boolean isIOException = ex.getCause() instanceof IOException; final boolean isTimeout = "RequestTimeout".equals(ex.getErrorCode()); - final boolean badStatusCode = ex.getStatusCode() == 400 || ex.getStatusCode() == 403 || ex.getStatusCode() == 404; + final boolean badStatusCode = ex.getStatusCode() == 400 || ex.getStatusCode() == 403 || ex.getStatusCode() == 404; return !badStatusCode && (isIOException || isTimeout); } @@ -118,9 +118,19 @@ public static Iterator objectSummaryIterator( return lazyFetchingObjectSummariesIterator(s3Client, Iterators.singletonIterator(prefix), numMaxKeys); } + /** + * Create an iterator over a set of s3 objects specified by a set of 'prefixes' which may be paths or individual + * objects, in order to get {@link S3ObjectSummary} for each discovered object. This iterator is computed lazily as it + * is iterated, calling {@link ServerSideEncryptingAmazonS3#listObjectsV2} for each prefix in batches of + * {@param maxListLength}, falling back to {@link ServerSideEncryptingAmazonS3#getObjectMetadata} if the list API + * returns a 403 status code as a fallback to check if the URI is a single object instead of a directory. These + * summaries are supplied to the outer iterator until drained, then if additional results for the current prefix are + * still available, it will continue fetching and repeat the process, else it will move on to the next prefix, + * continuing until all objects have been evaluated. + */ public static Iterator lazyFetchingObjectSummariesIterator( final ServerSideEncryptingAmazonS3 s3Client, - final Iterator prefixes, + final Iterator uris, final int maxListingLength ) { @@ -140,7 +150,7 @@ public static Iterator lazyFetchingObjectSummariesIterator( private void prepareNextRequest() { - currentUri = prefixes.next(); + currentUri = uris.next(); currentBucket = currentUri.getAuthority(); currentPrefix = S3Utils.extractS3Key(currentUri); @@ -199,7 +209,7 @@ private void fetchNextBatch() @Override public boolean hasNext() { - return objectSummaryIterator.hasNext() || result.isTruncated() || prefixes.hasNext(); + return objectSummaryIterator.hasNext() || result.isTruncated() || uris.hasNext(); } @Override @@ -215,7 +225,7 @@ public S3ObjectSummary next() if (result.isTruncated()) { fetchNextBatch(); - } else if (prefixes.hasNext()) { + } else if (uris.hasNext()) { prepareNextRequest(); fetchNextBatch(); } From a16758b7cc5af758ece8cac0aba7cb88931d5d13 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Wed, 20 Nov 2019 16:40:43 -0800 Subject: [PATCH 11/28] exception --- .../apache/druid/data/input/impl/TimedShutoffInputSource.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/core/src/main/java/org/apache/druid/data/input/impl/TimedShutoffInputSource.java b/core/src/main/java/org/apache/druid/data/input/impl/TimedShutoffInputSource.java index fd944e5835c1..099c591c1a71 100644 --- a/core/src/main/java/org/apache/druid/data/input/impl/TimedShutoffInputSource.java +++ b/core/src/main/java/org/apache/druid/data/input/impl/TimedShutoffInputSource.java @@ -28,6 +28,7 @@ import javax.annotation.Nullable; import java.io.File; +import java.io.IOException; /** * A wrapping InputSource that will close the underlying InputSource at {@link #shutoffTime}. @@ -76,7 +77,7 @@ public InputSourceReader reader( InputRowSchema inputRowSchema, @Nullable InputFormat inputFormat, File temporaryDirectory - ) + ) throws IOException { return new TimedShutoffInputSourceReader( delegate.reader(inputRowSchema, inputFormat, temporaryDirectory), From 7125e3e94bd468bc82c95fad68538890a23c69ee Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Wed, 20 Nov 2019 17:21:32 -0800 Subject: [PATCH 12/28] oops --- .../druid/indexing/overlord/sampler/InputSourceSampler.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/sampler/InputSourceSampler.java b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/sampler/InputSourceSampler.java index 8ae900f18d17..483cf4321dc7 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/sampler/InputSourceSampler.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/sampler/InputSourceSampler.java @@ -50,6 +50,7 @@ import javax.annotation.Nullable; import java.io.File; +import java.io.IOException; import java.util.Arrays; import java.util.HashMap; import java.util.List; @@ -87,7 +88,7 @@ public SamplerResponse sample( @Nullable final InputFormat inputFormat, @Nullable final DataSchema dataSchema, @Nullable final SamplerConfig samplerConfig - ) + ) throws IOException { Preconditions.checkNotNull(inputSource, "inputSource required"); if (inputSource.needsFormat()) { @@ -190,7 +191,7 @@ private InputSourceReader buildReader( InputSource inputSource, InputFormat inputFormat, File tempDir - ) + ) throws IOException { final List metricsNames = Arrays.stream(dataSchema.getAggregators()) .map(AggregatorFactory::getName) From 4f221c2fb4476c32bd0b008f8af6c3c2e26e3f30 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Wed, 20 Nov 2019 22:49:23 -0800 Subject: [PATCH 13/28] refactor to use S3Coords instead of URI --- .../apache/druid/data/input/InputSource.java | 3 +- .../input/impl/TimedShutoffInputSource.java | 3 +- .../druid/java/util/common/StringUtils.java | 10 +++ .../apache/druid/data/input/s3/S3Entity.java | 20 ++--- .../druid/data/input/s3/S3InputSource.java | 40 +++++++-- .../org/apache/druid/storage/s3/S3Coords.java | 87 +++++++++++++++++++ .../druid/storage/s3/S3DataSegmentPuller.java | 31 ------- .../apache/druid/storage/s3/S3LoadSpec.java | 2 +- .../org/apache/druid/storage/s3/S3Utils.java | 5 ++ .../data/input/s3/S3InputSourceTest.java | 19 ++-- .../storage/s3/S3DataSegmentPullerTest.java | 4 +- .../overlord/sampler/InputSourceSampler.java | 5 +- 12 files changed, 162 insertions(+), 67 deletions(-) create mode 100644 extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3Coords.java diff --git a/core/src/main/java/org/apache/druid/data/input/InputSource.java b/core/src/main/java/org/apache/druid/data/input/InputSource.java index 352b9dbadca3..09e95116a0a5 100644 --- a/core/src/main/java/org/apache/druid/data/input/InputSource.java +++ b/core/src/main/java/org/apache/druid/data/input/InputSource.java @@ -29,7 +29,6 @@ import javax.annotation.Nullable; import java.io.File; -import java.io.IOException; /** * InputSource abstracts the storage system where input data is stored. It creates an {@link InputSourceReader} @@ -80,5 +79,5 @@ InputSourceReader reader( InputRowSchema inputRowSchema, @Nullable InputFormat inputFormat, @Nullable File temporaryDirectory - ) throws IOException; + ); } diff --git a/core/src/main/java/org/apache/druid/data/input/impl/TimedShutoffInputSource.java b/core/src/main/java/org/apache/druid/data/input/impl/TimedShutoffInputSource.java index 099c591c1a71..fd944e5835c1 100644 --- a/core/src/main/java/org/apache/druid/data/input/impl/TimedShutoffInputSource.java +++ b/core/src/main/java/org/apache/druid/data/input/impl/TimedShutoffInputSource.java @@ -28,7 +28,6 @@ import javax.annotation.Nullable; import java.io.File; -import java.io.IOException; /** * A wrapping InputSource that will close the underlying InputSource at {@link #shutoffTime}. @@ -77,7 +76,7 @@ public InputSourceReader reader( InputRowSchema inputRowSchema, @Nullable InputFormat inputFormat, File temporaryDirectory - ) throws IOException + ) { return new TimedShutoffInputSourceReader( delegate.reader(inputRowSchema, inputFormat, temporaryDirectory), diff --git a/core/src/main/java/org/apache/druid/java/util/common/StringUtils.java b/core/src/main/java/org/apache/druid/java/util/common/StringUtils.java index d5e39e31e4d8..2eca5a3bd648 100644 --- a/core/src/main/java/org/apache/druid/java/util/common/StringUtils.java +++ b/core/src/main/java/org/apache/druid/java/util/common/StringUtils.java @@ -222,6 +222,16 @@ public static String urlEncode(@Nullable String s) } } + @Nullable + public static String urlEncodeFormat(@Nullable String message, Object... formatArgs) + { + return String.format( + Locale.ENGLISH, + message, + Arrays.stream(formatArgs).map(x -> urlEncode(x != null ? x.toString() : null)) + ); + } + @Nullable public static String urlDecode(String s) { diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3Entity.java b/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3Entity.java index b3801af4aadf..6aa7d34e6ac4 100644 --- a/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3Entity.java +++ b/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3Entity.java @@ -24,6 +24,7 @@ import com.google.common.base.Predicate; import org.apache.druid.data.input.InputEntity; import org.apache.druid.java.util.common.ISE; +import org.apache.druid.storage.s3.S3Coords; import org.apache.druid.storage.s3.S3Utils; import org.apache.druid.storage.s3.ServerSideEncryptingAmazonS3; import org.apache.druid.utils.CompressionUtils; @@ -35,33 +36,32 @@ public class S3Entity implements InputEntity { private final ServerSideEncryptingAmazonS3 s3Client; - private final URI uri; + private final S3Coords entityLocation; - S3Entity(ServerSideEncryptingAmazonS3 s3Client, URI uri) + S3Entity(ServerSideEncryptingAmazonS3 s3Client, S3Coords coords) { this.s3Client = s3Client; - this.uri = uri; + this.entityLocation = coords; } @Override public URI getUri() { - return uri; + return null; } @Override public InputStream open() throws IOException { try { + final String bucket = entityLocation.getBucket(); + final String path = entityLocation.getPath(); // Get data of the given object and open an input stream - final String bucket = uri.getAuthority(); - final String key = S3Utils.extractS3Key(uri); - - final S3Object s3Object = s3Client.getObject(bucket, key); + final S3Object s3Object = s3Client.getObject(bucket, path); if (s3Object == null) { - throw new ISE("Failed to get an s3 object for bucket[%s] and key[%s]", bucket, key); + throw new ISE("Failed to get an s3 object for bucket[%s] and key[%s]", bucket, path); } - return CompressionUtils.decompress(s3Object.getObjectContent(), key); + return CompressionUtils.decompress(s3Object.getObjectContent(), path); } catch (AmazonS3Exception e) { throw new IOException(e); diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSource.java b/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSource.java index 530e32f17a93..184b35d32c15 100644 --- a/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSource.java +++ b/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSource.java @@ -22,9 +22,9 @@ import com.amazonaws.services.s3.model.S3ObjectSummary; import com.fasterxml.jackson.annotation.JacksonInject; import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonProperty; import com.google.common.base.Preconditions; -import com.google.common.collect.ImmutableList; import org.apache.druid.data.input.AbstractInputSource; import org.apache.druid.data.input.InputFormat; import org.apache.druid.data.input.InputRowSchema; @@ -34,6 +34,7 @@ import org.apache.druid.data.input.impl.InputEntityIteratingReader; import org.apache.druid.data.input.impl.SplittableInputSource; import org.apache.druid.java.util.common.IAE; +import org.apache.druid.storage.s3.S3Coords; import org.apache.druid.storage.s3.S3Utils; import org.apache.druid.storage.s3.ServerSideEncryptingAmazonS3; @@ -46,13 +47,15 @@ import java.util.stream.Stream; import java.util.stream.StreamSupport; -public class S3InputSource extends AbstractInputSource implements SplittableInputSource +public class S3InputSource extends AbstractInputSource implements SplittableInputSource { private static final int MAX_LISTING_LENGTH = 1024; private final ServerSideEncryptingAmazonS3 s3Client; private final List uris; private final List prefixes; + @JsonIgnore + private final InputSplit inputSplit; @JsonCreator public S3InputSource( @@ -62,6 +65,7 @@ public S3InputSource( ) { this.s3Client = Preconditions.checkNotNull(s3Client, "s3Client"); + this.inputSplit = null; this.uris = uris == null ? new ArrayList<>() : uris; this.prefixes = prefixes == null ? new ArrayList<>() : prefixes; @@ -82,6 +86,14 @@ public S3InputSource( } } + private S3InputSource(ServerSideEncryptingAmazonS3 s3Client, InputSplit inputSplit) + { + this.s3Client = Preconditions.checkNotNull(s3Client, "s3Client"); + this.inputSplit = Preconditions.checkNotNull(inputSplit, "inputSplit"); + this.uris = new ArrayList<>(); + this.prefixes = new ArrayList<>(); + } + @JsonProperty public List getUris() { @@ -95,20 +107,28 @@ public List getPrefixes() } @Override - public Stream> createSplits(InputFormat inputFormat, @Nullable SplitHintSpec splitHintSpec) + public Stream> createSplits(InputFormat inputFormat, @Nullable SplitHintSpec splitHintSpec) { + if (inputSplit != null) { + return Stream.of(inputSplit); + } + if (!uris.isEmpty()) { - return uris.stream().map(InputSplit::new); + return uris.stream().map(S3Coords::new).map(InputSplit::new); } return StreamSupport.stream(getIterableObjectsFromPrefixes().spliterator(), false) - .map(S3Utils::summaryToUri) + .map(S3Utils::summaryToS3Coords) .map(InputSplit::new); } @Override public int getNumSplits(InputFormat inputFormat, @Nullable SplitHintSpec splitHintSpec) { + if (inputSplit != null) { + return 1; + } + if (!uris.isEmpty()) { return uris.size(); } @@ -117,9 +137,9 @@ public int getNumSplits(InputFormat inputFormat, @Nullable SplitHintSpec splitHi } @Override - public SplittableInputSource withSplit(InputSplit split) + public SplittableInputSource withSplit(InputSplit split) { - return new S3InputSource(s3Client, ImmutableList.of(split.get()), ImmutableList.of()); + return new S3InputSource(s3Client, split); } @Override @@ -157,13 +177,14 @@ public boolean equals(Object o) } S3InputSource that = (S3InputSource) o; return Objects.equals(uris, that.uris) && - Objects.equals(prefixes, that.prefixes); + Objects.equals(prefixes, that.prefixes) && + Objects.equals(inputSplit, that.inputSplit); } @Override public int hashCode() { - return Objects.hash(uris, prefixes); + return Objects.hash(uris, prefixes, inputSplit); } @Override @@ -172,6 +193,7 @@ public String toString() return "S3InputSource{" + "uris=" + uris + ", prefixes=" + prefixes + + ", inputSplit=" + inputSplit + '}'; } diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3Coords.java b/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3Coords.java new file mode 100644 index 000000000000..a154f5073d08 --- /dev/null +++ b/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3Coords.java @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.storage.s3; + +import org.apache.druid.java.util.common.IAE; +import org.apache.druid.java.util.common.StringUtils; + +import java.net.URI; +import java.util.Objects; + +public class S3Coords +{ + final String bucket; + final String path; + + public S3Coords(URI uri) + { + if (!"s3".equalsIgnoreCase(uri.getScheme())) { + throw new IAE("Unsupported scheme: [%s]", uri.getScheme()); + } + bucket = uri.getHost(); + String path = uri.getPath(); + if (path.startsWith("/")) { + path = path.substring(1); + } + this.path = path; + } + + public S3Coords(String bucket, String key) + { + this.bucket = bucket; + this.path = key; + } + + public String getBucket() + { + return bucket; + } + + public String getPath() + { + return path; + } + + @Override + public String toString() + { + return StringUtils.format("s3://%s/%s", bucket, path); + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + S3Coords s3Coords = (S3Coords) o; + return bucket.equals(s3Coords.bucket) && + path.equals(s3Coords.path); + } + + @Override + public int hashCode() + { + return Objects.hash(bucket, path); + } +} diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3DataSegmentPuller.java b/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3DataSegmentPuller.java index e2e0535286c7..4d35753e295f 100644 --- a/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3DataSegmentPuller.java +++ b/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3DataSegmentPuller.java @@ -317,35 +317,4 @@ private boolean isObjectInBucket(final S3Coords coords) throws SegmentLoadingExc throw new RuntimeException(e); } } - - protected static class S3Coords - { - String bucket; - String path; - - public S3Coords(URI uri) - { - if (!"s3".equalsIgnoreCase(uri.getScheme())) { - throw new IAE("Unsupported scheme: [%s]", uri.getScheme()); - } - bucket = uri.getHost(); - String path = uri.getPath(); - if (path.startsWith("/")) { - path = path.substring(1); - } - this.path = path; - } - - public S3Coords(String bucket, String key) - { - this.bucket = bucket; - this.path = key; - } - - @Override - public String toString() - { - return StringUtils.format("s3://%s/%s", bucket, path); - } - } } diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3LoadSpec.java b/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3LoadSpec.java index 19e3b69ea242..ba58b608cdb9 100644 --- a/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3LoadSpec.java +++ b/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3LoadSpec.java @@ -57,7 +57,7 @@ public S3LoadSpec( @Override public LoadSpecResult loadSegment(File outDir) throws SegmentLoadingException { - return new LoadSpecResult(puller.getSegmentFiles(new S3DataSegmentPuller.S3Coords(bucket, key), outDir).size()); + return new LoadSpecResult(puller.getSegmentFiles(new S3Coords(bucket, key), outDir).size()); } @JsonProperty(S3DataSegmentPuller.BUCKET) diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3Utils.java b/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3Utils.java index 822cc4037048..bb3d90903e61 100644 --- a/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3Utils.java +++ b/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3Utils.java @@ -264,6 +264,11 @@ public static URI summaryToUri(S3ObjectSummary object) return URI.create(StringUtils.format("s3://%s/%s", authority, path)); } + public static S3Coords summaryToS3Coords(S3ObjectSummary object) + { + return new S3Coords(object.getBucketName(), object.getKey()); + } + public static String constructSegmentPath(String baseKey, String storageDir) { return JOINER.join( diff --git a/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java b/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java index e792d2ce520f..fa1b969e1eb2 100644 --- a/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java +++ b/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java @@ -56,6 +56,7 @@ import org.apache.druid.java.util.common.parsers.CloseableIterator; import org.apache.druid.java.util.common.parsers.JSONPathSpec; import org.apache.druid.storage.s3.NoopServerSideEncryption; +import org.apache.druid.storage.s3.S3Coords; import org.apache.druid.storage.s3.S3Utils; import org.apache.druid.storage.s3.ServerSideEncryptingAmazonS3; import org.apache.druid.testing.InitializedNullHandlingTest; @@ -85,13 +86,17 @@ public class S3InputSourceTest extends InitializedNullHandlingTest URI.create("s3://bar/foo/file2.csv") ); + private static final List EXPECTED_COORDS = + EXPECTED_URIS.stream().map(S3Coords::new).collect(Collectors.toList()); + private static final List PREFIXES = Arrays.asList( URI.create("s3://foo/bar"), URI.create("s3://bar/foo") ); private static final DateTime NOW = DateTimes.nowUtc(); - private static final byte[] CONTENT = StringUtils.toUtf8(StringUtils.format("%d,hello,world", NOW.getMillis())); + private static final byte[] CONTENT = + StringUtils.toUtf8(StringUtils.format("%d,hello,world", NOW.getMillis())); @Test public void testSerdeWithUris() throws Exception @@ -119,12 +124,12 @@ public void testWithUrisSplit() { S3InputSource inputSource = new S3InputSource(SERVICE, EXPECTED_URIS, null); - Stream> splits = inputSource.createSplits( + Stream> splits = inputSource.createSplits( new JsonInputFormat(JSONPathSpec.DEFAULT, null), null ); - Assert.assertEquals(EXPECTED_URIS, splits.map(InputSplit::get).collect(Collectors.toList())); + Assert.assertEquals(EXPECTED_COORDS, splits.map(InputSplit::get).collect(Collectors.toList())); } @Test @@ -137,12 +142,12 @@ public void testWithPrefixesSplit() S3InputSource inputSource = new S3InputSource(SERVICE, null, PREFIXES); - Stream> splits = inputSource.createSplits( + Stream> splits = inputSource.createSplits( new JsonInputFormat(JSONPathSpec.DEFAULT, null), null ); - Assert.assertEquals(EXPECTED_URIS, splits.map(InputSplit::get).collect(Collectors.toList())); + Assert.assertEquals(EXPECTED_COORDS, splits.map(InputSplit::get).collect(Collectors.toList())); } @Test @@ -159,12 +164,12 @@ public void testWithPrefixesWhereOneIsUrisAndNoListPermissionSplit() ImmutableList.of(PREFIXES.get(0), EXPECTED_URIS.get(1)) ); - Stream> splits = inputSource.createSplits( + Stream> splits = inputSource.createSplits( new JsonInputFormat(JSONPathSpec.DEFAULT, null), null ); - Assert.assertEquals(EXPECTED_URIS, splits.map(InputSplit::get).collect(Collectors.toList())); + Assert.assertEquals(EXPECTED_COORDS, splits.map(InputSplit::get).collect(Collectors.toList())); } @Test diff --git a/extensions-core/s3-extensions/src/test/java/org/apache/druid/storage/s3/S3DataSegmentPullerTest.java b/extensions-core/s3-extensions/src/test/java/org/apache/druid/storage/s3/S3DataSegmentPullerTest.java index a4306d12b97d..1db5970ac429 100644 --- a/extensions-core/s3-extensions/src/test/java/org/apache/druid/storage/s3/S3DataSegmentPullerTest.java +++ b/extensions-core/s3-extensions/src/test/java/org/apache/druid/storage/s3/S3DataSegmentPullerTest.java @@ -124,7 +124,7 @@ public void testGZUncompress() throws IOException, SegmentLoadingException EasyMock.replay(s3Client); FileUtils.FileCopyResult result = puller.getSegmentFiles( - new S3DataSegmentPuller.S3Coords( + new S3Coords( bucket, object0.getKey() ), tmpDir @@ -191,7 +191,7 @@ public void testGZUncompressRetries() throws IOException, SegmentLoadingExceptio EasyMock.replay(s3Client); FileUtils.FileCopyResult result = puller.getSegmentFiles( - new S3DataSegmentPuller.S3Coords( + new S3Coords( bucket, object0.getKey() ), tmpDir diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/sampler/InputSourceSampler.java b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/sampler/InputSourceSampler.java index 483cf4321dc7..8ae900f18d17 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/sampler/InputSourceSampler.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/sampler/InputSourceSampler.java @@ -50,7 +50,6 @@ import javax.annotation.Nullable; import java.io.File; -import java.io.IOException; import java.util.Arrays; import java.util.HashMap; import java.util.List; @@ -88,7 +87,7 @@ public SamplerResponse sample( @Nullable final InputFormat inputFormat, @Nullable final DataSchema dataSchema, @Nullable final SamplerConfig samplerConfig - ) throws IOException + ) { Preconditions.checkNotNull(inputSource, "inputSource required"); if (inputSource.needsFormat()) { @@ -191,7 +190,7 @@ private InputSourceReader buildReader( InputSource inputSource, InputFormat inputFormat, File tempDir - ) throws IOException + ) { final List metricsNames = Arrays.stream(dataSchema.getAggregators()) .map(AggregatorFactory::getName) From 74fd2eb012c7e9992f9d30b51955060359a97f21 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Thu, 21 Nov 2019 01:37:51 -0800 Subject: [PATCH 14/28] remove unused code, add retrying stream to handle s3 stream --- .../impl/prefetch/RetryingInputStream.java | 4 +- .../druid/java/util/common/StringUtils.java | 10 --- .../apache/druid/data/input/s3/S3Entity.java | 65 ++++++++++++++----- .../org/apache/druid/storage/s3/S3Utils.java | 5 +- 4 files changed, 55 insertions(+), 29 deletions(-) diff --git a/core/src/main/java/org/apache/druid/data/input/impl/prefetch/RetryingInputStream.java b/core/src/main/java/org/apache/druid/data/input/impl/prefetch/RetryingInputStream.java index af401e9eb12e..e7b802348a6c 100644 --- a/core/src/main/java/org/apache/druid/data/input/impl/prefetch/RetryingInputStream.java +++ b/core/src/main/java/org/apache/druid/data/input/impl/prefetch/RetryingInputStream.java @@ -35,7 +35,7 @@ * * @param object type */ -class RetryingInputStream extends InputStream +public class RetryingInputStream extends InputStream { private static final Logger log = new Logger(RetryingInputStream.class); @@ -47,7 +47,7 @@ class RetryingInputStream extends InputStream private CountingInputStream delegate; private long startOffset; - RetryingInputStream( + public RetryingInputStream( T object, ObjectOpenFunction objectOpenFunction, Predicate retryCondition, diff --git a/core/src/main/java/org/apache/druid/java/util/common/StringUtils.java b/core/src/main/java/org/apache/druid/java/util/common/StringUtils.java index 2eca5a3bd648..d5e39e31e4d8 100644 --- a/core/src/main/java/org/apache/druid/java/util/common/StringUtils.java +++ b/core/src/main/java/org/apache/druid/java/util/common/StringUtils.java @@ -222,16 +222,6 @@ public static String urlEncode(@Nullable String s) } } - @Nullable - public static String urlEncodeFormat(@Nullable String message, Object... formatArgs) - { - return String.format( - Locale.ENGLISH, - message, - Arrays.stream(formatArgs).map(x -> urlEncode(x != null ? x.toString() : null)) - ); - } - @Nullable public static String urlDecode(String s) { diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3Entity.java b/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3Entity.java index 6aa7d34e6ac4..f0fae57f076b 100644 --- a/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3Entity.java +++ b/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3Entity.java @@ -20,9 +20,12 @@ package org.apache.druid.data.input.s3; import com.amazonaws.services.s3.model.AmazonS3Exception; +import com.amazonaws.services.s3.model.GetObjectRequest; import com.amazonaws.services.s3.model.S3Object; import com.google.common.base.Predicate; import org.apache.druid.data.input.InputEntity; +import org.apache.druid.data.input.impl.prefetch.ObjectOpenFunction; +import org.apache.druid.data.input.impl.prefetch.RetryingInputStream; import org.apache.druid.java.util.common.ISE; import org.apache.druid.storage.s3.S3Coords; import org.apache.druid.storage.s3.S3Utils; @@ -35,13 +38,13 @@ public class S3Entity implements InputEntity { - private final ServerSideEncryptingAmazonS3 s3Client; private final S3Coords entityLocation; + private final ObjectOpenFunction s3OpenFunction; S3Entity(ServerSideEncryptingAmazonS3 s3Client, S3Coords coords) { - this.s3Client = s3Client; this.entityLocation = coords; + this.s3OpenFunction = new S3OpenFunction(s3Client); } @Override @@ -53,19 +56,13 @@ public URI getUri() @Override public InputStream open() throws IOException { - try { - final String bucket = entityLocation.getBucket(); - final String path = entityLocation.getPath(); - // Get data of the given object and open an input stream - final S3Object s3Object = s3Client.getObject(bucket, path); - if (s3Object == null) { - throw new ISE("Failed to get an s3 object for bucket[%s] and key[%s]", bucket, path); - } - return CompressionUtils.decompress(s3Object.getObjectContent(), path); - } - catch (AmazonS3Exception e) { - throw new IOException(e); - } + RetryingInputStream retryingStream = new RetryingInputStream<>( + entityLocation, + s3OpenFunction, + S3Utils.S3RETRY, + S3Utils.MAX_S3_RETRIES + ); + return CompressionUtils.decompress(retryingStream, entityLocation.getPath()); } @Override @@ -73,4 +70,42 @@ public Predicate getFetchRetryCondition() { return S3Utils.S3RETRY; } + + private static class S3OpenFunction implements ObjectOpenFunction + { + private final ServerSideEncryptingAmazonS3 s3Client; + + S3OpenFunction(ServerSideEncryptingAmazonS3 s3Client) + { + this.s3Client = s3Client; + } + + @Override + public InputStream open(S3Coords object) throws IOException + { + return open(object, 0L); + } + + @Override + public InputStream open(S3Coords object, long start) throws IOException + { + final GetObjectRequest request = new GetObjectRequest(object.getBucket(), object.getPath()); + request.setRange(start); + try { + final S3Object s3Object = s3Client.getObject(request); + if (s3Object == null) { + throw new ISE( + "Failed to get an s3 object for bucket[%s], key[%s], and start[%d]", + object.getBucket(), + object.getPath(), + start + ); + } + return s3Object.getObjectContent(); + } + catch (AmazonS3Exception e) { + throw new IOException(e); + } + } + } } diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3Utils.java b/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3Utils.java index bb3d90903e61..e2366661edae 100644 --- a/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3Utils.java +++ b/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3Utils.java @@ -55,6 +55,8 @@ public class S3Utils private static final String MIMETYPE_JETS3T_DIRECTORY = "application/x-directory"; private static final Logger log = new Logger(S3Utils.class); + public static final int MAX_S3_RETRIES = 10; + static boolean isServiceExceptionRecoverable(AmazonServiceException ex) { final boolean isIOException = ex.getCause() instanceof IOException; @@ -86,8 +88,7 @@ public boolean apply(Throwable e) */ public static T retryS3Operation(Task f) throws Exception { - final int maxTries = 10; - return RetryUtils.retry(f, S3RETRY, maxTries); + return RetryUtils.retry(f, S3RETRY, MAX_S3_RETRIES); } static boolean isObjectInBucketIgnoringPermission( From b4527c8966766f9e707afbe69257a97e215538cc Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Thu, 21 Nov 2019 05:12:42 -0800 Subject: [PATCH 15/28] remove unused parameter --- .../org/apache/druid/data/input/s3/S3InputSourceTest.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java b/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java index fa1b969e1eb2..e71053fddfb1 100644 --- a/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java +++ b/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java @@ -155,7 +155,7 @@ public void testWithPrefixesWhereOneIsUrisAndNoListPermissionSplit() { EasyMock.reset(S3_CLIENT); addExpectedPrefixObjects(PREFIXES.get(0), ImmutableList.of(EXPECTED_URIS.get(0))); - addExpectedNonPrefixObjectsWithNoListPermission(EXPECTED_URIS.get(1)); + addExpectedNonPrefixObjectsWithNoListPermission(); EasyMock.replay(S3_CLIENT); S3InputSource inputSource = new S3InputSource( @@ -177,7 +177,7 @@ public void testReader() throws IOException { EasyMock.reset(S3_CLIENT); addExpectedPrefixObjects(PREFIXES.get(0), ImmutableList.of(EXPECTED_URIS.get(0))); - addExpectedNonPrefixObjectsWithNoListPermission(EXPECTED_URIS.get(1)); + addExpectedNonPrefixObjectsWithNoListPermission(); addExpectedGetObjectMock(EXPECTED_URIS.get(0)); addExpectedGetObjectMock(EXPECTED_URIS.get(1)); EasyMock.replay(S3_CLIENT); @@ -226,7 +226,7 @@ private static void addExpectedPrefixObjects(URI prefix, List uris) EasyMock.expect(S3_CLIENT.listObjectsV2(EasyMock.anyObject(ListObjectsV2Request.class))).andReturn(result).once(); } - private static void addExpectedNonPrefixObjectsWithNoListPermission(URI uri) + private static void addExpectedNonPrefixObjectsWithNoListPermission() { AmazonS3Exception boom = new AmazonS3Exception("oh dang, you can't list that bucket friend"); boom.setStatusCode(403); From f47edbf00c63a3267b834719367becf1a4188b21 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Fri, 22 Nov 2019 00:44:58 -0800 Subject: [PATCH 16/28] update to latest master --- .../apache/druid/data/input/s3/S3Entity.java | 77 ++++++------------- .../druid/data/input/s3/S3InputSource.java | 73 +++++++++++------- .../org/apache/druid/storage/s3/S3Coords.java | 7 +- .../data/input/s3/S3InputSourceTest.java | 14 ++-- 4 files changed, 81 insertions(+), 90 deletions(-) diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3Entity.java b/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3Entity.java index f0fae57f076b..1685ddbe8e34 100644 --- a/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3Entity.java +++ b/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3Entity.java @@ -23,28 +23,25 @@ import com.amazonaws.services.s3.model.GetObjectRequest; import com.amazonaws.services.s3.model.S3Object; import com.google.common.base.Predicate; -import org.apache.druid.data.input.InputEntity; -import org.apache.druid.data.input.impl.prefetch.ObjectOpenFunction; -import org.apache.druid.data.input.impl.prefetch.RetryingInputStream; +import org.apache.druid.data.input.RetryingInputEntity; import org.apache.druid.java.util.common.ISE; import org.apache.druid.storage.s3.S3Coords; import org.apache.druid.storage.s3.S3Utils; import org.apache.druid.storage.s3.ServerSideEncryptingAmazonS3; -import org.apache.druid.utils.CompressionUtils; import java.io.IOException; import java.io.InputStream; import java.net.URI; -public class S3Entity implements InputEntity +public class S3Entity implements RetryingInputEntity { + private final ServerSideEncryptingAmazonS3 s3Client; private final S3Coords entityLocation; - private final ObjectOpenFunction s3OpenFunction; S3Entity(ServerSideEncryptingAmazonS3 s3Client, S3Coords coords) { + this.s3Client = s3Client; this.entityLocation = coords; - this.s3OpenFunction = new S3OpenFunction(s3Client); } @Override @@ -54,58 +51,30 @@ public URI getUri() } @Override - public InputStream open() throws IOException + public InputStream readFrom(long offset) throws IOException { - RetryingInputStream retryingStream = new RetryingInputStream<>( - entityLocation, - s3OpenFunction, - S3Utils.S3RETRY, - S3Utils.MAX_S3_RETRIES - ); - return CompressionUtils.decompress(retryingStream, entityLocation.getPath()); + final GetObjectRequest request = new GetObjectRequest(entityLocation.getBucket(), entityLocation.getPath()); + request.setRange(offset); + try { + final S3Object s3Object = s3Client.getObject(request); + if (s3Object == null) { + throw new ISE( + "Failed to get an s3 object for bucket[%s], key[%s], and start[%d]", + entityLocation.getBucket(), + entityLocation.getPath(), + offset + ); + } + return s3Object.getObjectContent(); + } + catch (AmazonS3Exception e) { + throw new IOException(e); + } } @Override - public Predicate getFetchRetryCondition() + public Predicate getRetryCondition() { return S3Utils.S3RETRY; } - - private static class S3OpenFunction implements ObjectOpenFunction - { - private final ServerSideEncryptingAmazonS3 s3Client; - - S3OpenFunction(ServerSideEncryptingAmazonS3 s3Client) - { - this.s3Client = s3Client; - } - - @Override - public InputStream open(S3Coords object) throws IOException - { - return open(object, 0L); - } - - @Override - public InputStream open(S3Coords object, long start) throws IOException - { - final GetObjectRequest request = new GetObjectRequest(object.getBucket(), object.getPath()); - request.setRange(start); - try { - final S3Object s3Object = s3Client.getObject(request); - if (s3Object == null) { - throw new ISE( - "Failed to get an s3 object for bucket[%s], key[%s], and start[%d]", - object.getBucket(), - object.getPath(), - start - ); - } - return s3Object.getObjectContent(); - } - catch (AmazonS3Exception e) { - throw new IOException(e); - } - } - } } diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSource.java b/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSource.java index 184b35d32c15..23b2c3fec2dd 100644 --- a/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSource.java +++ b/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSource.java @@ -22,7 +22,6 @@ import com.amazonaws.services.s3.model.S3ObjectSummary; import com.fasterxml.jackson.annotation.JacksonInject; import com.fasterxml.jackson.annotation.JsonCreator; -import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonProperty; import com.google.common.base.Preconditions; import org.apache.druid.data.input.AbstractInputSource; @@ -54,47 +53,57 @@ public class S3InputSource extends AbstractInputSource implements SplittableInpu private final ServerSideEncryptingAmazonS3 s3Client; private final List uris; private final List prefixes; - @JsonIgnore - private final InputSplit inputSplit; + private final S3Coords object; @JsonCreator public S3InputSource( @JacksonInject ServerSideEncryptingAmazonS3 s3Client, @JsonProperty("uris") @Nullable List uris, - @JsonProperty("prefixes") @Nullable List prefixes + @JsonProperty("prefixes") @Nullable List prefixes, + @JsonProperty("object") @Nullable S3Coords object ) { this.s3Client = Preconditions.checkNotNull(s3Client, "s3Client"); - this.inputSplit = null; this.uris = uris == null ? new ArrayList<>() : uris; this.prefixes = prefixes == null ? new ArrayList<>() : prefixes; - if (!this.uris.isEmpty() && !this.prefixes.isEmpty()) { - throw new IAE("uris and prefixes cannot be used together"); - } - - if (this.uris.isEmpty() && this.prefixes.isEmpty()) { - throw new IAE("uris or prefixes must be specified"); - } - - for (final URI inputURI : this.uris) { - Preconditions.checkArgument("s3".equals(inputURI.getScheme()), "input uri scheme == s3 (%s)", inputURI); - } - - for (final URI inputURI : this.prefixes) { - Preconditions.checkArgument("s3".equals(inputURI.getScheme()), "input uri scheme == s3 (%s)", inputURI); + if (object != null) { + this.object = object; + if (!this.uris.isEmpty()) { + throw new IAE("uris cannot be used with object"); + } + if (!this.prefixes.isEmpty()) { + throw new IAE("prefixes cannot be used with object"); + } + } else { + this.object = null; + if (!this.uris.isEmpty() && !this.prefixes.isEmpty()) { + throw new IAE("uris and prefixes cannot be used together"); + } + + if (this.uris.isEmpty() && this.prefixes.isEmpty()) { + throw new IAE("uris or prefixes must be specified"); + } + + for (final URI inputURI : this.uris) { + Preconditions.checkArgument("s3".equals(inputURI.getScheme()), "input uri scheme == s3 (%s)", inputURI); + } + + for (final URI inputURI : this.prefixes) { + Preconditions.checkArgument("s3".equals(inputURI.getScheme()), "input uri scheme == s3 (%s)", inputURI); + } } } - private S3InputSource(ServerSideEncryptingAmazonS3 s3Client, InputSplit inputSplit) + private S3InputSource(ServerSideEncryptingAmazonS3 s3Client, S3Coords inputSplit) { this.s3Client = Preconditions.checkNotNull(s3Client, "s3Client"); - this.inputSplit = Preconditions.checkNotNull(inputSplit, "inputSplit"); + this.object = Preconditions.checkNotNull(inputSplit, "object"); this.uris = new ArrayList<>(); this.prefixes = new ArrayList<>(); } - @JsonProperty + @JsonProperty("uris") public List getUris() { return uris; @@ -106,11 +115,17 @@ public List getPrefixes() return prefixes; } + @JsonProperty("object") + public S3Coords getObject() + { + return object; + } + @Override public Stream> createSplits(InputFormat inputFormat, @Nullable SplitHintSpec splitHintSpec) { - if (inputSplit != null) { - return Stream.of(inputSplit); + if (object != null) { + return Stream.of(new InputSplit<>(object)); } if (!uris.isEmpty()) { @@ -125,7 +140,7 @@ public Stream> createSplits(InputFormat inputFormat, @Nulla @Override public int getNumSplits(InputFormat inputFormat, @Nullable SplitHintSpec splitHintSpec) { - if (inputSplit != null) { + if (object != null) { return 1; } @@ -139,7 +154,7 @@ public int getNumSplits(InputFormat inputFormat, @Nullable SplitHintSpec splitHi @Override public SplittableInputSource withSplit(InputSplit split) { - return new S3InputSource(s3Client, split); + return new S3InputSource(s3Client, split.get()); } @Override @@ -178,13 +193,13 @@ public boolean equals(Object o) S3InputSource that = (S3InputSource) o; return Objects.equals(uris, that.uris) && Objects.equals(prefixes, that.prefixes) && - Objects.equals(inputSplit, that.inputSplit); + Objects.equals(object, that.object); } @Override public int hashCode() { - return Objects.hash(uris, prefixes, inputSplit); + return Objects.hash(uris, prefixes, object); } @Override @@ -193,7 +208,7 @@ public String toString() return "S3InputSource{" + "uris=" + uris + ", prefixes=" + prefixes + - ", inputSplit=" + inputSplit + + ", object=" + object + '}'; } diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3Coords.java b/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3Coords.java index a154f5073d08..3ca618a8bd7e 100644 --- a/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3Coords.java +++ b/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3Coords.java @@ -19,6 +19,8 @@ package org.apache.druid.storage.s3; +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; import org.apache.druid.java.util.common.IAE; import org.apache.druid.java.util.common.StringUtils; @@ -43,17 +45,20 @@ public S3Coords(URI uri) this.path = path; } - public S3Coords(String bucket, String key) + @JsonCreator + public S3Coords(@JsonProperty("bucket") String bucket, @JsonProperty("path") String key) { this.bucket = bucket; this.path = key; } + @JsonProperty("bucket") public String getBucket() { return bucket; } + @JsonProperty("path") public String getPath() { return path; diff --git a/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java b/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java index e71053fddfb1..b733f1e9ded0 100644 --- a/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java +++ b/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java @@ -103,7 +103,7 @@ public void testSerdeWithUris() throws Exception { final ObjectMapper mapper = createS3ObjectMapper(); - final S3InputSource withUris = new S3InputSource(SERVICE, EXPECTED_URIS, null); + final S3InputSource withUris = new S3InputSource(SERVICE, EXPECTED_URIS, null, null); final S3InputSource serdeWithUris = mapper.readValue(mapper.writeValueAsString(withUris), S3InputSource.class); Assert.assertEquals(withUris, serdeWithUris); } @@ -113,7 +113,7 @@ public void testSerdeWithPrefixes() throws Exception { final ObjectMapper mapper = createS3ObjectMapper(); - final S3InputSource withPrefixes = new S3InputSource(SERVICE, null, PREFIXES); + final S3InputSource withPrefixes = new S3InputSource(SERVICE, null, PREFIXES, null); final S3InputSource serdeWithPrefixes = mapper.readValue(mapper.writeValueAsString(withPrefixes), S3InputSource.class); Assert.assertEquals(withPrefixes, serdeWithPrefixes); @@ -122,7 +122,7 @@ public void testSerdeWithPrefixes() throws Exception @Test public void testWithUrisSplit() { - S3InputSource inputSource = new S3InputSource(SERVICE, EXPECTED_URIS, null); + S3InputSource inputSource = new S3InputSource(SERVICE, EXPECTED_URIS, null, null); Stream> splits = inputSource.createSplits( new JsonInputFormat(JSONPathSpec.DEFAULT, null), @@ -140,7 +140,7 @@ public void testWithPrefixesSplit() addExpectedPrefixObjects(PREFIXES.get(1), ImmutableList.of(EXPECTED_URIS.get(1))); EasyMock.replay(S3_CLIENT); - S3InputSource inputSource = new S3InputSource(SERVICE, null, PREFIXES); + S3InputSource inputSource = new S3InputSource(SERVICE, null, PREFIXES, null); Stream> splits = inputSource.createSplits( new JsonInputFormat(JSONPathSpec.DEFAULT, null), @@ -161,7 +161,8 @@ public void testWithPrefixesWhereOneIsUrisAndNoListPermissionSplit() S3InputSource inputSource = new S3InputSource( SERVICE, null, - ImmutableList.of(PREFIXES.get(0), EXPECTED_URIS.get(1)) + ImmutableList.of(PREFIXES.get(0), EXPECTED_URIS.get(1)), + null ); Stream> splits = inputSource.createSplits( @@ -185,7 +186,8 @@ public void testReader() throws IOException S3InputSource inputSource = new S3InputSource( SERVICE, null, - ImmutableList.of(PREFIXES.get(0), EXPECTED_URIS.get(1)) + ImmutableList.of(PREFIXES.get(0), EXPECTED_URIS.get(1)), + null ); InputRowSchema someSchema = new InputRowSchema( From b07e4907eebbf93117c2f9188acb488155118b4b Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Fri, 22 Nov 2019 01:45:11 -0800 Subject: [PATCH 17/28] use list of objects instead of object --- .../data/input/impl/CloudObjectLocation.java | 47 ++++++++++--------- .../apache/druid/data/input/s3/S3Entity.java | 14 +++--- .../druid/data/input/s3/S3InputSource.java | 47 ++++++++++--------- .../druid/storage/s3/S3DataSegmentPuller.java | 39 ++++++--------- .../apache/druid/storage/s3/S3LoadSpec.java | 3 +- .../s3/S3TimestampVersionedDataFinder.java | 5 +- .../org/apache/druid/storage/s3/S3Utils.java | 16 ++++++- .../data/input/s3/S3InputSourceTest.java | 12 ++--- .../storage/s3/S3DataSegmentPullerTest.java | 5 +- 9 files changed, 97 insertions(+), 91 deletions(-) rename extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3Coords.java => core/src/main/java/org/apache/druid/data/input/impl/CloudObjectLocation.java (68%) diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3Coords.java b/core/src/main/java/org/apache/druid/data/input/impl/CloudObjectLocation.java similarity index 68% rename from extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3Coords.java rename to core/src/main/java/org/apache/druid/data/input/impl/CloudObjectLocation.java index 3ca618a8bd7e..42f44abaedb1 100644 --- a/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3Coords.java +++ b/core/src/main/java/org/apache/druid/data/input/impl/CloudObjectLocation.java @@ -17,26 +17,29 @@ * under the License. */ -package org.apache.druid.storage.s3; +package org.apache.druid.data.input.impl; import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; -import org.apache.druid.java.util.common.IAE; -import org.apache.druid.java.util.common.StringUtils; +import com.google.common.base.Preconditions; import java.net.URI; import java.util.Objects; -public class S3Coords +public class CloudObjectLocation { - final String bucket; - final String path; + private final String bucket; + private final String path; - public S3Coords(URI uri) + @JsonCreator + public CloudObjectLocation(@JsonProperty("bucket") String bucket, @JsonProperty("path") String path) + { + this.bucket = Preconditions.checkNotNull(bucket); + this.path = Preconditions.checkNotNull(path); + } + + public CloudObjectLocation(URI uri) { - if (!"s3".equalsIgnoreCase(uri.getScheme())) { - throw new IAE("Unsupported scheme: [%s]", uri.getScheme()); - } bucket = uri.getHost(); String path = uri.getPath(); if (path.startsWith("/")) { @@ -45,20 +48,13 @@ public S3Coords(URI uri) this.path = path; } - @JsonCreator - public S3Coords(@JsonProperty("bucket") String bucket, @JsonProperty("path") String key) - { - this.bucket = bucket; - this.path = key; - } - - @JsonProperty("bucket") + @JsonProperty public String getBucket() { return bucket; } - @JsonProperty("path") + @JsonProperty public String getPath() { return path; @@ -67,7 +63,10 @@ public String getPath() @Override public String toString() { - return StringUtils.format("s3://%s/%s", bucket, path); + return "CloudObjectLocation {" + + "bucket=" + bucket + + ",path=" + path + + "}"; } @Override @@ -76,12 +75,14 @@ public boolean equals(Object o) if (this == o) { return true; } + if (o == null || getClass() != o.getClass()) { return false; } - S3Coords s3Coords = (S3Coords) o; - return bucket.equals(s3Coords.bucket) && - path.equals(s3Coords.path); + + final CloudObjectLocation that = (CloudObjectLocation) o; + return Objects.equals(bucket, that.bucket) && + Objects.equals(path, that.path); } @Override diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3Entity.java b/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3Entity.java index 1685ddbe8e34..aedb631280b1 100644 --- a/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3Entity.java +++ b/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3Entity.java @@ -24,8 +24,8 @@ import com.amazonaws.services.s3.model.S3Object; import com.google.common.base.Predicate; import org.apache.druid.data.input.RetryingInputEntity; +import org.apache.druid.data.input.impl.CloudObjectLocation; import org.apache.druid.java.util.common.ISE; -import org.apache.druid.storage.s3.S3Coords; import org.apache.druid.storage.s3.S3Utils; import org.apache.druid.storage.s3.ServerSideEncryptingAmazonS3; @@ -36,12 +36,12 @@ public class S3Entity implements RetryingInputEntity { private final ServerSideEncryptingAmazonS3 s3Client; - private final S3Coords entityLocation; + private final CloudObjectLocation object; - S3Entity(ServerSideEncryptingAmazonS3 s3Client, S3Coords coords) + S3Entity(ServerSideEncryptingAmazonS3 s3Client, CloudObjectLocation coords) { this.s3Client = s3Client; - this.entityLocation = coords; + this.object = coords; } @Override @@ -53,15 +53,15 @@ public URI getUri() @Override public InputStream readFrom(long offset) throws IOException { - final GetObjectRequest request = new GetObjectRequest(entityLocation.getBucket(), entityLocation.getPath()); + final GetObjectRequest request = new GetObjectRequest(object.getBucket(), object.getPath()); request.setRange(offset); try { final S3Object s3Object = s3Client.getObject(request); if (s3Object == null) { throw new ISE( "Failed to get an s3 object for bucket[%s], key[%s], and start[%d]", - entityLocation.getBucket(), - entityLocation.getPath(), + object.getBucket(), + object.getPath(), offset ); } diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSource.java b/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSource.java index 23b2c3fec2dd..d6f9b223c997 100644 --- a/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSource.java +++ b/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSource.java @@ -24,16 +24,17 @@ import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; import org.apache.druid.data.input.AbstractInputSource; import org.apache.druid.data.input.InputFormat; import org.apache.druid.data.input.InputRowSchema; import org.apache.druid.data.input.InputSourceReader; import org.apache.druid.data.input.InputSplit; import org.apache.druid.data.input.SplitHintSpec; +import org.apache.druid.data.input.impl.CloudObjectLocation; import org.apache.druid.data.input.impl.InputEntityIteratingReader; import org.apache.druid.data.input.impl.SplittableInputSource; import org.apache.druid.java.util.common.IAE; -import org.apache.druid.storage.s3.S3Coords; import org.apache.druid.storage.s3.S3Utils; import org.apache.druid.storage.s3.ServerSideEncryptingAmazonS3; @@ -46,29 +47,29 @@ import java.util.stream.Stream; import java.util.stream.StreamSupport; -public class S3InputSource extends AbstractInputSource implements SplittableInputSource +public class S3InputSource extends AbstractInputSource implements SplittableInputSource { private static final int MAX_LISTING_LENGTH = 1024; private final ServerSideEncryptingAmazonS3 s3Client; private final List uris; private final List prefixes; - private final S3Coords object; + private final List objects; @JsonCreator public S3InputSource( @JacksonInject ServerSideEncryptingAmazonS3 s3Client, @JsonProperty("uris") @Nullable List uris, @JsonProperty("prefixes") @Nullable List prefixes, - @JsonProperty("object") @Nullable S3Coords object + @JsonProperty("objects") @Nullable List objects ) { this.s3Client = Preconditions.checkNotNull(s3Client, "s3Client"); this.uris = uris == null ? new ArrayList<>() : uris; this.prefixes = prefixes == null ? new ArrayList<>() : prefixes; - if (object != null) { - this.object = object; + if (objects != null) { + this.objects = objects; if (!this.uris.isEmpty()) { throw new IAE("uris cannot be used with object"); } @@ -76,7 +77,7 @@ public S3InputSource( throw new IAE("prefixes cannot be used with object"); } } else { - this.object = null; + this.objects = null; if (!this.uris.isEmpty() && !this.prefixes.isEmpty()) { throw new IAE("uris and prefixes cannot be used together"); } @@ -95,10 +96,10 @@ public S3InputSource( } } - private S3InputSource(ServerSideEncryptingAmazonS3 s3Client, S3Coords inputSplit) + private S3InputSource(ServerSideEncryptingAmazonS3 s3Client, CloudObjectLocation inputSplit) { this.s3Client = Preconditions.checkNotNull(s3Client, "s3Client"); - this.object = Preconditions.checkNotNull(inputSplit, "object"); + this.objects = ImmutableList.of(Preconditions.checkNotNull(inputSplit, "object")); this.uris = new ArrayList<>(); this.prefixes = new ArrayList<>(); } @@ -115,33 +116,33 @@ public List getPrefixes() return prefixes; } - @JsonProperty("object") - public S3Coords getObject() + @JsonProperty("objects") + public List getObject() { - return object; + return objects; } @Override - public Stream> createSplits(InputFormat inputFormat, @Nullable SplitHintSpec splitHintSpec) + public Stream> createSplits(InputFormat inputFormat, @Nullable SplitHintSpec splitHintSpec) { - if (object != null) { - return Stream.of(new InputSplit<>(object)); + if (objects != null) { + return objects.stream().map(InputSplit::new); } if (!uris.isEmpty()) { - return uris.stream().map(S3Coords::new).map(InputSplit::new); + return uris.stream().map(S3Utils::checkURI).map(CloudObjectLocation::new).map(InputSplit::new); } return StreamSupport.stream(getIterableObjectsFromPrefixes().spliterator(), false) - .map(S3Utils::summaryToS3Coords) + .map(S3Utils::summaryToCloudObjectLocation) .map(InputSplit::new); } @Override public int getNumSplits(InputFormat inputFormat, @Nullable SplitHintSpec splitHintSpec) { - if (object != null) { - return 1; + if (objects != null) { + return objects.size(); } if (!uris.isEmpty()) { @@ -152,7 +153,7 @@ public int getNumSplits(InputFormat inputFormat, @Nullable SplitHintSpec splitHi } @Override - public SplittableInputSource withSplit(InputSplit split) + public SplittableInputSource withSplit(InputSplit split) { return new S3InputSource(s3Client, split.get()); } @@ -193,13 +194,13 @@ public boolean equals(Object o) S3InputSource that = (S3InputSource) o; return Objects.equals(uris, that.uris) && Objects.equals(prefixes, that.prefixes) && - Objects.equals(object, that.object); + Objects.equals(objects, that.objects); } @Override public int hashCode() { - return Objects.hash(uris, prefixes, object); + return Objects.hash(uris, prefixes, objects); } @Override @@ -208,7 +209,7 @@ public String toString() return "S3InputSource{" + "uris=" + uris + ", prefixes=" + prefixes + - ", object=" + object + + ", objects=" + objects + '}'; } diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3DataSegmentPuller.java b/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3DataSegmentPuller.java index 4d35753e295f..24e823e172b2 100644 --- a/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3DataSegmentPuller.java +++ b/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3DataSegmentPuller.java @@ -28,6 +28,7 @@ import com.google.common.io.ByteSource; import com.google.common.io.Files; import com.google.inject.Inject; +import org.apache.druid.data.input.impl.CloudObjectLocation; import org.apache.druid.java.util.common.FileUtils; import org.apache.druid.java.util.common.IAE; import org.apache.druid.java.util.common.IOE; @@ -55,13 +56,9 @@ */ public class S3DataSegmentPuller implements URIDataPuller { - public static final int DEFAULT_RETRY_COUNT = 3; - - public static final String SCHEME = S3StorageDruidModule.SCHEME; - private static final Logger log = new Logger(S3DataSegmentPuller.class); - protected static final String BUCKET = "bucket"; + static final String BUCKET = "bucket"; protected static final String KEY = "key"; protected final ServerSideEncryptingAmazonS3 s3Client; @@ -72,7 +69,7 @@ public S3DataSegmentPuller(ServerSideEncryptingAmazonS3 s3Client) this.s3Client = s3Client; } - FileUtils.FileCopyResult getSegmentFiles(final S3Coords s3Coords, final File outDir) throws SegmentLoadingException + FileUtils.FileCopyResult getSegmentFiles(final CloudObjectLocation s3Coords, final File outDir) throws SegmentLoadingException { log.info("Pulling index at path[%s] to outDir[%s]", s3Coords, outDir); @@ -84,7 +81,7 @@ FileUtils.FileCopyResult getSegmentFiles(final S3Coords s3Coords, final File out try { org.apache.commons.io.FileUtils.forceMkdir(outDir); - final URI uri = URI.create(StringUtils.format("s3://%s/%s", s3Coords.bucket, s3Coords.path)); + final URI uri = URI.create(StringUtils.format("s3://%s/%s", s3Coords.getBucket(), s3Coords.getPath())); final ByteSource byteSource = new ByteSource() { @Override @@ -103,7 +100,7 @@ public InputStream openStream() throws IOException } } }; - if (CompressionUtils.isZip(s3Coords.path)) { + if (CompressionUtils.isZip(s3Coords.getPath())) { final FileUtils.FileCopyResult result = CompressionUtils.unzip( byteSource, outDir, @@ -113,7 +110,7 @@ public InputStream openStream() throws IOException log.info("Loaded %d bytes from [%s] to [%s]", result.size(), s3Coords.toString(), outDir.getAbsolutePath()); return result; } - if (CompressionUtils.isGz(s3Coords.path)) { + if (CompressionUtils.isGz(s3Coords.getPath())) { final String fname = Files.getNameWithoutExtension(uri.getPath()); final File outFile = new File(outDir, fname); @@ -139,16 +136,6 @@ public InputStream openStream() throws IOException } } - public static URI checkURI(URI uri) - { - if (uri.getScheme().equalsIgnoreCase(SCHEME)) { - uri = URI.create("s3" + uri.toString().substring(SCHEME.length())); - } else if (!"s3".equalsIgnoreCase(uri.getScheme())) { - throw new IAE("Don't know how to load scheme for URI [%s]", uri.toString()); - } - return uri; - } - @Override public InputStream getInputStream(URI uri) throws IOException { @@ -162,8 +149,9 @@ public InputStream getInputStream(URI uri) throws IOException private FileObject buildFileObject(final URI uri) throws AmazonServiceException { - final S3Coords coords = new S3Coords(checkURI(uri)); - final S3ObjectSummary objectSummary = S3Utils.getSingleObjectSummary(s3Client, coords.bucket, coords.path); + final CloudObjectLocation coords = new CloudObjectLocation(S3Utils.checkURI(uri)); + final S3ObjectSummary objectSummary = + S3Utils.getSingleObjectSummary(s3Client, coords.getBucket(), coords.getPath()); final String path = uri.getPath(); return new FileObject() @@ -289,8 +277,9 @@ public boolean apply(Throwable e) public String getVersion(URI uri) throws IOException { try { - final S3Coords coords = new S3Coords(checkURI(uri)); - final S3ObjectSummary objectSummary = S3Utils.getSingleObjectSummary(s3Client, coords.bucket, coords.path); + final CloudObjectLocation coords = new CloudObjectLocation(S3Utils.checkURI(uri)); + final S3ObjectSummary objectSummary = + S3Utils.getSingleObjectSummary(s3Client, coords.getBucket(), coords.getPath()); return StringUtils.format("%d", objectSummary.getLastModified().getTime()); } catch (AmazonServiceException e) { @@ -303,11 +292,11 @@ public String getVersion(URI uri) throws IOException } } - private boolean isObjectInBucket(final S3Coords coords) throws SegmentLoadingException + private boolean isObjectInBucket(final CloudObjectLocation coords) throws SegmentLoadingException { try { return S3Utils.retryS3Operation( - () -> S3Utils.isObjectInBucketIgnoringPermission(s3Client, coords.bucket, coords.path) + () -> S3Utils.isObjectInBucketIgnoringPermission(s3Client, coords.getBucket(), coords.getPath()) ); } catch (AmazonS3Exception | IOException e) { diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3LoadSpec.java b/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3LoadSpec.java index ba58b608cdb9..804e0c64d72d 100644 --- a/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3LoadSpec.java +++ b/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3LoadSpec.java @@ -24,6 +24,7 @@ import com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.annotation.JsonTypeName; import com.google.common.base.Preconditions; +import org.apache.druid.data.input.impl.CloudObjectLocation; import org.apache.druid.segment.loading.LoadSpec; import org.apache.druid.segment.loading.SegmentLoadingException; @@ -57,7 +58,7 @@ public S3LoadSpec( @Override public LoadSpecResult loadSegment(File outDir) throws SegmentLoadingException { - return new LoadSpecResult(puller.getSegmentFiles(new S3Coords(bucket, key), outDir).size()); + return new LoadSpecResult(puller.getSegmentFiles(new CloudObjectLocation(bucket, key), outDir).size()); } @JsonProperty(S3DataSegmentPuller.BUCKET) diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3TimestampVersionedDataFinder.java b/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3TimestampVersionedDataFinder.java index 429d64beb97e..e4ff32b76045 100644 --- a/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3TimestampVersionedDataFinder.java +++ b/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3TimestampVersionedDataFinder.java @@ -22,6 +22,7 @@ import com.amazonaws.services.s3.model.S3ObjectSummary; import com.google.inject.Inject; import org.apache.druid.data.SearchableVersionedDataFinder; +import org.apache.druid.data.input.impl.CloudObjectLocation; import org.apache.druid.java.util.common.StringUtils; import javax.annotation.Nullable; @@ -56,7 +57,7 @@ public S3TimestampVersionedDataFinder(ServerSideEncryptingAmazonS3 s3Client) public URI getLatestVersion(final URI uri, final @Nullable Pattern pattern) { try { - final S3Coords coords = new S3Coords(checkURI(uri)); + final CloudObjectLocation coords = new CloudObjectLocation(S3Utils.checkURI(uri)); long mostRecent = Long.MIN_VALUE; URI latest = null; final Iterator objectSummaryIterator = S3Utils.objectSummaryIterator( @@ -66,7 +67,7 @@ public URI getLatestVersion(final URI uri, final @Nullable Pattern pattern) ); while (objectSummaryIterator.hasNext()) { final S3ObjectSummary objectSummary = objectSummaryIterator.next(); - String keyString = objectSummary.getKey().substring(coords.path.length()); + String keyString = objectSummary.getKey().substring(coords.getPath().length()); if (keyString.startsWith("/")) { keyString = keyString.substring(1); } diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3Utils.java b/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3Utils.java index ee6351289dae..8c222b7e97cd 100644 --- a/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3Utils.java +++ b/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3Utils.java @@ -33,6 +33,8 @@ import com.google.common.base.Joiner; import com.google.common.base.Predicate; import com.google.common.collect.Iterators; +import org.apache.druid.data.input.impl.CloudObjectLocation; +import org.apache.druid.java.util.common.IAE; import org.apache.druid.java.util.common.ISE; import org.apache.druid.java.util.common.RE; import org.apache.druid.java.util.common.RetryUtils; @@ -265,9 +267,9 @@ public static URI summaryToUri(S3ObjectSummary object) return URI.create(StringUtils.format("s3://%s/%s", authority, path)); } - public static S3Coords summaryToS3Coords(S3ObjectSummary object) + public static CloudObjectLocation summaryToCloudObjectLocation(S3ObjectSummary object) { - return new S3Coords(object.getBucketName(), object.getKey()); + return new CloudObjectLocation(object.getBucketName(), object.getKey()); } public static String constructSegmentPath(String baseKey, String storageDir) @@ -290,6 +292,16 @@ public static String extractS3Key(URI uri) return uri.getPath().startsWith("/") ? uri.getPath().substring(1) : uri.getPath(); } + public static URI checkURI(URI uri) + { + if (uri.getScheme().equalsIgnoreCase(S3StorageDruidModule.SCHEME)) { + uri = URI.create("s3" + uri.toString().substring(S3StorageDruidModule.SCHEME.length())); + } else if (!"s3".equalsIgnoreCase(uri.getScheme())) { + throw new IAE("Don't know how to load scheme for URI [%s]", uri.toString()); + } + return uri; + } + // Copied from org.jets3t.service.model.StorageObject.isDirectoryPlaceholder() public static boolean isDirectoryPlaceholder(String key, ObjectMetadata objectMetadata) { diff --git a/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java b/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java index b733f1e9ded0..c400f749aab2 100644 --- a/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java +++ b/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java @@ -46,6 +46,7 @@ import org.apache.druid.data.input.InputRowSchema; import org.apache.druid.data.input.InputSourceReader; import org.apache.druid.data.input.InputSplit; +import org.apache.druid.data.input.impl.CloudObjectLocation; import org.apache.druid.data.input.impl.CsvInputFormat; import org.apache.druid.data.input.impl.DimensionsSpec; import org.apache.druid.data.input.impl.JsonInputFormat; @@ -56,7 +57,6 @@ import org.apache.druid.java.util.common.parsers.CloseableIterator; import org.apache.druid.java.util.common.parsers.JSONPathSpec; import org.apache.druid.storage.s3.NoopServerSideEncryption; -import org.apache.druid.storage.s3.S3Coords; import org.apache.druid.storage.s3.S3Utils; import org.apache.druid.storage.s3.ServerSideEncryptingAmazonS3; import org.apache.druid.testing.InitializedNullHandlingTest; @@ -86,8 +86,8 @@ public class S3InputSourceTest extends InitializedNullHandlingTest URI.create("s3://bar/foo/file2.csv") ); - private static final List EXPECTED_COORDS = - EXPECTED_URIS.stream().map(S3Coords::new).collect(Collectors.toList()); + private static final List EXPECTED_COORDS = + EXPECTED_URIS.stream().map(CloudObjectLocation::new).collect(Collectors.toList()); private static final List PREFIXES = Arrays.asList( URI.create("s3://foo/bar"), @@ -124,7 +124,7 @@ public void testWithUrisSplit() { S3InputSource inputSource = new S3InputSource(SERVICE, EXPECTED_URIS, null, null); - Stream> splits = inputSource.createSplits( + Stream> splits = inputSource.createSplits( new JsonInputFormat(JSONPathSpec.DEFAULT, null), null ); @@ -142,7 +142,7 @@ public void testWithPrefixesSplit() S3InputSource inputSource = new S3InputSource(SERVICE, null, PREFIXES, null); - Stream> splits = inputSource.createSplits( + Stream> splits = inputSource.createSplits( new JsonInputFormat(JSONPathSpec.DEFAULT, null), null ); @@ -165,7 +165,7 @@ public void testWithPrefixesWhereOneIsUrisAndNoListPermissionSplit() null ); - Stream> splits = inputSource.createSplits( + Stream> splits = inputSource.createSplits( new JsonInputFormat(JSONPathSpec.DEFAULT, null), null ); diff --git a/extensions-core/s3-extensions/src/test/java/org/apache/druid/storage/s3/S3DataSegmentPullerTest.java b/extensions-core/s3-extensions/src/test/java/org/apache/druid/storage/s3/S3DataSegmentPullerTest.java index 1db5970ac429..914785d1ce7d 100644 --- a/extensions-core/s3-extensions/src/test/java/org/apache/druid/storage/s3/S3DataSegmentPullerTest.java +++ b/extensions-core/s3-extensions/src/test/java/org/apache/druid/storage/s3/S3DataSegmentPullerTest.java @@ -24,6 +24,7 @@ import com.amazonaws.services.s3.model.ListObjectsV2Result; import com.amazonaws.services.s3.model.S3Object; import com.amazonaws.services.s3.model.S3ObjectSummary; +import org.apache.druid.data.input.impl.CloudObjectLocation; import org.apache.druid.java.util.common.FileUtils; import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.segment.loading.SegmentLoadingException; @@ -124,7 +125,7 @@ public void testGZUncompress() throws IOException, SegmentLoadingException EasyMock.replay(s3Client); FileUtils.FileCopyResult result = puller.getSegmentFiles( - new S3Coords( + new CloudObjectLocation( bucket, object0.getKey() ), tmpDir @@ -191,7 +192,7 @@ public void testGZUncompressRetries() throws IOException, SegmentLoadingExceptio EasyMock.replay(s3Client); FileUtils.FileCopyResult result = puller.getSegmentFiles( - new S3Coords( + new CloudObjectLocation( bucket, object0.getKey() ), tmpDir From 33cee3c26a98f95ebfaf4dd96adceed63adf840f Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Fri, 22 Nov 2019 03:19:31 -0800 Subject: [PATCH 18/28] serde test --- .../data/input/s3/S3InputSourceTest.java | 24 ++++++++++++++----- 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java b/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java index c400f749aab2..aef6ddc594c7 100644 --- a/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java +++ b/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java @@ -75,6 +75,7 @@ public class S3InputSourceTest extends InitializedNullHandlingTest { + private static final ObjectMapper MAPPER = createS3ObjectMapper(); private static final AmazonS3Client S3_CLIENT = EasyMock.createNiceMock(AmazonS3Client.class); private static final ServerSideEncryptingAmazonS3 SERVICE = new ServerSideEncryptingAmazonS3( S3_CLIENT, @@ -101,21 +102,32 @@ public class S3InputSourceTest extends InitializedNullHandlingTest @Test public void testSerdeWithUris() throws Exception { - final ObjectMapper mapper = createS3ObjectMapper(); - final S3InputSource withUris = new S3InputSource(SERVICE, EXPECTED_URIS, null, null); - final S3InputSource serdeWithUris = mapper.readValue(mapper.writeValueAsString(withUris), S3InputSource.class); + final S3InputSource serdeWithUris = MAPPER.readValue(MAPPER.writeValueAsString(withUris), S3InputSource.class); Assert.assertEquals(withUris, serdeWithUris); } @Test public void testSerdeWithPrefixes() throws Exception { - final ObjectMapper mapper = createS3ObjectMapper(); - final S3InputSource withPrefixes = new S3InputSource(SERVICE, null, PREFIXES, null); final S3InputSource serdeWithPrefixes = - mapper.readValue(mapper.writeValueAsString(withPrefixes), S3InputSource.class); + MAPPER.readValue(MAPPER.writeValueAsString(withPrefixes), S3InputSource.class); + Assert.assertEquals(withPrefixes, serdeWithPrefixes); + } + + @Test + public void testSerdeWithObjects() throws Exception + { + + final S3InputSource withPrefixes = new S3InputSource( + SERVICE, + null, + null, + ImmutableList.of(new CloudObjectLocation("foo", "bar/file.csv")) + ); + final S3InputSource serdeWithPrefixes = + MAPPER.readValue(MAPPER.writeValueAsString(withPrefixes), S3InputSource.class); Assert.assertEquals(withPrefixes, serdeWithPrefixes); } From fd3c7aefa792db9284613bd680bff74abd45c952 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Fri, 22 Nov 2019 13:00:58 -0800 Subject: [PATCH 19/28] refactor and such --- .../druid/data/input/RetryingInputEntity.java | 10 +++- .../data/input/impl/CloudObjectLocation.java | 6 +- .../druid/data/input/impl/HttpEntity.java | 12 ++-- .../druid/java/util/common/StringUtils.java | 5 ++ docs/development/extensions-core/s3.md | 32 +++++++++- .../StaticAzureBlobStoreFirehoseFactory.java | 5 +- .../google/GoogleCloudStorageEntity.java | 9 ++- .../StaticGoogleBlobStoreFirehoseFactory.java | 5 +- .../google/GoogleDataSegmentPuller.java | 11 +--- .../druid/storage/google/GoogleUtils.java | 6 -- .../inputsource/hdfs/HdfsInputEntity.java | 9 ++- .../apache/druid/data/input/s3/S3Entity.java | 6 ++ .../druid/data/input/s3/S3InputSource.java | 16 ++--- .../s3/S3TimestampVersionedDataFinder.java | 6 +- .../org/apache/druid/storage/s3/S3Utils.java | 18 +++--- .../data/input/s3/S3InputSourceTest.java | 60 +++++++++++++++++++ 16 files changed, 159 insertions(+), 57 deletions(-) diff --git a/core/src/main/java/org/apache/druid/data/input/RetryingInputEntity.java b/core/src/main/java/org/apache/druid/data/input/RetryingInputEntity.java index 6a47f0b55936..4f7b45b0fb5c 100644 --- a/core/src/main/java/org/apache/druid/data/input/RetryingInputEntity.java +++ b/core/src/main/java/org/apache/druid/data/input/RetryingInputEntity.java @@ -23,6 +23,7 @@ import org.apache.druid.data.input.impl.RetryingInputStream; import org.apache.druid.data.input.impl.prefetch.ObjectOpenFunction; import org.apache.druid.java.util.common.RetryUtils; +import org.apache.druid.utils.CompressionUtils; import java.io.IOException; import java.io.InputStream; @@ -32,12 +33,13 @@ public interface RetryingInputEntity extends InputEntity @Override default InputStream open() throws IOException { - return new RetryingInputStream<>( + RetryingInputStream retryingInputStream = new RetryingInputStream<>( this, new RetryingInputEntityOpenFunction(), getRetryCondition(), RetryUtils.DEFAULT_MAX_TRIES ); + return CompressionUtils.decompress(retryingInputStream, getDecompressionPath()); } /** @@ -56,6 +58,12 @@ default InputStream readFromStart() throws IOException */ InputStream readFrom(long offset) throws IOException; + + /** + * Get path to decompress a compressed stream for the entity + */ + String getDecompressionPath(); + @Override Predicate getRetryCondition(); diff --git a/core/src/main/java/org/apache/druid/data/input/impl/CloudObjectLocation.java b/core/src/main/java/org/apache/druid/data/input/impl/CloudObjectLocation.java index 42f44abaedb1..9078708aa9c6 100644 --- a/core/src/main/java/org/apache/druid/data/input/impl/CloudObjectLocation.java +++ b/core/src/main/java/org/apache/druid/data/input/impl/CloudObjectLocation.java @@ -22,6 +22,7 @@ import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; import com.google.common.base.Preconditions; +import org.apache.druid.java.util.common.StringUtils; import java.net.URI; import java.util.Objects; @@ -41,10 +42,7 @@ public CloudObjectLocation(@JsonProperty("bucket") String bucket, @JsonProperty( public CloudObjectLocation(URI uri) { bucket = uri.getHost(); - String path = uri.getPath(); - if (path.startsWith("/")) { - path = path.substring(1); - } + String path = StringUtils.maybeRemoveLeadingSlash(uri.getPath()); this.path = path; } diff --git a/core/src/main/java/org/apache/druid/data/input/impl/HttpEntity.java b/core/src/main/java/org/apache/druid/data/input/impl/HttpEntity.java index 22ce64afb8c4..d3d47c81b8d2 100644 --- a/core/src/main/java/org/apache/druid/data/input/impl/HttpEntity.java +++ b/core/src/main/java/org/apache/druid/data/input/impl/HttpEntity.java @@ -27,7 +27,6 @@ import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.java.util.common.logger.Logger; import org.apache.druid.metadata.PasswordProvider; -import org.apache.druid.utils.CompressionUtils; import javax.annotation.Nullable; import java.io.IOException; @@ -66,10 +65,13 @@ public URI getUri() @Override public InputStream readFrom(long offset) throws IOException { - return CompressionUtils.decompress( - openInputStream(uri, httpAuthenticationUsername, httpAuthenticationPasswordProvider, offset), - uri.toString() - ); + return openInputStream(uri, httpAuthenticationUsername, httpAuthenticationPasswordProvider, offset); + } + + @Override + public String getDecompressionPath() + { + return uri.getPath(); } @Override diff --git a/core/src/main/java/org/apache/druid/java/util/common/StringUtils.java b/core/src/main/java/org/apache/druid/java/util/common/StringUtils.java index d5e39e31e4d8..f5d1266be0d3 100644 --- a/core/src/main/java/org/apache/druid/java/util/common/StringUtils.java +++ b/core/src/main/java/org/apache/druid/java/util/common/StringUtils.java @@ -237,6 +237,11 @@ public static String urlDecode(String s) } } + public static String maybeRemoveLeadingSlash(String s) + { + return s.startsWith("/") ? s.substring(1) : s; + } + /** * Removes all occurrences of the given char from the given string. This method is an optimal version of * {@link String#replace(CharSequence, CharSequence) s.replace("c", "")}. diff --git a/docs/development/extensions-core/s3.md b/docs/development/extensions-core/s3.md index 5cee9de23c4f..fd82b37e8eb3 100644 --- a/docs/development/extensions-core/s3.md +++ b/docs/development/extensions-core/s3.md @@ -139,12 +139,40 @@ Sample spec: ... ``` + +```json +... + "ioConfig": { + "type": "index_parallel", + "inputSource": { + "type": "s3", + "objects": [ + { "bucket": "foo", "path": "bar/file1.json"}, + { "bucket": "bar", "path": "foo/file2.json"} + ] + }, + "inputFormat": { + "type": "json" + }, + ... + }, +... +``` + |property|description|default|required?| |--------|-----------|-------|---------| |type|This should be `s3`.|N/A|yes| -|uris|JSON array of URIs where s3 files to be ingested are located.|N/A|`uris` or `prefixes` must be set| -|prefixes|JSON array of URI prefixes for the locations of s3 files to be ingested.|N/A|`uris` or `prefixes` must be set| +|uris|JSON array of URIs where S3 objects to be ingested are located.|N/A|`uris` or `prefixes` or `objects` must be set| +|prefixes|JSON array of URI prefixes for the locations of S3 objects to be ingested.|N/A|`uris` or `prefixes` or `objects` must be set| +|objects|JSON array of S3 Objects to be ingested.|N/A|`uris` or `prefixes` or `objects` must be set| + + +S3 Object: +|property|description|default|required?| +|--------|-----------|-------|---------| +|bucket|Name of the S3 bucket|N/A|yes| +|path|The path where data is located.|N/A|yes| diff --git a/extensions-contrib/azure-extensions/src/main/java/org/apache/druid/firehose/azure/StaticAzureBlobStoreFirehoseFactory.java b/extensions-contrib/azure-extensions/src/main/java/org/apache/druid/firehose/azure/StaticAzureBlobStoreFirehoseFactory.java index d27970adea8e..c3919e9a0fd5 100644 --- a/extensions-contrib/azure-extensions/src/main/java/org/apache/druid/firehose/azure/StaticAzureBlobStoreFirehoseFactory.java +++ b/extensions-contrib/azure-extensions/src/main/java/org/apache/druid/firehose/azure/StaticAzureBlobStoreFirehoseFactory.java @@ -28,6 +28,7 @@ import org.apache.druid.data.input.InputSplit; import org.apache.druid.data.input.impl.StringInputRowParser; import org.apache.druid.data.input.impl.prefetch.PrefetchableTextFilesFirehoseFactory; +import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.storage.azure.AzureByteSource; import org.apache.druid.storage.azure.AzureStorage; import org.apache.druid.storage.azure.AzureUtils; @@ -101,9 +102,7 @@ protected InputStream wrapObjectStream(AzureBlob object, InputStream stream) thr private static AzureByteSource makeByteSource(AzureStorage azureStorage, AzureBlob object) { final String container = object.getContainer(); - final String path = object.getPath().startsWith("/") - ? object.getPath().substring(1) - : object.getPath(); + final String path = StringUtils.maybeRemoveLeadingSlash(object.getPath()); return new AzureByteSource(azureStorage, container, path); } diff --git a/extensions-core/google-extensions/src/main/java/org/apache/druid/data/input/google/GoogleCloudStorageEntity.java b/extensions-core/google-extensions/src/main/java/org/apache/druid/data/input/google/GoogleCloudStorageEntity.java index 6b96b89f0e62..a9230f8380cd 100644 --- a/extensions-core/google-extensions/src/main/java/org/apache/druid/data/input/google/GoogleCloudStorageEntity.java +++ b/extensions-core/google-extensions/src/main/java/org/apache/druid/data/input/google/GoogleCloudStorageEntity.java @@ -24,7 +24,6 @@ import org.apache.druid.storage.google.GoogleByteSource; import org.apache.druid.storage.google.GoogleStorage; import org.apache.druid.storage.google.GoogleUtils; -import org.apache.druid.utils.CompressionUtils; import javax.annotation.Nullable; import java.io.IOException; @@ -56,7 +55,13 @@ public InputStream readFrom(long offset) throws IOException final String bucket = uri.getAuthority(); final String key = GoogleUtils.extractGoogleCloudStorageObjectKey(uri); final GoogleByteSource byteSource = new GoogleByteSource(storage, bucket, key); - return CompressionUtils.decompress(byteSource.openStream(offset), key); + return byteSource.openStream(offset); + } + + @Override + public String getDecompressionPath() + { + return GoogleUtils.extractGoogleCloudStorageObjectKey(uri); } @Override diff --git a/extensions-core/google-extensions/src/main/java/org/apache/druid/firehose/google/StaticGoogleBlobStoreFirehoseFactory.java b/extensions-core/google-extensions/src/main/java/org/apache/druid/firehose/google/StaticGoogleBlobStoreFirehoseFactory.java index 82116e884c14..22437d780e13 100644 --- a/extensions-core/google-extensions/src/main/java/org/apache/druid/firehose/google/StaticGoogleBlobStoreFirehoseFactory.java +++ b/extensions-core/google-extensions/src/main/java/org/apache/druid/firehose/google/StaticGoogleBlobStoreFirehoseFactory.java @@ -27,6 +27,7 @@ import org.apache.druid.data.input.InputSplit; import org.apache.druid.data.input.impl.StringInputRowParser; import org.apache.druid.data.input.impl.prefetch.PrefetchableTextFilesFirehoseFactory; +import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.storage.google.GoogleByteSource; import org.apache.druid.storage.google.GoogleStorage; import org.apache.druid.storage.google.GoogleUtils; @@ -87,9 +88,7 @@ protected InputStream openObjectStream(GoogleBlob object, long start) throws IOE private GoogleByteSource createGoogleByteSource(GoogleBlob object) { final String bucket = object.getBucket(); - final String path = object.getPath().startsWith("/") - ? object.getPath().substring(1) - : object.getPath(); + final String path = StringUtils.maybeRemoveLeadingSlash(object.getPath()); return new GoogleByteSource(storage, bucket, path); } diff --git a/extensions-core/google-extensions/src/main/java/org/apache/druid/storage/google/GoogleDataSegmentPuller.java b/extensions-core/google-extensions/src/main/java/org/apache/druid/storage/google/GoogleDataSegmentPuller.java index 0de3a2be732b..7f2d469402cc 100644 --- a/extensions-core/google-extensions/src/main/java/org/apache/druid/storage/google/GoogleDataSegmentPuller.java +++ b/extensions-core/google-extensions/src/main/java/org/apache/druid/storage/google/GoogleDataSegmentPuller.java @@ -22,6 +22,7 @@ import com.google.common.base.Predicate; import com.google.inject.Inject; import org.apache.druid.java.util.common.FileUtils; +import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.java.util.common.logger.Logger; import org.apache.druid.segment.loading.SegmentLoadingException; import org.apache.druid.segment.loading.URIDataPuller; @@ -81,20 +82,14 @@ FileUtils.FileCopyResult getSegmentFiles(final String bucket, final String path, @Override public InputStream getInputStream(URI uri) throws IOException { - String path = uri.getPath(); - if (path.startsWith("/")) { - path = path.substring(1); - } + String path = StringUtils.maybeRemoveLeadingSlash(uri.getPath()); return storage.get(uri.getHost(), path); } @Override public String getVersion(URI uri) throws IOException { - String path = uri.getPath(); - if (path.startsWith("/")) { - path = path.substring(1); - } + String path = StringUtils.maybeRemoveLeadingSlash(uri.getPath()); return storage.version(uri.getHost(), path); } diff --git a/extensions-core/google-extensions/src/main/java/org/apache/druid/storage/google/GoogleUtils.java b/extensions-core/google-extensions/src/main/java/org/apache/druid/storage/google/GoogleUtils.java index 1dd12ef2388e..4acd9e050698 100644 --- a/extensions-core/google-extensions/src/main/java/org/apache/druid/storage/google/GoogleUtils.java +++ b/extensions-core/google-extensions/src/main/java/org/apache/druid/storage/google/GoogleUtils.java @@ -23,7 +23,6 @@ import com.google.common.base.Predicate; import java.io.IOException; -import java.net.URI; public class GoogleUtils { @@ -36,10 +35,5 @@ public static boolean isRetryable(Throwable t) return t instanceof IOException; } - public static String extractGoogleCloudStorageObjectKey(URI uri) - { - return uri.getPath().startsWith("/") ? uri.getPath().substring(1) : uri.getPath(); - } - public static final Predicate GOOGLE_RETRY = e -> isRetryable(e); } diff --git a/extensions-core/hdfs-storage/src/main/java/org/apache/druid/inputsource/hdfs/HdfsInputEntity.java b/extensions-core/hdfs-storage/src/main/java/org/apache/druid/inputsource/hdfs/HdfsInputEntity.java index 9a3786ed3dcf..1a0f93137e69 100644 --- a/extensions-core/hdfs-storage/src/main/java/org/apache/druid/inputsource/hdfs/HdfsInputEntity.java +++ b/extensions-core/hdfs-storage/src/main/java/org/apache/druid/inputsource/hdfs/HdfsInputEntity.java @@ -22,7 +22,6 @@ import com.google.common.base.Predicate; import org.apache.druid.data.input.RetryingInputEntity; import org.apache.druid.storage.hdfs.HdfsDataSegmentPuller; -import org.apache.druid.utils.CompressionUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; @@ -55,7 +54,13 @@ public InputStream readFrom(long offset) throws IOException final FileSystem fs = path.getFileSystem(conf); final FSDataInputStream inputStream = fs.open(path); inputStream.seek(offset); - return CompressionUtils.decompress(inputStream, path.getName()); + return inputStream; + } + + @Override + public String getDecompressionPath() + { + return path.getName(); } @Override diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3Entity.java b/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3Entity.java index aedb631280b1..7ba75796fdce 100644 --- a/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3Entity.java +++ b/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3Entity.java @@ -72,6 +72,12 @@ public InputStream readFrom(long offset) throws IOException } } + @Override + public String getDecompressionPath() + { + return object.getPath(); + } + @Override public Predicate getRetryCondition() { diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSource.java b/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSource.java index d6f9b223c997..cfd1333b38ca 100644 --- a/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSource.java +++ b/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSource.java @@ -78,12 +78,8 @@ public S3InputSource( } } else { this.objects = null; - if (!this.uris.isEmpty() && !this.prefixes.isEmpty()) { - throw new IAE("uris and prefixes cannot be used together"); - } - - if (this.uris.isEmpty() && this.prefixes.isEmpty()) { - throw new IAE("uris or prefixes must be specified"); + if (this.uris.isEmpty() == this.prefixes.isEmpty()) { + throw new IAE("exactly one of either uris or prefixes must be specified"); } for (final URI inputURI : this.uris) { @@ -104,20 +100,20 @@ private S3InputSource(ServerSideEncryptingAmazonS3 s3Client, CloudObjectLocation this.prefixes = new ArrayList<>(); } - @JsonProperty("uris") + @JsonProperty public List getUris() { return uris; } - @JsonProperty("prefixes") + @JsonProperty public List getPrefixes() { return prefixes; } - @JsonProperty("objects") - public List getObject() + @JsonProperty + public List getObjects() { return objects; } diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3TimestampVersionedDataFinder.java b/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3TimestampVersionedDataFinder.java index e4ff32b76045..61bf7b14a4fd 100644 --- a/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3TimestampVersionedDataFinder.java +++ b/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3TimestampVersionedDataFinder.java @@ -67,10 +67,8 @@ public URI getLatestVersion(final URI uri, final @Nullable Pattern pattern) ); while (objectSummaryIterator.hasNext()) { final S3ObjectSummary objectSummary = objectSummaryIterator.next(); - String keyString = objectSummary.getKey().substring(coords.getPath().length()); - if (keyString.startsWith("/")) { - keyString = keyString.substring(1); - } + String keyString = + StringUtils.maybeRemoveLeadingSlash(objectSummary.getKey().substring(coords.getPath().length())); if (pattern != null && !pattern.matcher(keyString).matches()) { continue; } diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3Utils.java b/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3Utils.java index 8c222b7e97cd..613c51715bcc 100644 --- a/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3Utils.java +++ b/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3Utils.java @@ -57,8 +57,6 @@ public class S3Utils private static final String MIMETYPE_JETS3T_DIRECTORY = "application/x-directory"; private static final Logger log = new Logger(S3Utils.class); - public static final int MAX_S3_RETRIES = 10; - static boolean isServiceExceptionRecoverable(AmazonServiceException ex) { final boolean isIOException = ex.getCause() instanceof IOException; @@ -88,7 +86,7 @@ public boolean apply(Throwable e) * Retries S3 operations that fail due to io-related exceptions. Service-level exceptions (access denied, file not * found, etc) are not retried. */ - public static T retryS3Operation(Task f) throws Exception + static T retryS3Operation(Task f) throws Exception { return RetryUtils.retry(f, S3RETRY, RetryUtils.DEFAULT_MAX_TRIES); } @@ -262,7 +260,7 @@ public static URI summaryToUri(S3ObjectSummary object) final String authority = originalAuthority.endsWith("/") ? originalAuthority.substring(0, originalAuthority.length() - 1) : originalAuthority; - final String path = originalPath.startsWith("/") ? originalPath.substring(1) : originalPath; + final String path = StringUtils.maybeRemoveLeadingSlash(originalPath); return URI.create(StringUtils.format("s3://%s/%s", authority, path)); } @@ -272,7 +270,7 @@ public static CloudObjectLocation summaryToCloudObjectLocation(S3ObjectSummary o return new CloudObjectLocation(object.getBucketName(), object.getKey()); } - public static String constructSegmentPath(String baseKey, String storageDir) + static String constructSegmentPath(String baseKey, String storageDir) { return JOINER.join( baseKey.isEmpty() ? null : baseKey, @@ -289,7 +287,7 @@ static AccessControlList grantFullControlToBucketOwner(ServerSideEncryptingAmazo public static String extractS3Key(URI uri) { - return uri.getPath().startsWith("/") ? uri.getPath().substring(1) : uri.getPath(); + return StringUtils.maybeRemoveLeadingSlash(uri.getPath()); } public static URI checkURI(URI uri) @@ -369,7 +367,13 @@ public static S3ObjectSummary getSingleObjectSummary(ServerSideEncryptingAmazonS * @param key The key under which to store the new object. * @param file The path of the file to upload to Amazon S3. */ - public static void uploadFileIfPossible(ServerSideEncryptingAmazonS3 service, boolean disableAcl, String bucket, String key, File file) + static void uploadFileIfPossible( + ServerSideEncryptingAmazonS3 service, + boolean disableAcl, + String bucket, + String key, + File file + ) { final PutObjectRequest putObjectRequest = new PutObjectRequest(bucket, key, file); diff --git a/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java b/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java index aef6ddc594c7..84d6a9940321 100644 --- a/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java +++ b/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java @@ -60,12 +60,14 @@ import org.apache.druid.storage.s3.S3Utils; import org.apache.druid.storage.s3.ServerSideEncryptingAmazonS3; import org.apache.druid.testing.InitializedNullHandlingTest; +import org.apache.druid.utils.CompressionUtils; import org.easymock.EasyMock; import org.joda.time.DateTime; import org.junit.Assert; import org.junit.Test; import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; import java.io.IOException; import java.net.URI; import java.util.Arrays; @@ -87,6 +89,11 @@ public class S3InputSourceTest extends InitializedNullHandlingTest URI.create("s3://bar/foo/file2.csv") ); + private static final List EXPECTED_COMPRESSED_URIS = Arrays.asList( + URI.create("s3://foo/bar/file.csv.gz"), + URI.create("s3://bar/foo/file2.csv.gz") + ); + private static final List EXPECTED_COORDS = EXPECTED_URIS.stream().map(CloudObjectLocation::new).collect(Collectors.toList()); @@ -224,6 +231,45 @@ public void testReader() throws IOException } } + @Test + public void testCompressedReader() throws IOException + { + EasyMock.reset(S3_CLIENT); + addExpectedPrefixObjects(PREFIXES.get(0), ImmutableList.of(EXPECTED_COMPRESSED_URIS.get(0))); + addExpectedNonPrefixObjectsWithNoListPermission(); + addExpectedGetCompressedObjectMock(EXPECTED_COMPRESSED_URIS.get(0)); + addExpectedGetCompressedObjectMock(EXPECTED_COMPRESSED_URIS.get(1)); + EasyMock.replay(S3_CLIENT); + + S3InputSource inputSource = new S3InputSource( + SERVICE, + null, + ImmutableList.of(PREFIXES.get(0), EXPECTED_COMPRESSED_URIS.get(1)), + null + ); + + InputRowSchema someSchema = new InputRowSchema( + new TimestampSpec("time", "auto", null), + new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("dim1", "dim2"))), + ImmutableList.of("count") + ); + + InputSourceReader reader = inputSource.reader( + someSchema, + new CsvInputFormat(ImmutableList.of("time", "dim1", "dim2"), "|", false, 0), + null + ); + + CloseableIterator iterator = reader.read(); + + while (iterator.hasNext()) { + InputRow nextRow = iterator.next(); + Assert.assertEquals(NOW, nextRow.getTimestamp()); + Assert.assertEquals("hello", nextRow.getDimension("dim1").get(0)); + Assert.assertEquals("world", nextRow.getDimension("dim2").get(0)); + } + } + private static void addExpectedPrefixObjects(URI prefix, List uris) { final String s3Bucket = prefix.getAuthority(); @@ -267,6 +313,20 @@ private static void addExpectedGetObjectMock(URI uri) EasyMock.expect(S3_CLIENT.getObject(EasyMock.anyObject(GetObjectRequest.class))).andReturn(someObject).once(); } + private static void addExpectedGetCompressedObjectMock(URI uri) throws IOException + { + final String s3Bucket = uri.getAuthority(); + final String key = S3Utils.extractS3Key(uri); + + S3Object someObject = new S3Object(); + someObject.setBucketName(s3Bucket); + someObject.setKey(key); + ByteArrayOutputStream gzipped = new ByteArrayOutputStream(); + CompressionUtils.gzip(new ByteArrayInputStream(CONTENT), gzipped); + someObject.setObjectContent(new ByteArrayInputStream(gzipped.toByteArray())); + EasyMock.expect(S3_CLIENT.getObject(EasyMock.anyObject(GetObjectRequest.class))).andReturn(someObject).once(); + } + public static ObjectMapper createS3ObjectMapper() { DruidModule baseModule = new TestS3Module(); From 3f6d54848ccedc847ccecc21fca6c4ade587c8e3 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Fri, 22 Nov 2019 13:28:24 -0800 Subject: [PATCH 20/28] now with the ability to compile --- .../druid/data/input/google/GoogleCloudStorageEntity.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/extensions-core/google-extensions/src/main/java/org/apache/druid/data/input/google/GoogleCloudStorageEntity.java b/extensions-core/google-extensions/src/main/java/org/apache/druid/data/input/google/GoogleCloudStorageEntity.java index a9230f8380cd..968de50cf3bb 100644 --- a/extensions-core/google-extensions/src/main/java/org/apache/druid/data/input/google/GoogleCloudStorageEntity.java +++ b/extensions-core/google-extensions/src/main/java/org/apache/druid/data/input/google/GoogleCloudStorageEntity.java @@ -21,6 +21,7 @@ import com.google.common.base.Predicate; import org.apache.druid.data.input.RetryingInputEntity; +import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.storage.google.GoogleByteSource; import org.apache.druid.storage.google.GoogleStorage; import org.apache.druid.storage.google.GoogleUtils; @@ -53,7 +54,7 @@ public InputStream readFrom(long offset) throws IOException { // Get data of the given object and open an input stream final String bucket = uri.getAuthority(); - final String key = GoogleUtils.extractGoogleCloudStorageObjectKey(uri); + final String key = StringUtils.maybeRemoveLeadingSlash(uri.getPath()); final GoogleByteSource byteSource = new GoogleByteSource(storage, bucket, key); return byteSource.openStream(offset); } @@ -61,7 +62,7 @@ public InputStream readFrom(long offset) throws IOException @Override public String getDecompressionPath() { - return GoogleUtils.extractGoogleCloudStorageObjectKey(uri); + return StringUtils.maybeRemoveLeadingSlash(uri.getPath()); } @Override From a0677b8626b562c99e6121d3e91ee44f6d2efa00 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Fri, 22 Nov 2019 15:31:40 -0800 Subject: [PATCH 21/28] fix signature and javadocs --- .../java/org/apache/druid/data/input/InputSource.java | 6 +----- .../org/apache/druid/data/input/RetryingInputEntity.java | 6 ++++-- .../apache/druid/data/input/s3/S3InputSourceTest.java | 9 +++++++-- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/core/src/main/java/org/apache/druid/data/input/InputSource.java b/core/src/main/java/org/apache/druid/data/input/InputSource.java index 09e95116a0a5..8932c857928e 100644 --- a/core/src/main/java/org/apache/druid/data/input/InputSource.java +++ b/core/src/main/java/org/apache/druid/data/input/InputSource.java @@ -75,9 +75,5 @@ public interface InputSource * @param inputFormat to parse data. It can be null if {@link #needsFormat()} = true * @param temporaryDirectory to store temp data. It will be cleaned up automatically once the task is finished. */ - InputSourceReader reader( - InputRowSchema inputRowSchema, - @Nullable InputFormat inputFormat, - @Nullable File temporaryDirectory - ); + InputSourceReader reader(InputRowSchema inputRowSchema, @Nullable InputFormat inputFormat, File temporaryDirectory); } diff --git a/core/src/main/java/org/apache/druid/data/input/RetryingInputEntity.java b/core/src/main/java/org/apache/druid/data/input/RetryingInputEntity.java index 4f7b45b0fb5c..0cec7e20a59e 100644 --- a/core/src/main/java/org/apache/druid/data/input/RetryingInputEntity.java +++ b/core/src/main/java/org/apache/druid/data/input/RetryingInputEntity.java @@ -43,7 +43,8 @@ default InputStream open() throws IOException } /** - * Directly opens an {@link InputStream} on the input entity. + * Directly opens an {@link InputStream} on the input entity. Decompression should be handled externally, this should + * return the raw stream for the object. */ default InputStream readFromStart() throws IOException { @@ -51,7 +52,8 @@ default InputStream readFromStart() throws IOException } /** - * Directly opens an {@link InputStream} starting at the given offset on the input entity. + * Directly opens an {@link InputStream} starting at the given offset on the input entity. Decompression should be + * handled externally, this should return the raw stream for the object. * * @param offset an offset to start reading from. A non-negative integer counting * the number of bytes from the beginning of the entity diff --git a/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java b/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java index 84d6a9940321..29a040b886e0 100644 --- a/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java +++ b/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java @@ -64,7 +64,9 @@ import org.easymock.EasyMock; import org.joda.time.DateTime; import org.junit.Assert; +import org.junit.Rule; import org.junit.Test; +import org.junit.rules.TemporaryFolder; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; @@ -77,6 +79,9 @@ public class S3InputSourceTest extends InitializedNullHandlingTest { + @Rule + public TemporaryFolder temporaryFolder = new TemporaryFolder(); + private static final ObjectMapper MAPPER = createS3ObjectMapper(); private static final AmazonS3Client S3_CLIENT = EasyMock.createNiceMock(AmazonS3Client.class); private static final ServerSideEncryptingAmazonS3 SERVICE = new ServerSideEncryptingAmazonS3( @@ -218,7 +223,7 @@ public void testReader() throws IOException InputSourceReader reader = inputSource.reader( someSchema, new CsvInputFormat(ImmutableList.of("time", "dim1", "dim2"), "|", false, 0), - null + temporaryFolder.newFolder() ); CloseableIterator iterator = reader.read(); @@ -257,7 +262,7 @@ public void testCompressedReader() throws IOException InputSourceReader reader = inputSource.reader( someSchema, new CsvInputFormat(ImmutableList.of("time", "dim1", "dim2"), "|", false, 0), - null + temporaryFolder.newFolder() ); CloseableIterator iterator = reader.read(); From 50746a6f41ff2717f8e138b6047f82a4bb383d20 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Sat, 23 Nov 2019 22:59:46 -0800 Subject: [PATCH 22/28] fix conflicts yet again, fix S3 uri stuffs --- .../data/input/impl/CloudObjectLocation.java | 38 ++++++-- .../druid/java/util/common/StringUtils.java | 7 +- .../input/impl/CloudObjectLocationTest.java | 87 +++++++++++++++++++ .../druid/data/input/s3/S3InputSource.java | 63 ++++++-------- .../input/s3/S3InputSourceDruidModule.java | 3 +- .../druid/storage/s3/S3DataSegmentPuller.java | 2 +- .../apache/druid/storage/s3/S3LoadSpec.java | 2 +- .../storage/s3/S3StorageDruidModule.java | 18 ++-- .../s3/S3TimestampVersionedDataFinder.java | 11 +-- .../org/apache/druid/storage/s3/S3Utils.java | 19 ++-- .../data/input/s3/S3InputSourceTest.java | 4 +- .../storage/s3/S3DataSegmentArchiverTest.java | 6 +- 12 files changed, 184 insertions(+), 76 deletions(-) create mode 100644 core/src/test/java/org/apache/druid/data/input/impl/CloudObjectLocationTest.java diff --git a/core/src/main/java/org/apache/druid/data/input/impl/CloudObjectLocation.java b/core/src/main/java/org/apache/druid/data/input/impl/CloudObjectLocation.java index 9078708aa9c6..2bc7eedb3790 100644 --- a/core/src/main/java/org/apache/druid/data/input/impl/CloudObjectLocation.java +++ b/core/src/main/java/org/apache/druid/data/input/impl/CloudObjectLocation.java @@ -27,6 +27,13 @@ import java.net.URI; import java.util.Objects; +/** + * Common type for 'bucket' and 'path' concept of cloud objects to allow code sharing between cloud specific + * implementations. {@link #bucket} and {@link #path} should NOT be URL encoded. + * + * The intention is that this is used as a common representation for storage objects as an alternative to dealing in + * {@link URI} directly, but still provide a mechansim to round-trip with a URI. + */ public class CloudObjectLocation { private final String bucket; @@ -35,15 +42,36 @@ public class CloudObjectLocation @JsonCreator public CloudObjectLocation(@JsonProperty("bucket") String bucket, @JsonProperty("path") String path) { - this.bucket = Preconditions.checkNotNull(bucket); - this.path = Preconditions.checkNotNull(path); + this.bucket = Preconditions.checkNotNull(StringUtils.maybeRemoveTrailingSlash(bucket)); + this.path = Preconditions.checkNotNull(StringUtils.maybeRemoveLeadingSlash(path)); } public CloudObjectLocation(URI uri) { - bucket = uri.getHost(); - String path = StringUtils.maybeRemoveLeadingSlash(uri.getPath()); - this.path = path; + this(uri.getHost(), uri.getPath()); + } + + /** + * Given a scheme, encode {@link #bucket} and {@link #path} into a {@link URI}. + * + * In all clouds bucket names must be dns compliant, so it does not require encoding + * https://docs.aws.amazon.com/AmazonS3/latest/dev/BucketRestrictions.html + * https://docs.microsoft.com/en-us/rest/api/storageservices/naming-and-referencing-containers--blobs--and-metadata + * https://cloud.google.com/storage/docs/naming + * + * There is no such restriction on object names, so they will be URL encoded when constructing the URI + */ + public URI toUri(String scheme) + { + // Encode path, except leave '/' characters unencoded + return URI.create( + StringUtils.format( + "%s://%s/%s", + scheme, + bucket, + StringUtils.replace(StringUtils.urlEncode(path), "%2F", "/") + ) + ); } @JsonProperty diff --git a/core/src/main/java/org/apache/druid/java/util/common/StringUtils.java b/core/src/main/java/org/apache/druid/java/util/common/StringUtils.java index f5d1266be0d3..7485802ff36a 100644 --- a/core/src/main/java/org/apache/druid/java/util/common/StringUtils.java +++ b/core/src/main/java/org/apache/druid/java/util/common/StringUtils.java @@ -239,7 +239,12 @@ public static String urlDecode(String s) public static String maybeRemoveLeadingSlash(String s) { - return s.startsWith("/") ? s.substring(1) : s; + return s != null && s.startsWith("/") ? s.substring(1) : s; + } + + public static String maybeRemoveTrailingSlash(String s) + { + return s != null && s.endsWith("/") ? s.substring(0, s.length() - 1) : s; } /** diff --git a/core/src/test/java/org/apache/druid/data/input/impl/CloudObjectLocationTest.java b/core/src/test/java/org/apache/druid/data/input/impl/CloudObjectLocationTest.java new file mode 100644 index 000000000000..ec41df9c27ed --- /dev/null +++ b/core/src/test/java/org/apache/druid/data/input/impl/CloudObjectLocationTest.java @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.data.input.impl; + +import com.fasterxml.jackson.databind.ObjectMapper; +import org.junit.Assert; +import org.junit.Test; + +import java.net.URI; + +public class CloudObjectLocationTest +{ + private static final ObjectMapper MAPPER = new ObjectMapper(); + private static final String SCHEME = "s3"; + + private static final CloudObjectLocation LOCATION = + new CloudObjectLocation("someBucket", "path/to/myobject"); + + private static final CloudObjectLocation LOCATION_URLENCODE = + new CloudObjectLocation("someBucket", "path/to/myobject?question"); + + private static final CloudObjectLocation LOCATION_NON_ASCII = + new CloudObjectLocation("someBucket", "pÄth/tø/myøbject"); + + @Test + public void testSerde() throws Exception + { + Assert.assertEquals( + LOCATION, + MAPPER.readValue(MAPPER.writeValueAsString(LOCATION), CloudObjectLocation.class) + ); + + Assert.assertEquals( + LOCATION_URLENCODE, + MAPPER.readValue(MAPPER.writeValueAsString(LOCATION_URLENCODE), CloudObjectLocation.class) + ); + + Assert.assertEquals( + LOCATION_NON_ASCII, + MAPPER.readValue(MAPPER.writeValueAsString(LOCATION_NON_ASCII), CloudObjectLocation.class) + ); + } + + @Test + public void testToUri() + { + Assert.assertEquals( + URI.create("s3://someBucket/path/to/myobject"), + LOCATION.toUri(SCHEME) + ); + + Assert.assertEquals( + URI.create("s3://someBucket/path/to/myobject%3Fquestion"), + LOCATION_URLENCODE.toUri(SCHEME) + ); + + Assert.assertEquals( + URI.create("s3://someBucket/p%C3%84th/t%C3%B8/my%C3%B8bject"), + LOCATION_NON_ASCII.toUri(SCHEME) + ); + } + + @Test + public void testUriRoundTrip() + { + Assert.assertEquals(LOCATION, new CloudObjectLocation(LOCATION.toUri(SCHEME))); + Assert.assertEquals(LOCATION_URLENCODE, new CloudObjectLocation(LOCATION_URLENCODE.toUri(SCHEME))); + Assert.assertEquals(LOCATION_NON_ASCII, new CloudObjectLocation(LOCATION_NON_ASCII.toUri(SCHEME))); + } +} diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSource.java b/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSource.java index cfd1333b38ca..3ae7dc762506 100644 --- a/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSource.java +++ b/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSource.java @@ -34,14 +34,12 @@ import org.apache.druid.data.input.impl.CloudObjectLocation; import org.apache.druid.data.input.impl.InputEntityIteratingReader; import org.apache.druid.data.input.impl.SplittableInputSource; -import org.apache.druid.java.util.common.IAE; import org.apache.druid.storage.s3.S3Utils; import org.apache.druid.storage.s3.ServerSideEncryptingAmazonS3; import javax.annotation.Nullable; import java.io.File; import java.net.URI; -import java.util.ArrayList; import java.util.List; import java.util.Objects; import java.util.stream.Stream; @@ -65,41 +63,22 @@ public S3InputSource( ) { this.s3Client = Preconditions.checkNotNull(s3Client, "s3Client"); - this.uris = uris == null ? new ArrayList<>() : uris; - this.prefixes = prefixes == null ? new ArrayList<>() : prefixes; + this.uris = uris; + this.prefixes = prefixes; + this.objects = objects; if (objects != null) { - this.objects = objects; - if (!this.uris.isEmpty()) { - throw new IAE("uris cannot be used with object"); - } - if (!this.prefixes.isEmpty()) { - throw new IAE("prefixes cannot be used with object"); - } + throwIfIllegalArgs(uris != null || prefixes != null); + } else if (uris != null) { + throwIfIllegalArgs(prefixes != null); + uris.forEach(S3Utils::checkURI); + } else if (prefixes != null) { + prefixes.forEach(S3Utils::checkURI); } else { - this.objects = null; - if (this.uris.isEmpty() == this.prefixes.isEmpty()) { - throw new IAE("exactly one of either uris or prefixes must be specified"); - } - - for (final URI inputURI : this.uris) { - Preconditions.checkArgument("s3".equals(inputURI.getScheme()), "input uri scheme == s3 (%s)", inputURI); - } - - for (final URI inputURI : this.prefixes) { - Preconditions.checkArgument("s3".equals(inputURI.getScheme()), "input uri scheme == s3 (%s)", inputURI); - } + throwIfIllegalArgs(true); } } - private S3InputSource(ServerSideEncryptingAmazonS3 s3Client, CloudObjectLocation inputSplit) - { - this.s3Client = Preconditions.checkNotNull(s3Client, "s3Client"); - this.objects = ImmutableList.of(Preconditions.checkNotNull(inputSplit, "object")); - this.uris = new ArrayList<>(); - this.prefixes = new ArrayList<>(); - } - @JsonProperty public List getUris() { @@ -119,14 +98,17 @@ public List getObjects() } @Override - public Stream> createSplits(InputFormat inputFormat, @Nullable SplitHintSpec splitHintSpec) + public Stream> createSplits( + InputFormat inputFormat, + @Nullable SplitHintSpec splitHintSpec + ) { if (objects != null) { return objects.stream().map(InputSplit::new); } - if (!uris.isEmpty()) { - return uris.stream().map(S3Utils::checkURI).map(CloudObjectLocation::new).map(InputSplit::new); + if (uris != null) { + return uris.stream().map(CloudObjectLocation::new).map(InputSplit::new); } return StreamSupport.stream(getIterableObjectsFromPrefixes().spliterator(), false) @@ -135,13 +117,13 @@ public Stream> createSplits(InputFormat inputFor } @Override - public int getNumSplits(InputFormat inputFormat, @Nullable SplitHintSpec splitHintSpec) + public int estimateNumSplits(InputFormat inputFormat, @Nullable SplitHintSpec splitHintSpec) { if (objects != null) { return objects.size(); } - if (!uris.isEmpty()) { + if (uris != null) { return uris.size(); } @@ -151,7 +133,7 @@ public int getNumSplits(InputFormat inputFormat, @Nullable SplitHintSpec splitHi @Override public SplittableInputSource withSplit(InputSplit split) { - return new S3InputSource(s3Client, split.get()); + return new S3InputSource(s3Client, null, null, ImmutableList.of(split.get())); } @Override @@ -213,4 +195,11 @@ private Iterable getIterableObjectsFromPrefixes() { return () -> S3Utils.lazyFetchingObjectSummariesIterator(s3Client, prefixes.iterator(), MAX_LISTING_LENGTH); } + + private void throwIfIllegalArgs(boolean clause) throws IllegalArgumentException + { + if (clause) { + throw new IllegalArgumentException("exactly one of either uris or prefixes or objects must be specified"); + } + } } diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSourceDruidModule.java b/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSourceDruidModule.java index c445f7f29a80..241c7c164097 100644 --- a/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSourceDruidModule.java +++ b/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSourceDruidModule.java @@ -25,6 +25,7 @@ import com.google.common.collect.ImmutableList; import com.google.inject.Binder; import org.apache.druid.initialization.DruidModule; +import org.apache.druid.storage.s3.S3StorageDruidModule; import java.util.List; @@ -37,7 +38,7 @@ public class S3InputSourceDruidModule implements DruidModule public List getJacksonModules() { return ImmutableList.of( - new SimpleModule().registerSubtypes(new NamedType(S3InputSource.class, "s3")) + new SimpleModule().registerSubtypes(new NamedType(S3InputSource.class, S3StorageDruidModule.SCHEME)) ); } diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3DataSegmentPuller.java b/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3DataSegmentPuller.java index 8b4294385aa5..9c413e7ec4d0 100644 --- a/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3DataSegmentPuller.java +++ b/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3DataSegmentPuller.java @@ -81,7 +81,7 @@ FileUtils.FileCopyResult getSegmentFiles(final CloudObjectLocation s3Coords, fin try { org.apache.commons.io.FileUtils.forceMkdir(outDir); - final URI uri = URI.create(StringUtils.format("s3://%s/%s", s3Coords.getBucket(), s3Coords.getPath())); + final URI uri = s3Coords.toUri(S3StorageDruidModule.SCHEME); final ByteSource byteSource = new ByteSource() { @Override diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3LoadSpec.java b/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3LoadSpec.java index 804e0c64d72d..5338abfe4140 100644 --- a/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3LoadSpec.java +++ b/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3LoadSpec.java @@ -33,7 +33,7 @@ /** * */ -@JsonTypeName(S3StorageDruidModule.SCHEME) +@JsonTypeName(S3StorageDruidModule.SCHEME_S3_ZIP) public class S3LoadSpec implements LoadSpec { private final String bucket; diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3StorageDruidModule.java b/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3StorageDruidModule.java index 2daaadbe7e22..8cc6c250b541 100644 --- a/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3StorageDruidModule.java +++ b/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3StorageDruidModule.java @@ -54,7 +54,9 @@ */ public class S3StorageDruidModule implements DruidModule { - public static final String SCHEME = "s3_zip"; + public static final String SCHEME = "s3"; + public static final String SCHEME_S3N = "s3n"; + public static final String SCHEME_S3_ZIP = "s3_zip"; private static final Logger log = new Logger(S3StorageDruidModule.class); @@ -139,27 +141,27 @@ public void setupModule(SetupContext context) public void configure(Binder binder) { MapBinder.newMapBinder(binder, String.class, SearchableVersionedDataFinder.class) - .addBinding("s3") + .addBinding(SCHEME) .to(S3TimestampVersionedDataFinder.class) .in(LazySingleton.class); MapBinder.newMapBinder(binder, String.class, SearchableVersionedDataFinder.class) - .addBinding("s3n") + .addBinding(SCHEME_S3N) .to(S3TimestampVersionedDataFinder.class) .in(LazySingleton.class); - Binders.dataSegmentKillerBinder(binder).addBinding(SCHEME).to(S3DataSegmentKiller.class).in(LazySingleton.class); - Binders.dataSegmentMoverBinder(binder).addBinding(SCHEME).to(S3DataSegmentMover.class).in(LazySingleton.class); + Binders.dataSegmentKillerBinder(binder).addBinding(SCHEME_S3_ZIP).to(S3DataSegmentKiller.class).in(LazySingleton.class); + Binders.dataSegmentMoverBinder(binder).addBinding(SCHEME_S3_ZIP).to(S3DataSegmentMover.class).in(LazySingleton.class); Binders.dataSegmentArchiverBinder(binder) - .addBinding(SCHEME) + .addBinding(SCHEME_S3_ZIP) .to(S3DataSegmentArchiver.class) .in(LazySingleton.class); - Binders.dataSegmentPusherBinder(binder).addBinding("s3").to(S3DataSegmentPusher.class).in(LazySingleton.class); + Binders.dataSegmentPusherBinder(binder).addBinding(SCHEME).to(S3DataSegmentPusher.class).in(LazySingleton.class); JsonConfigProvider.bind(binder, "druid.storage", S3DataSegmentPusherConfig.class); JsonConfigProvider.bind(binder, "druid.storage", S3DataSegmentArchiverConfig.class); JsonConfigProvider.bind(binder, "druid.storage", S3StorageConfig.class); JsonConfigProvider.bind(binder, "druid.storage.sse.kms", S3SSEKmsConfig.class); JsonConfigProvider.bind(binder, "druid.storage.sse.custom", S3SSECustomConfig.class); - Binders.taskLogsBinder(binder).addBinding("s3").to(S3TaskLogs.class); + Binders.taskLogsBinder(binder).addBinding(SCHEME).to(S3TaskLogs.class); JsonConfigProvider.bind(binder, "druid.indexer.logs", S3TaskLogsConfig.class); binder.bind(S3TaskLogs.class).in(LazySingleton.class); } diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3TimestampVersionedDataFinder.java b/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3TimestampVersionedDataFinder.java index 61bf7b14a4fd..075d94bb4647 100644 --- a/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3TimestampVersionedDataFinder.java +++ b/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3TimestampVersionedDataFinder.java @@ -67,17 +67,18 @@ public URI getLatestVersion(final URI uri, final @Nullable Pattern pattern) ); while (objectSummaryIterator.hasNext()) { final S3ObjectSummary objectSummary = objectSummaryIterator.next(); - String keyString = - StringUtils.maybeRemoveLeadingSlash(objectSummary.getKey().substring(coords.getPath().length())); + final CloudObjectLocation objectLocation = S3Utils.summaryToCloudObjectLocation(objectSummary); + // remove coords path prefix from object path + String keyString = StringUtils.maybeRemoveLeadingSlash( + objectLocation.getPath().substring(coords.getPath().length()) + ); if (pattern != null && !pattern.matcher(keyString).matches()) { continue; } final long latestModified = objectSummary.getLastModified().getTime(); if (latestModified >= mostRecent) { mostRecent = latestModified; - latest = new URI( - StringUtils.format("s3://%s/%s", objectSummary.getBucketName(), objectSummary.getKey()) - ); + latest = objectLocation.toUri(S3StorageDruidModule.SCHEME); } } return latest; diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3Utils.java b/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3Utils.java index 613c51715bcc..5aaa0f187709 100644 --- a/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3Utils.java +++ b/extensions-core/s3-extensions/src/main/java/org/apache/druid/storage/s3/S3Utils.java @@ -53,10 +53,12 @@ */ public class S3Utils { + private static final String SCHEME = S3StorageDruidModule.SCHEME; private static final Joiner JOINER = Joiner.on("/").skipNulls(); private static final String MIMETYPE_JETS3T_DIRECTORY = "application/x-directory"; private static final Logger log = new Logger(S3Utils.class); + static boolean isServiceExceptionRecoverable(AmazonServiceException ex) { final boolean isIOException = ex.getCause() instanceof IOException; @@ -255,14 +257,7 @@ public S3ObjectSummary next() */ public static URI summaryToUri(S3ObjectSummary object) { - final String originalAuthority = object.getBucketName(); - final String originalPath = object.getKey(); - final String authority = originalAuthority.endsWith("/") ? - originalAuthority.substring(0, originalAuthority.length() - 1) : - originalAuthority; - final String path = StringUtils.maybeRemoveLeadingSlash(originalPath); - - return URI.create(StringUtils.format("s3://%s/%s", authority, path)); + return summaryToCloudObjectLocation(object).toUri(SCHEME); } public static CloudObjectLocation summaryToCloudObjectLocation(S3ObjectSummary object) @@ -292,10 +287,10 @@ public static String extractS3Key(URI uri) public static URI checkURI(URI uri) { - if (uri.getScheme().equalsIgnoreCase(S3StorageDruidModule.SCHEME)) { - uri = URI.create("s3" + uri.toString().substring(S3StorageDruidModule.SCHEME.length())); - } else if (!"s3".equalsIgnoreCase(uri.getScheme())) { - throw new IAE("Don't know how to load scheme for URI [%s]", uri.toString()); + if (uri.getScheme().equalsIgnoreCase(S3StorageDruidModule.SCHEME_S3_ZIP)) { + uri = URI.create(SCHEME + uri.toString().substring(S3StorageDruidModule.SCHEME_S3_ZIP.length())); + } else if (!SCHEME.equalsIgnoreCase(uri.getScheme())) { + throw new IAE("Invalid URI scheme [%s] must be [%s]", uri.toString(), SCHEME); } return uri; } diff --git a/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java b/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java index 29a040b886e0..5286d6da49e5 100644 --- a/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java +++ b/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java @@ -222,7 +222,7 @@ public void testReader() throws IOException InputSourceReader reader = inputSource.reader( someSchema, - new CsvInputFormat(ImmutableList.of("time", "dim1", "dim2"), "|", false, 0), + new CsvInputFormat(ImmutableList.of("time", "dim1", "dim2"), "|", false, null, 0), temporaryFolder.newFolder() ); @@ -261,7 +261,7 @@ public void testCompressedReader() throws IOException InputSourceReader reader = inputSource.reader( someSchema, - new CsvInputFormat(ImmutableList.of("time", "dim1", "dim2"), "|", false, 0), + new CsvInputFormat(ImmutableList.of("time", "dim1", "dim2"), "|", false, null, 0), temporaryFolder.newFolder() ); diff --git a/extensions-core/s3-extensions/src/test/java/org/apache/druid/storage/s3/S3DataSegmentArchiverTest.java b/extensions-core/s3-extensions/src/test/java/org/apache/druid/storage/s3/S3DataSegmentArchiverTest.java index 660ef35552b6..f2531d4e0fcf 100644 --- a/extensions-core/s3-extensions/src/test/java/org/apache/druid/storage/s3/S3DataSegmentArchiverTest.java +++ b/extensions-core/s3-extensions/src/test/java/org/apache/druid/storage/s3/S3DataSegmentArchiverTest.java @@ -85,7 +85,7 @@ public String getArchiveBaseKey() .version("version") .loadSpec(ImmutableMap.of( "type", - S3StorageDruidModule.SCHEME, + S3StorageDruidModule.SCHEME_S3_ZIP, S3DataSegmentPuller.BUCKET, "source_bucket", S3DataSegmentPuller.KEY, @@ -107,7 +107,7 @@ public void testSimpleArchive() throws Exception final DataSegment archivedSegment = SOURCE_SEGMENT .withLoadSpec(ImmutableMap.of( "type", - S3StorageDruidModule.SCHEME, + S3StorageDruidModule.SCHEME_S3_ZIP, S3DataSegmentPuller.BUCKET, ARCHIVER_CONFIG.getArchiveBucket(), S3DataSegmentPuller.KEY, @@ -144,7 +144,7 @@ public void testSimpleRestore() throws Exception final DataSegment archivedSegment = SOURCE_SEGMENT .withLoadSpec(ImmutableMap.of( "type", - S3StorageDruidModule.SCHEME, + S3StorageDruidModule.SCHEME_S3_ZIP, S3DataSegmentPuller.BUCKET, ARCHIVER_CONFIG.getArchiveBucket(), S3DataSegmentPuller.KEY, From 8e96e55cb4e0f995d54f7dc7ccfd04c8b22fb274 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Sat, 23 Nov 2019 23:24:03 -0800 Subject: [PATCH 23/28] more tests, enforce uri for bucket --- .../data/input/impl/CloudObjectLocation.java | 14 +++++-- .../input/impl/CloudObjectLocationTest.java | 41 ++++++++++++++++--- 2 files changed, 45 insertions(+), 10 deletions(-) diff --git a/core/src/main/java/org/apache/druid/data/input/impl/CloudObjectLocation.java b/core/src/main/java/org/apache/druid/data/input/impl/CloudObjectLocation.java index 2bc7eedb3790..5fcdf2815e12 100644 --- a/core/src/main/java/org/apache/druid/data/input/impl/CloudObjectLocation.java +++ b/core/src/main/java/org/apache/druid/data/input/impl/CloudObjectLocation.java @@ -33,6 +33,15 @@ * * The intention is that this is used as a common representation for storage objects as an alternative to dealing in * {@link URI} directly, but still provide a mechansim to round-trip with a URI. + * + * In common clouds, bucket names must be dns compliant: + * https://docs.aws.amazon.com/AmazonS3/latest/dev/BucketRestrictions.html + * https://docs.microsoft.com/en-us/rest/api/storageservices/naming-and-referencing-containers--blobs--and-metadata + * https://cloud.google.com/storage/docs/naming + * + * The constructor ensures that bucket names are DNS compliant by checking that the URL encoded form of the bucket + * matches the supplied value. Technically it should probably confirm that the bucket is also all lower-case, but + * S3 has a legacy mode where buckets did not have to be compliant so we can't enforce that here unfortunately. */ public class CloudObjectLocation { @@ -44,6 +53,7 @@ public CloudObjectLocation(@JsonProperty("bucket") String bucket, @JsonProperty( { this.bucket = Preconditions.checkNotNull(StringUtils.maybeRemoveTrailingSlash(bucket)); this.path = Preconditions.checkNotNull(StringUtils.maybeRemoveLeadingSlash(path)); + Preconditions.checkArgument(this.bucket.equals(StringUtils.urlEncode(this.bucket))); } public CloudObjectLocation(URI uri) @@ -55,10 +65,6 @@ public CloudObjectLocation(URI uri) * Given a scheme, encode {@link #bucket} and {@link #path} into a {@link URI}. * * In all clouds bucket names must be dns compliant, so it does not require encoding - * https://docs.aws.amazon.com/AmazonS3/latest/dev/BucketRestrictions.html - * https://docs.microsoft.com/en-us/rest/api/storageservices/naming-and-referencing-containers--blobs--and-metadata - * https://cloud.google.com/storage/docs/naming - * * There is no such restriction on object names, so they will be URL encoded when constructing the URI */ public URI toUri(String scheme) diff --git a/core/src/test/java/org/apache/druid/data/input/impl/CloudObjectLocationTest.java b/core/src/test/java/org/apache/druid/data/input/impl/CloudObjectLocationTest.java index ec41df9c27ed..990fa0e1e088 100644 --- a/core/src/test/java/org/apache/druid/data/input/impl/CloudObjectLocationTest.java +++ b/core/src/test/java/org/apache/druid/data/input/impl/CloudObjectLocationTest.java @@ -21,7 +21,9 @@ import com.fasterxml.jackson.databind.ObjectMapper; import org.junit.Assert; +import org.junit.Rule; import org.junit.Test; +import org.junit.rules.ExpectedException; import java.net.URI; @@ -29,15 +31,22 @@ public class CloudObjectLocationTest { private static final ObjectMapper MAPPER = new ObjectMapper(); private static final String SCHEME = "s3"; + private static final String BUCKET_NAME = "bucket"; private static final CloudObjectLocation LOCATION = - new CloudObjectLocation("someBucket", "path/to/myobject"); + new CloudObjectLocation(BUCKET_NAME, "path/to/myobject"); + + private static final CloudObjectLocation LOCATION_EXTRA_SLASHES = + new CloudObjectLocation(BUCKET_NAME + '/', "/path/to/myobject"); private static final CloudObjectLocation LOCATION_URLENCODE = - new CloudObjectLocation("someBucket", "path/to/myobject?question"); + new CloudObjectLocation(BUCKET_NAME, "path/to/myobject?question"); private static final CloudObjectLocation LOCATION_NON_ASCII = - new CloudObjectLocation("someBucket", "pÄth/tø/myøbject"); + new CloudObjectLocation(BUCKET_NAME, "pÄth/tø/myøbject"); + + @Rule + public ExpectedException expectedException = ExpectedException.none(); @Test public void testSerde() throws Exception @@ -47,6 +56,11 @@ public void testSerde() throws Exception MAPPER.readValue(MAPPER.writeValueAsString(LOCATION), CloudObjectLocation.class) ); + Assert.assertEquals( + LOCATION_EXTRA_SLASHES, + MAPPER.readValue(MAPPER.writeValueAsString(LOCATION_EXTRA_SLASHES), CloudObjectLocation.class) + ); + Assert.assertEquals( LOCATION_URLENCODE, MAPPER.readValue(MAPPER.writeValueAsString(LOCATION_URLENCODE), CloudObjectLocation.class) @@ -62,17 +76,22 @@ public void testSerde() throws Exception public void testToUri() { Assert.assertEquals( - URI.create("s3://someBucket/path/to/myobject"), + URI.create("s3://bucket/path/to/myobject"), LOCATION.toUri(SCHEME) ); Assert.assertEquals( - URI.create("s3://someBucket/path/to/myobject%3Fquestion"), + URI.create("s3://bucket/path/to/myobject"), + LOCATION_EXTRA_SLASHES.toUri(SCHEME) + ); + + Assert.assertEquals( + URI.create("s3://bucket/path/to/myobject%3Fquestion"), LOCATION_URLENCODE.toUri(SCHEME) ); Assert.assertEquals( - URI.create("s3://someBucket/p%C3%84th/t%C3%B8/my%C3%B8bject"), + URI.create("s3://bucket/p%C3%84th/t%C3%B8/my%C3%B8bject"), LOCATION_NON_ASCII.toUri(SCHEME) ); } @@ -81,7 +100,17 @@ public void testToUri() public void testUriRoundTrip() { Assert.assertEquals(LOCATION, new CloudObjectLocation(LOCATION.toUri(SCHEME))); + Assert.assertEquals(LOCATION_EXTRA_SLASHES, new CloudObjectLocation(LOCATION_EXTRA_SLASHES.toUri(SCHEME))); Assert.assertEquals(LOCATION_URLENCODE, new CloudObjectLocation(LOCATION_URLENCODE.toUri(SCHEME))); Assert.assertEquals(LOCATION_NON_ASCII, new CloudObjectLocation(LOCATION_NON_ASCII.toUri(SCHEME))); } + + @Test + public void testBucketName() + { + expectedException.expect(IllegalArgumentException.class); + CloudObjectLocation invalidBucket = new CloudObjectLocation("someBÜcket", "some/path"); + // will never get here + Assert.assertEquals(invalidBucket, new CloudObjectLocation(invalidBucket.toUri(SCHEME))); + } } From 283d457e9e2911cfcca12141771746f3aa294b1a Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Sat, 23 Nov 2019 23:31:03 -0800 Subject: [PATCH 24/28] javadoc --- .../druid/data/input/RetryingInputEntity.java | 18 ++++++++++++------ .../druid/data/input/impl/HttpEntity.java | 2 +- .../input/google/GoogleCloudStorageEntity.java | 2 +- .../inputsource/hdfs/HdfsInputEntity.java | 2 +- .../apache/druid/data/input/s3/S3Entity.java | 2 +- 5 files changed, 16 insertions(+), 10 deletions(-) diff --git a/core/src/main/java/org/apache/druid/data/input/RetryingInputEntity.java b/core/src/main/java/org/apache/druid/data/input/RetryingInputEntity.java index 0cec7e20a59e..cd9efb113f16 100644 --- a/core/src/main/java/org/apache/druid/data/input/RetryingInputEntity.java +++ b/core/src/main/java/org/apache/druid/data/input/RetryingInputEntity.java @@ -30,6 +30,10 @@ public interface RetryingInputEntity extends InputEntity { + /** + * Open a {@link RetryingInputStream} wrapper for an underlying input stream, optionally decompressing the retrying + * stream if the file extension matches a known compression, otherwise passing through the retrying stream directly. + */ @Override default InputStream open() throws IOException { @@ -39,12 +43,12 @@ default InputStream open() throws IOException getRetryCondition(), RetryUtils.DEFAULT_MAX_TRIES ); - return CompressionUtils.decompress(retryingInputStream, getDecompressionPath()); + return CompressionUtils.decompress(retryingInputStream, getPath()); } /** - * Directly opens an {@link InputStream} on the input entity. Decompression should be handled externally, this should - * return the raw stream for the object. + * Directly opens an {@link InputStream} on the input entity. Decompression should be handled externally, and is + * handled by the default implementation of {@link #open}, so this should return the raw stream for the object. */ default InputStream readFromStart() throws IOException { @@ -53,7 +57,8 @@ default InputStream readFromStart() throws IOException /** * Directly opens an {@link InputStream} starting at the given offset on the input entity. Decompression should be - * handled externally, this should return the raw stream for the object. + * handled externally, and is handled by the default implementation of {@link #open},this should return the raw stream + * for the object. * * @param offset an offset to start reading from. A non-negative integer counting * the number of bytes from the beginning of the entity @@ -62,9 +67,10 @@ default InputStream readFromStart() throws IOException /** - * Get path to decompress a compressed stream for the entity + * Get path name for this entity, used by the default implementation of {@link #open} to determine if the underlying + * stream needs decompressed, based on file extension of the path */ - String getDecompressionPath(); + String getPath(); @Override Predicate getRetryCondition(); diff --git a/core/src/main/java/org/apache/druid/data/input/impl/HttpEntity.java b/core/src/main/java/org/apache/druid/data/input/impl/HttpEntity.java index d3d47c81b8d2..2a7795e0d9aa 100644 --- a/core/src/main/java/org/apache/druid/data/input/impl/HttpEntity.java +++ b/core/src/main/java/org/apache/druid/data/input/impl/HttpEntity.java @@ -69,7 +69,7 @@ public InputStream readFrom(long offset) throws IOException } @Override - public String getDecompressionPath() + public String getPath() { return uri.getPath(); } diff --git a/extensions-core/google-extensions/src/main/java/org/apache/druid/data/input/google/GoogleCloudStorageEntity.java b/extensions-core/google-extensions/src/main/java/org/apache/druid/data/input/google/GoogleCloudStorageEntity.java index 968de50cf3bb..7248631f7912 100644 --- a/extensions-core/google-extensions/src/main/java/org/apache/druid/data/input/google/GoogleCloudStorageEntity.java +++ b/extensions-core/google-extensions/src/main/java/org/apache/druid/data/input/google/GoogleCloudStorageEntity.java @@ -60,7 +60,7 @@ public InputStream readFrom(long offset) throws IOException } @Override - public String getDecompressionPath() + public String getPath() { return StringUtils.maybeRemoveLeadingSlash(uri.getPath()); } diff --git a/extensions-core/hdfs-storage/src/main/java/org/apache/druid/inputsource/hdfs/HdfsInputEntity.java b/extensions-core/hdfs-storage/src/main/java/org/apache/druid/inputsource/hdfs/HdfsInputEntity.java index 1a0f93137e69..4caf3f42ee6f 100644 --- a/extensions-core/hdfs-storage/src/main/java/org/apache/druid/inputsource/hdfs/HdfsInputEntity.java +++ b/extensions-core/hdfs-storage/src/main/java/org/apache/druid/inputsource/hdfs/HdfsInputEntity.java @@ -58,7 +58,7 @@ public InputStream readFrom(long offset) throws IOException } @Override - public String getDecompressionPath() + public String getPath() { return path.getName(); } diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3Entity.java b/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3Entity.java index 7ba75796fdce..9071cf4dd90b 100644 --- a/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3Entity.java +++ b/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3Entity.java @@ -73,7 +73,7 @@ public InputStream readFrom(long offset) throws IOException } @Override - public String getDecompressionPath() + public String getPath() { return object.getPath(); } From bdb22ca7e8aa213cd4d7cf21ea45572906d5648c Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Mon, 25 Nov 2019 13:00:26 -0800 Subject: [PATCH 25/28] oops --- extensions-core/s3-extensions/pom.xml | 1 - 1 file changed, 1 deletion(-) diff --git a/extensions-core/s3-extensions/pom.xml b/extensions-core/s3-extensions/pom.xml index 25e3c04d6d94..473002ccf8ae 100644 --- a/extensions-core/s3-extensions/pom.xml +++ b/extensions-core/s3-extensions/pom.xml @@ -116,7 +116,6 @@ joda-time joda-time - 2.10.5 provided From bcca7822589460f4927acfd842210384bf8d3580 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Mon, 25 Nov 2019 15:29:50 -0800 Subject: [PATCH 26/28] abstract class instead of interface --- .../druid/data/input/RetryingInputEntity.java | 17 ++++++----------- .../data/input/impl/CloudObjectLocation.java | 17 +++++++++-------- .../druid/data/input/impl/HttpEntity.java | 6 +++--- .../input/google/GoogleCloudStorageEntity.java | 6 +++--- .../druid/inputsource/hdfs/HdfsInputEntity.java | 6 +++--- .../apache/druid/data/input/s3/S3Entity.java | 6 +++--- 6 files changed, 27 insertions(+), 31 deletions(-) diff --git a/core/src/main/java/org/apache/druid/data/input/RetryingInputEntity.java b/core/src/main/java/org/apache/druid/data/input/RetryingInputEntity.java index cd9efb113f16..4dd512ffd395 100644 --- a/core/src/main/java/org/apache/druid/data/input/RetryingInputEntity.java +++ b/core/src/main/java/org/apache/druid/data/input/RetryingInputEntity.java @@ -19,7 +19,6 @@ package org.apache.druid.data.input; -import com.google.common.base.Predicate; import org.apache.druid.data.input.impl.RetryingInputStream; import org.apache.druid.data.input.impl.prefetch.ObjectOpenFunction; import org.apache.druid.java.util.common.RetryUtils; @@ -28,14 +27,14 @@ import java.io.IOException; import java.io.InputStream; -public interface RetryingInputEntity extends InputEntity +public abstract class RetryingInputEntity implements InputEntity { /** * Open a {@link RetryingInputStream} wrapper for an underlying input stream, optionally decompressing the retrying * stream if the file extension matches a known compression, otherwise passing through the retrying stream directly. */ @Override - default InputStream open() throws IOException + public InputStream open() throws IOException { RetryingInputStream retryingInputStream = new RetryingInputStream<>( this, @@ -50,7 +49,7 @@ default InputStream open() throws IOException * Directly opens an {@link InputStream} on the input entity. Decompression should be handled externally, and is * handled by the default implementation of {@link #open}, so this should return the raw stream for the object. */ - default InputStream readFromStart() throws IOException + private InputStream readFromStart() throws IOException { return readFrom(0); } @@ -63,19 +62,15 @@ default InputStream readFromStart() throws IOException * @param offset an offset to start reading from. A non-negative integer counting * the number of bytes from the beginning of the entity */ - InputStream readFrom(long offset) throws IOException; - + protected abstract InputStream readFrom(long offset) throws IOException; /** * Get path name for this entity, used by the default implementation of {@link #open} to determine if the underlying * stream needs decompressed, based on file extension of the path */ - String getPath(); - - @Override - Predicate getRetryCondition(); + protected abstract String getPath(); - class RetryingInputEntityOpenFunction implements ObjectOpenFunction + private static class RetryingInputEntityOpenFunction implements ObjectOpenFunction { @Override public InputStream open(RetryingInputEntity object) throws IOException diff --git a/core/src/main/java/org/apache/druid/data/input/impl/CloudObjectLocation.java b/core/src/main/java/org/apache/druid/data/input/impl/CloudObjectLocation.java index 5fcdf2815e12..71e1d0ee76bf 100644 --- a/core/src/main/java/org/apache/druid/data/input/impl/CloudObjectLocation.java +++ b/core/src/main/java/org/apache/druid/data/input/impl/CloudObjectLocation.java @@ -32,12 +32,12 @@ * implementations. {@link #bucket} and {@link #path} should NOT be URL encoded. * * The intention is that this is used as a common representation for storage objects as an alternative to dealing in - * {@link URI} directly, but still provide a mechansim to round-trip with a URI. + * {@link URI} directly, but still provide a mechanism to round-trip with a URI. * * In common clouds, bucket names must be dns compliant: - * https://docs.aws.amazon.com/AmazonS3/latest/dev/BucketRestrictions.html - * https://docs.microsoft.com/en-us/rest/api/storageservices/naming-and-referencing-containers--blobs--and-metadata - * https://cloud.google.com/storage/docs/naming + * https://docs.aws.amazon.com/AmazonS3/latest/dev/BucketRestrictions.html + * https://docs.microsoft.com/en-us/rest/api/storageservices/naming-and-referencing-containers--blobs--and-metadata + * https://cloud.google.com/storage/docs/naming * * The constructor ensures that bucket names are DNS compliant by checking that the URL encoded form of the bucket * matches the supplied value. Technically it should probably confirm that the bucket is also all lower-case, but @@ -95,10 +95,10 @@ public String getPath() @Override public String toString() { - return "CloudObjectLocation {" - + "bucket=" + bucket - + ",path=" + path - + "}"; + return "CloudObjectLocation{" + + "bucket='" + bucket + '\'' + + ", path='" + path + '\'' + + '}'; } @Override @@ -122,4 +122,5 @@ public int hashCode() { return Objects.hash(bucket, path); } + } diff --git a/core/src/main/java/org/apache/druid/data/input/impl/HttpEntity.java b/core/src/main/java/org/apache/druid/data/input/impl/HttpEntity.java index 2a7795e0d9aa..e5f5e2408260 100644 --- a/core/src/main/java/org/apache/druid/data/input/impl/HttpEntity.java +++ b/core/src/main/java/org/apache/druid/data/input/impl/HttpEntity.java @@ -35,7 +35,7 @@ import java.net.URLConnection; import java.util.Base64; -public class HttpEntity implements RetryingInputEntity +public class HttpEntity extends RetryingInputEntity { private static final Logger LOG = new Logger(HttpEntity.class); @@ -63,13 +63,13 @@ public URI getUri() } @Override - public InputStream readFrom(long offset) throws IOException + protected InputStream readFrom(long offset) throws IOException { return openInputStream(uri, httpAuthenticationUsername, httpAuthenticationPasswordProvider, offset); } @Override - public String getPath() + protected String getPath() { return uri.getPath(); } diff --git a/extensions-core/google-extensions/src/main/java/org/apache/druid/data/input/google/GoogleCloudStorageEntity.java b/extensions-core/google-extensions/src/main/java/org/apache/druid/data/input/google/GoogleCloudStorageEntity.java index 7248631f7912..77a1a2e2f30b 100644 --- a/extensions-core/google-extensions/src/main/java/org/apache/druid/data/input/google/GoogleCloudStorageEntity.java +++ b/extensions-core/google-extensions/src/main/java/org/apache/druid/data/input/google/GoogleCloudStorageEntity.java @@ -31,7 +31,7 @@ import java.io.InputStream; import java.net.URI; -public class GoogleCloudStorageEntity implements RetryingInputEntity +public class GoogleCloudStorageEntity extends RetryingInputEntity { private final GoogleStorage storage; private final URI uri; @@ -50,7 +50,7 @@ public URI getUri() } @Override - public InputStream readFrom(long offset) throws IOException + protected InputStream readFrom(long offset) throws IOException { // Get data of the given object and open an input stream final String bucket = uri.getAuthority(); @@ -60,7 +60,7 @@ public InputStream readFrom(long offset) throws IOException } @Override - public String getPath() + protected String getPath() { return StringUtils.maybeRemoveLeadingSlash(uri.getPath()); } diff --git a/extensions-core/hdfs-storage/src/main/java/org/apache/druid/inputsource/hdfs/HdfsInputEntity.java b/extensions-core/hdfs-storage/src/main/java/org/apache/druid/inputsource/hdfs/HdfsInputEntity.java index 4caf3f42ee6f..cd75d6c2a720 100644 --- a/extensions-core/hdfs-storage/src/main/java/org/apache/druid/inputsource/hdfs/HdfsInputEntity.java +++ b/extensions-core/hdfs-storage/src/main/java/org/apache/druid/inputsource/hdfs/HdfsInputEntity.java @@ -31,7 +31,7 @@ import java.io.InputStream; import java.net.URI; -public class HdfsInputEntity implements RetryingInputEntity +public class HdfsInputEntity extends RetryingInputEntity { private final Configuration conf; private final Path path; @@ -49,7 +49,7 @@ public URI getUri() } @Override - public InputStream readFrom(long offset) throws IOException + protected InputStream readFrom(long offset) throws IOException { final FileSystem fs = path.getFileSystem(conf); final FSDataInputStream inputStream = fs.open(path); @@ -58,7 +58,7 @@ public InputStream readFrom(long offset) throws IOException } @Override - public String getPath() + protected String getPath() { return path.getName(); } diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3Entity.java b/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3Entity.java index 9071cf4dd90b..00efaaa0691b 100644 --- a/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3Entity.java +++ b/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3Entity.java @@ -33,7 +33,7 @@ import java.io.InputStream; import java.net.URI; -public class S3Entity implements RetryingInputEntity +public class S3Entity extends RetryingInputEntity { private final ServerSideEncryptingAmazonS3 s3Client; private final CloudObjectLocation object; @@ -51,7 +51,7 @@ public URI getUri() } @Override - public InputStream readFrom(long offset) throws IOException + protected InputStream readFrom(long offset) throws IOException { final GetObjectRequest request = new GetObjectRequest(object.getBucket(), object.getPath()); request.setRange(offset); @@ -73,7 +73,7 @@ public InputStream readFrom(long offset) throws IOException } @Override - public String getPath() + protected String getPath() { return object.getPath(); } From 42837690b13d2cfbe9082ab969f9299c7714009a Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Mon, 25 Nov 2019 15:58:44 -0800 Subject: [PATCH 27/28] null or empty --- .../druid/data/input/RetryingInputEntity.java | 2 +- .../apache/druid/utils/CollectionUtils.java | 7 ++ .../druid/data/input/s3/S3InputSource.java | 11 +-- .../data/input/s3/S3InputSourceTest.java | 68 +++++++++++++++++-- 4 files changed, 78 insertions(+), 10 deletions(-) diff --git a/core/src/main/java/org/apache/druid/data/input/RetryingInputEntity.java b/core/src/main/java/org/apache/druid/data/input/RetryingInputEntity.java index 4dd512ffd395..3c0aa4415009 100644 --- a/core/src/main/java/org/apache/druid/data/input/RetryingInputEntity.java +++ b/core/src/main/java/org/apache/druid/data/input/RetryingInputEntity.java @@ -49,7 +49,7 @@ public InputStream open() throws IOException * Directly opens an {@link InputStream} on the input entity. Decompression should be handled externally, and is * handled by the default implementation of {@link #open}, so this should return the raw stream for the object. */ - private InputStream readFromStart() throws IOException + protected InputStream readFromStart() throws IOException { return readFrom(0); } diff --git a/core/src/main/java/org/apache/druid/utils/CollectionUtils.java b/core/src/main/java/org/apache/druid/utils/CollectionUtils.java index 530feb855b60..02d55d06a5d8 100644 --- a/core/src/main/java/org/apache/druid/utils/CollectionUtils.java +++ b/core/src/main/java/org/apache/druid/utils/CollectionUtils.java @@ -23,11 +23,13 @@ import com.google.common.collect.Maps; import org.apache.druid.java.util.common.ISE; +import javax.annotation.Nullable; import java.util.AbstractCollection; import java.util.Collection; import java.util.Comparator; import java.util.HashMap; import java.util.Iterator; +import java.util.List; import java.util.Map; import java.util.Spliterator; import java.util.TreeSet; @@ -116,6 +118,11 @@ public static Map mapKeys(Map map, Function keyMa return result; } + public static boolean isNullOrEmpty(@Nullable List list) + { + return list == null || list.isEmpty(); + } + private CollectionUtils() { } diff --git a/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSource.java b/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSource.java index 3ae7dc762506..0080ad8b7c6a 100644 --- a/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSource.java +++ b/extensions-core/s3-extensions/src/main/java/org/apache/druid/data/input/s3/S3InputSource.java @@ -36,6 +36,7 @@ import org.apache.druid.data.input.impl.SplittableInputSource; import org.apache.druid.storage.s3.S3Utils; import org.apache.druid.storage.s3.ServerSideEncryptingAmazonS3; +import org.apache.druid.utils.CollectionUtils; import javax.annotation.Nullable; import java.io.File; @@ -67,12 +68,12 @@ public S3InputSource( this.prefixes = prefixes; this.objects = objects; - if (objects != null) { - throwIfIllegalArgs(uris != null || prefixes != null); - } else if (uris != null) { - throwIfIllegalArgs(prefixes != null); + if (!CollectionUtils.isNullOrEmpty(objects)) { + throwIfIllegalArgs(!CollectionUtils.isNullOrEmpty(uris) || !CollectionUtils.isNullOrEmpty(prefixes)); + } else if (!CollectionUtils.isNullOrEmpty(uris)) { + throwIfIllegalArgs(!CollectionUtils.isNullOrEmpty(prefixes)); uris.forEach(S3Utils::checkURI); - } else if (prefixes != null) { + } else if (!CollectionUtils.isNullOrEmpty(prefixes)) { prefixes.forEach(S3Utils::checkURI); } else { throwIfIllegalArgs(true); diff --git a/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java b/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java index 5286d6da49e5..1c1bc0199123 100644 --- a/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java +++ b/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java @@ -66,6 +66,7 @@ import org.junit.Assert; import org.junit.Rule; import org.junit.Test; +import org.junit.rules.ExpectedException; import org.junit.rules.TemporaryFolder; import java.io.ByteArrayInputStream; @@ -79,9 +80,6 @@ public class S3InputSourceTest extends InitializedNullHandlingTest { - @Rule - public TemporaryFolder temporaryFolder = new TemporaryFolder(); - private static final ObjectMapper MAPPER = createS3ObjectMapper(); private static final AmazonS3Client S3_CLIENT = EasyMock.createNiceMock(AmazonS3Client.class); private static final ServerSideEncryptingAmazonS3 SERVICE = new ServerSideEncryptingAmazonS3( @@ -107,10 +105,19 @@ public class S3InputSourceTest extends InitializedNullHandlingTest URI.create("s3://bar/foo") ); + private static final List EXPECTED_LOCATION = + ImmutableList.of(new CloudObjectLocation("foo", "bar/file.csv")); + private static final DateTime NOW = DateTimes.nowUtc(); private static final byte[] CONTENT = StringUtils.toUtf8(StringUtils.format("%d,hello,world", NOW.getMillis())); + @Rule + public TemporaryFolder temporaryFolder = new TemporaryFolder(); + + @Rule + public ExpectedException expectedException = ExpectedException.none(); + @Test public void testSerdeWithUris() throws Exception { @@ -136,13 +143,66 @@ public void testSerdeWithObjects() throws Exception SERVICE, null, null, - ImmutableList.of(new CloudObjectLocation("foo", "bar/file.csv")) + EXPECTED_LOCATION + ); + final S3InputSource serdeWithPrefixes = + MAPPER.readValue(MAPPER.writeValueAsString(withPrefixes), S3InputSource.class); + Assert.assertEquals(withPrefixes, serdeWithPrefixes); + } + + @Test + public void testSerdeWithExtraEmptyLists() throws Exception + { + final S3InputSource withPrefixes = new S3InputSource( + SERVICE, + ImmutableList.of(), + ImmutableList.of(), + EXPECTED_LOCATION ); final S3InputSource serdeWithPrefixes = MAPPER.readValue(MAPPER.writeValueAsString(withPrefixes), S3InputSource.class); Assert.assertEquals(withPrefixes, serdeWithPrefixes); } + @Test + public void testSerdeWithInvalidArgs() throws Exception + { + expectedException.expect(IllegalArgumentException.class); + // constructor will explode + new S3InputSource( + SERVICE, + EXPECTED_URIS, + PREFIXES, + EXPECTED_LOCATION + ); + } + + @Test + public void testSerdeWithOtherInvalidArgs() + { + expectedException.expect(IllegalArgumentException.class); + // constructor will explode + new S3InputSource( + SERVICE, + EXPECTED_URIS, + PREFIXES, + ImmutableList.of() + ); + } + + @Test + public void testSerdeWithOtherOtherInvalidArgs() + { + expectedException.expect(IllegalArgumentException.class); + // constructor will explode + new S3InputSource( + SERVICE, + ImmutableList.of(), + PREFIXES, + EXPECTED_LOCATION + ); + } + @Test public void testWithUrisSplit() { From 81ebdd4e8dd2ebf8c147f2f44031acd91586c987 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Mon, 25 Nov 2019 19:36:19 -0800 Subject: [PATCH 28/28] better error --- .../apache/druid/data/input/impl/CloudObjectLocation.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/core/src/main/java/org/apache/druid/data/input/impl/CloudObjectLocation.java b/core/src/main/java/org/apache/druid/data/input/impl/CloudObjectLocation.java index 71e1d0ee76bf..d98100722586 100644 --- a/core/src/main/java/org/apache/druid/data/input/impl/CloudObjectLocation.java +++ b/core/src/main/java/org/apache/druid/data/input/impl/CloudObjectLocation.java @@ -53,7 +53,10 @@ public CloudObjectLocation(@JsonProperty("bucket") String bucket, @JsonProperty( { this.bucket = Preconditions.checkNotNull(StringUtils.maybeRemoveTrailingSlash(bucket)); this.path = Preconditions.checkNotNull(StringUtils.maybeRemoveLeadingSlash(path)); - Preconditions.checkArgument(this.bucket.equals(StringUtils.urlEncode(this.bucket))); + Preconditions.checkArgument( + this.bucket.equals(StringUtils.urlEncode(this.bucket)), + "bucket must follow DNS-compliant naming conventions" + ); } public CloudObjectLocation(URI uri)