diff --git a/docs/configuration/index.md b/docs/configuration/index.md index 853ec3878a26..c31cf0747cba 100644 --- a/docs/configuration/index.md +++ b/docs/configuration/index.md @@ -1424,7 +1424,7 @@ Additional Peon configs include: |`druid.indexer.task.storeEmptyColumns`|Boolean value for whether or not to store empty columns during ingestion. When set to true, Druid stores every column specified in the [`dimensionsSpec`](../ingestion/ingestion-spec.md#dimensionsspec). If you use the string-based schemaless ingestion and don't specify any dimensions to ingest, you must also set [`includeAllDimensions`](../ingestion/ingestion-spec.md#dimensionsspec) for Druid to store empty columns.

If you set `storeEmptyColumns` to false, Druid SQL queries referencing empty columns will fail. If you intend to leave `storeEmptyColumns` disabled, you should either ingest placeholder data for empty columns or else not query on empty columns.

You can overwrite this configuration by setting `storeEmptyColumns` in the [task context](../ingestion/tasks.md#context-parameters).|true| |`druid.indexer.task.tmpStorageBytesPerTask`|Maximum number of bytes per task to be used to store temporary files on disk. This config is generally intended for internal usage. Attempts to set it are very likely to be overwritten by the TaskRunner that executes the task, so be sure of what you expect to happen before directly adjusting this configuration parameter. The config is documented here primarily to provide an understanding of what it means if/when someone sees that it has been set. A value of -1 disables this limit. |-1| |`druid.indexer.server.maxChatRequests`|Maximum number of concurrent requests served by a task's chat handler. Set to 0 to disable limiting.|0| -|`druid.indexing.formats.maxStringLength`|Maximum number of characters to store per string dimension value. Longer values are truncated during ingestion. Does not apply to multi-value string dimensions. Set to 0 to disable. Can be overridden per-dimension using `maxStringLength` in the [dimension object](../ingestion/ingestion-spec.md#dimension-objects).|0 (no truncation)| +|`druid.indexing.formats.maxStringLength`|Maximum number of characters to store per string dimension value. Longer values are truncated during ingestion. Does not apply to multi-value string dimensions. Can be overridden per-dimension using `maxStringLength` in the [dimension object](../ingestion/ingestion-spec.md#dimension-objects). Value must be >= 0. A value of `0` truncates every value to an empty string; leave the property unset to disable truncation.|`null` (no truncation)| If the Peon is running in remote mode, there must be an Overlord up and running. 
Peons in remote mode can set the following configurations: diff --git a/docs/ingestion/ingestion-spec.md b/docs/ingestion/ingestion-spec.md index 72ec6d793d38..b2a0e41f48d7 100644 --- a/docs/ingestion/ingestion-spec.md +++ b/docs/ingestion/ingestion-spec.md @@ -243,7 +243,7 @@ Dimension objects can have the following components: | name | The name of the dimension. This will be used as the field name to read from input records, as well as the column name stored in generated segments.

Note that you can use a [`transformSpec`](#transformspec) if you want to rename columns during ingestion time. | none (required) | | createBitmapIndex | For `string` typed dimensions, whether or not bitmap indexes should be created for the column in generated segments. Creating a bitmap index requires more storage, but speeds up certain kinds of filtering (especially equality and prefix filtering). Only supported for `string` typed dimensions. | `true` | | multiValueHandling | For `string` typed dimensions, specifies the type of handling for [multi-value fields](../querying/multi-value-dimensions.md). Possible values are `array` (ingest string arrays as-is), `sorted_array` (sort string arrays during ingestion), and `sorted_set` (sort and de-duplicate string arrays during ingestion). This parameter is ignored for types other than `string`. | `sorted_array` | -| maxStringLength | For `string` typed dimensions, the maximum number of characters to store per value. Longer values are truncated during ingestion. Does not apply to multi-value string dimensions. Set to 0 to disable. Overrides the global [`druid.indexing.formats.maxStringLength`](../configuration/index.md#additional-peon-configuration) property. | `0` (no truncation) | +| maxStringLength | For `string` typed dimensions, the maximum number of characters to store per value. Longer values are truncated during ingestion. Does not apply to multi-value string dimensions. Overrides the global [`druid.indexing.formats.maxStringLength`](../configuration/index.md#additional-peon-configuration) property. Value must be >= 0. 
| `null` (no truncation) | #### Inclusions and exclusions diff --git a/processing/src/main/java/org/apache/druid/data/input/impl/StringDimensionSchema.java b/processing/src/main/java/org/apache/druid/data/input/impl/StringDimensionSchema.java index ab00952e867a..018d9ca5c35e 100644 --- a/processing/src/main/java/org/apache/druid/data/input/impl/StringDimensionSchema.java +++ b/processing/src/main/java/org/apache/druid/data/input/impl/StringDimensionSchema.java @@ -23,6 +23,7 @@ import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.annotation.JsonProperty; +import org.apache.druid.error.DruidException; import org.apache.druid.guice.BuiltInTypesModule; import org.apache.druid.segment.DimensionHandler; import org.apache.druid.segment.StringDimensionHandler; @@ -34,12 +35,25 @@ public class StringDimensionSchema extends DimensionSchema { private static final boolean DEFAULT_CREATE_BITMAP_INDEX = true; - public static int getDefaultMaxStringLength() + @Nullable + public static Integer getDefaultMaxStringLength() { return BuiltInTypesModule.getMaxStringLength(); } - private final int maxStringLength; + @Nullable + private static Integer validateMaxStringLength(String name, @Nullable Integer maxStringLength) + { + if (maxStringLength != null && maxStringLength < 0) { + throw DruidException.forPersona(DruidException.Persona.USER) + .ofCategory(DruidException.Category.INVALID_INPUT) + .build("maxStringLength for column [%s] must be >= 0, got [%s]", name, maxStringLength); + } + return maxStringLength != null ? maxStringLength : getDefaultMaxStringLength(); + } + + @Nullable + private final Integer maxStringLength; @JsonCreator public static StringDimensionSchema create(String name) @@ -56,7 +70,7 @@ public StringDimensionSchema( ) { super(name, multiValueHandling, createBitmapIndex == null ? 
DEFAULT_CREATE_BITMAP_INDEX : createBitmapIndex); - this.maxStringLength = maxStringLength != null && maxStringLength > 0 ? maxStringLength : getDefaultMaxStringLength(); + this.maxStringLength = validateMaxStringLength(name, maxStringLength); } public StringDimensionSchema( @@ -65,17 +79,18 @@ public StringDimensionSchema( Boolean createBitmapIndex ) { - this(name, multiValueHandling, createBitmapIndex, getDefaultMaxStringLength()); + this(name, multiValueHandling, createBitmapIndex, null); } public StringDimensionSchema(String name) { - this(name, null, DEFAULT_CREATE_BITMAP_INDEX, getDefaultMaxStringLength()); + this(name, null, DEFAULT_CREATE_BITMAP_INDEX, null); } @JsonProperty - @JsonInclude(JsonInclude.Include.NON_DEFAULT) - public int getMaxStringLength() + @JsonInclude(JsonInclude.Include.NON_NULL) + @Nullable + public Integer getMaxStringLength() { return maxStringLength; } diff --git a/processing/src/main/java/org/apache/druid/guice/BuiltInTypesModule.java b/processing/src/main/java/org/apache/druid/guice/BuiltInTypesModule.java index e260a4bd8b66..5205c6ba311b 100644 --- a/processing/src/main/java/org/apache/druid/guice/BuiltInTypesModule.java +++ b/processing/src/main/java/org/apache/druid/guice/BuiltInTypesModule.java @@ -53,7 +53,8 @@ public class BuiltInTypesModule implements DruidModule */ private static DimensionSchema.MultiValueHandling STRING_MV_MODE = DimensionSchema.MultiValueHandling.SORTED_ARRAY; private static IndexSpec DEFAULT_INDEX_SPEC = IndexSpec.builder().build(); - private static int MAX_STRING_LENGTH = 0; + @Nullable + private static Integer MAX_STRING_LENGTH = null; /** * @return the configured string multi value handling mode from the system config if set; otherwise, returns @@ -138,12 +139,13 @@ private static void setMaxStringLengthIfConfigured(@Nullable Integer maxStringLe } @VisibleForTesting - public static void setMaxStringLength(int maxStringLength) + public static void setMaxStringLength(@Nullable Integer maxStringLength) { 
MAX_STRING_LENGTH = maxStringLength; } - public static int getMaxStringLength() + @Nullable + public static Integer getMaxStringLength() { return MAX_STRING_LENGTH; } diff --git a/processing/src/main/java/org/apache/druid/segment/DefaultColumnFormatConfig.java b/processing/src/main/java/org/apache/druid/segment/DefaultColumnFormatConfig.java index 19b875b5f6cb..56118b02686d 100644 --- a/processing/src/main/java/org/apache/druid/segment/DefaultColumnFormatConfig.java +++ b/processing/src/main/java/org/apache/druid/segment/DefaultColumnFormatConfig.java @@ -71,12 +71,12 @@ private static String validateMultiValueHandlingMode( @Nullable private static Integer validateMaxStringLength(@Nullable Integer maxStringLength) { - if (maxStringLength != null && maxStringLength <= 0) { + if (maxStringLength != null && maxStringLength < 0) { throw DruidException.forPersona(DruidException.Persona.OPERATOR) .ofCategory(DruidException.Category.INVALID_INPUT) .build( "Invalid value[%s] specified for 'druid.indexing.formats.maxStringLength'." 
- + " Value must be a positive integer.", + + " Value must be a non-negative integer.", maxStringLength ); } diff --git a/processing/src/main/java/org/apache/druid/segment/StringDimensionHandler.java b/processing/src/main/java/org/apache/druid/segment/StringDimensionHandler.java index d2b41ab7a4ba..0d23fe24aa77 100644 --- a/processing/src/main/java/org/apache/druid/segment/StringDimensionHandler.java +++ b/processing/src/main/java/org/apache/druid/segment/StringDimensionHandler.java @@ -32,6 +32,7 @@ import org.apache.druid.segment.selector.settable.SettableDimensionValueSelector; import org.apache.druid.segment.writeout.SegmentWriteOutMedium; +import javax.annotation.Nullable; import java.io.File; import java.util.Collections; import java.util.Comparator; @@ -104,7 +105,8 @@ private static IndexedInts getRow(ColumnValueSelector s) private final MultiValueHandling multiValueHandling; private final boolean hasBitmapIndexes; private final boolean hasSpatialIndexes; - private final int maxStringLength; + @Nullable + private final Integer maxStringLength; public StringDimensionHandler( String dimensionName, @@ -121,7 +123,7 @@ public StringDimensionHandler( MultiValueHandling multiValueHandling, boolean hasBitmapIndexes, boolean hasSpatialIndexes, - int maxStringLength + @Nullable Integer maxStringLength ) { this.dimensionName = dimensionName; diff --git a/processing/src/main/java/org/apache/druid/segment/StringDimensionIndexer.java b/processing/src/main/java/org/apache/druid/segment/StringDimensionIndexer.java index d41fe6fea980..88f60ee8042d 100644 --- a/processing/src/main/java/org/apache/druid/segment/StringDimensionIndexer.java +++ b/processing/src/main/java/org/apache/druid/segment/StringDimensionIndexer.java @@ -58,7 +58,8 @@ public class StringDimensionIndexer extends DictionaryEncodedColumnIndexer 0 && value != null && value.length() > maxStringLength) { + if (maxStringLength != null && value != null && value.length() > maxStringLength) { return 
value.substring(0, maxStringLength); } return value; diff --git a/processing/src/test/java/org/apache/druid/data/input/impl/StringDimensionSchemaTest.java b/processing/src/test/java/org/apache/druid/data/input/impl/StringDimensionSchemaTest.java index 3354ac8b82a1..dbee07bddb81 100644 --- a/processing/src/test/java/org/apache/druid/data/input/impl/StringDimensionSchemaTest.java +++ b/processing/src/test/java/org/apache/druid/data/input/impl/StringDimensionSchemaTest.java @@ -23,6 +23,7 @@ import com.fasterxml.jackson.databind.AnnotationIntrospector; import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.druid.data.input.impl.DimensionSchema.MultiValueHandling; +import org.apache.druid.error.DruidException; import org.apache.druid.guice.DruidSecondaryModule; import org.apache.druid.guice.GuiceAnnotationIntrospector; import org.junit.Assert; @@ -59,6 +60,16 @@ public void testDeserializeFromJson() throws JsonProcessingException + "}"; final StringDimensionSchema schema = (StringDimensionSchema) jsonMapper.readValue(json, DimensionSchema.class); Assert.assertEquals(new StringDimensionSchema("dim", MultiValueHandling.SORTED_SET, false), schema); - Assert.assertEquals(200, schema.getMaxStringLength()); + Assert.assertEquals(Integer.valueOf(200), schema.getMaxStringLength()); + } + + @Test + public void testInvalidMaxStringLength() + { + final Exception exception = Assert.assertThrows( + DruidException.class, + () -> new StringDimensionSchema("dim", null, true, -1) + ); + Assert.assertTrue(exception.getMessage().contains("maxStringLength for column [dim] must be >= 0")); } } diff --git a/processing/src/test/java/org/apache/druid/guice/BuiltInTypesModuleTest.java b/processing/src/test/java/org/apache/druid/guice/BuiltInTypesModuleTest.java index 189a8a2bdf38..4fdcad50e87c 100644 --- a/processing/src/test/java/org/apache/druid/guice/BuiltInTypesModuleTest.java +++ b/processing/src/test/java/org/apache/druid/guice/BuiltInTypesModuleTest.java @@ -60,7 +60,7 @@ 
public static void setup() public void teardownEach() { BuiltInTypesModule.setIndexSpecDefaults(IndexSpec.builder().build()); - BuiltInTypesModule.setMaxStringLength(0); + BuiltInTypesModule.setMaxStringLength(null); } @AfterClass @@ -75,7 +75,7 @@ public static void teardown() ); } BuiltInTypesModule.setIndexSpecDefaults(IndexSpec.builder().build()); - BuiltInTypesModule.setMaxStringLength(0); + BuiltInTypesModule.setMaxStringLength(null); } @Test @@ -98,7 +98,7 @@ public void testDefaults() BuiltInTypesModule.getStringMultiValueHandlingMode() ); - Assertions.assertEquals(0, BuiltInTypesModule.getMaxStringLength()); + Assertions.assertNull(BuiltInTypesModule.getMaxStringLength()); } @Test