From 569ee6aedf34d958841b599d3ba9151feafa2176 Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Tue, 30 Dec 2025 13:07:47 -0800 Subject: [PATCH] Coerce types in RowBasedColumnSelectorFactory. This patch adds type coercion to RowBasedColumnSelectorFactory, which allows it to better handle cases where the underlying data does not match the expected type. It also adds numeric accessor methods (isNull, getFloat, getDouble, getLong) to dimension selectors, similar to the ones that exist for nested and auto-typed columns. --- .../msq/exec/MSQParseExceptionsTest.java | 31 ++----- .../src/test/resources/not-json.txt | 3 + .../unparseable-mv-string-array.json | 3 - .../RowBasedColumnSelectorFactory.java | 92 ++++++++++++++++++- .../query/IterableRowsCursorHelperTest.java | 6 +- .../scan/ScanQueryResultOrderingTest.java | 2 +- .../server/ClientQuerySegmentWalkerTest.java | 16 ++-- 7 files changed, 113 insertions(+), 40 deletions(-) create mode 100644 multi-stage-query/src/test/resources/not-json.txt delete mode 100644 multi-stage-query/src/test/resources/unparseable-mv-string-array.json diff --git a/multi-stage-query/src/test/java/org/apache/druid/msq/exec/MSQParseExceptionsTest.java b/multi-stage-query/src/test/java/org/apache/druid/msq/exec/MSQParseExceptionsTest.java index c46d63864151..eacb9e0ac3f2 100644 --- a/multi-stage-query/src/test/java/org/apache/druid/msq/exec/MSQParseExceptionsTest.java +++ b/multi-stage-query/src/test/java/org/apache/druid/msq/exec/MSQParseExceptionsTest.java @@ -30,13 +30,11 @@ import org.apache.druid.msq.indexing.destination.DataSourceMSQDestination; import org.apache.druid.msq.indexing.error.CannotParseExternalDataFault; import org.apache.druid.msq.indexing.error.InvalidNullByteFault; -import org.apache.druid.msq.querykit.scan.ExternalColumnSelectorFactory; import org.apache.druid.msq.test.MSQTestBase; import org.apache.druid.msq.util.MultiStageQueryContext; import org.apache.druid.query.dimension.DefaultDimensionSpec; import org.apache.druid.query.groupby.GroupByQuery; import org.apache.druid.query.scan.ScanQuery; -import org.apache.druid.segment.SimpleAscendingOffset; import org.apache.druid.segment.column.ColumnType; import org.apache.druid.segment.column.RowSignature; import org.apache.druid.sql.calcite.external.ExternalDataSource; @@ -316,9 +314,9 @@ public void testIngestWithSanitizedNullByteUsingContextParameter() throws IOExce } @Test - public void testMultiValueStringWithIncorrectType() throws IOException + public void testCannotParseJson() throws IOException { - final File toRead = getResourceAsTemporaryFile("/unparseable-mv-string-array.json"); + final File toRead = getResourceAsTemporaryFile("/not-json.txt"); final String toReadAsJson = queryFramework().queryJsonMapper().writeValueAsString(toRead.getAbsolutePath()); RowSignature rowSignature = RowSignature.builder() @@ -337,25 +335,20 @@ public void testMultiValueStringWithIncorrectType() throws IOException testSelectQuery() .setSql("WITH\n" - + "kttm_data AS (\n" + + "ext AS (\n" + "SELECT * FROM TABLE(\n" + " EXTERN(\n" + " '{ \"files\": [" + toReadAsJson + "],\"type\":\"local\"}',\n" + " '{\"type\":\"json\"}',\n" - + " '[{\"name\":\"timestamp\",\"type\":\"string\"},{\"name\":\"agent_category\",\"type\":\"string\"},{\"name\":\"agent_type\",\"type\":\"string\"},{\"name\":\"browser\",\"type\":\"string\"},{\"name\":\"browser_version\",\"type\":\"string\"},{\"name\":\"city\",\"type\":\"string\"},{\"name\":\"continent\",\"type\":\"string\"},{\"name\":\"country\",\"type\":\"string\"},{\"name\":\"version\",\"type\":\"string\"},{\"name\":\"event_type\",\"type\":\"string\"},{\"name\":\"event_subtype\",\"type\":\"string\"},{\"name\":\"loaded_image\",\"type\":\"string\"},{\"name\":\"adblock_list\",\"type\":\"string\"},{\"name\":\"forwarded_for\",\"type\":\"string\"},{\"name\":\"language\",\"type\":\"string\"},{\"name\":\"number\",\"type\":\"long\"},{\"name\":\"os\",\"type\":\"string\"},{\"name\":\"path\",\"type\":\"string\"},{\"name\":\"platform\",\"type\":\"string\"},{\"name\":\"referrer\",\"type\":\"string\"},{\"name\":\"referrer_host\",\"type\":\"string\"},{\"name\":\"region\",\"type\":\"string\"},{\"name\":\"remote_address\",\"type\":\"string\"},{\"name\":\"screen\",\"type\":\"string\"},{\"name\":\"session\",\"type\":\"string\"},{\"name\":\"session_length\",\"type\":\"long\"},{\"name\":\"timezone\",\"type\":\"string\"},{\"name\":\"timezone_offset\",\"type\":\"long\"},{\"name\":\"window\",\"type\":\"string\"}]'\n" + + " '[{\"name\":\"timestamp\",\"type\":\"string\"},{\"name\":\"thisRow\",\"type\":\"string\"}]'\n" + " )\n" + "))\n" + "\n" + "SELECT\n" - + " FLOOR(TIME_PARSE(\"timestamp\") TO MINUTE) AS __time,\n" - + " MV_TO_ARRAY(\"language\") AS \"language\"\n" - + "FROM kttm_data") + + " TIME_PARSE(\"timestamp\") AS __time,\n" + + " thisRow\n" + + "FROM ext") .setExpectedRowSignature(rowSignature) - .setExpectedResultRows(ImmutableList.of( - new Object[]{1566691200000L, ImmutableList.of("en")}, - new Object[]{1566691200000L, ImmutableList.of("en", "es", "es-419", "es-MX")}, - new Object[]{1566691200000L, ImmutableList.of("en", "es", "es-419", "es-US")} - )) .setExpectedMSQSpec( LegacyMSQSpec .builder() @@ -370,14 +363,8 @@ public void testMultiValueStringWithIncorrectType() throws IOException .build()) .setExpectedMSQFault( new CannotParseExternalDataFault( - ExternalColumnSelectorFactory - .createException( - new Exception("dummy"), - "v1", - new LocalInputSource(null, null, ImmutableList.of(toRead), SystemFields.none()), - new SimpleAscendingOffset(Integer.MAX_VALUE) - ) - .getMessage() + "Unable to parse row [this row is not json] " + + "(Path: file:" + toRead.getAbsolutePath() + ", Record: 3, Line: 3)" ) ) .setQueryContext(DEFAULT_MSQ_CONTEXT) diff --git a/multi-stage-query/src/test/resources/not-json.txt b/multi-stage-query/src/test/resources/not-json.txt new file mode 100644 index 000000000000..bcab0172c5b2 --- /dev/null +++ b/multi-stage-query/src/test/resources/not-json.txt @@ -0,0 +1,3 @@ +{"timestamp":"2000-01-01","thisRow":"isJson"} +{"timestamp":"2000-01-01","thisRow":"isAlsoJson"} +this row is not json diff --git a/multi-stage-query/src/test/resources/unparseable-mv-string-array.json b/multi-stage-query/src/test/resources/unparseable-mv-string-array.json deleted file mode 100644 index 57b9a7709b09..000000000000 --- a/multi-stage-query/src/test/resources/unparseable-mv-string-array.json +++ /dev/null @@ -1,3 +0,0 @@ -{"timestamp":"2019-08-25T00:00:00.031Z","agent_category":"Personal computer","agent_type":"Browser","browser":"Chrome","browser_version":"76.0.3809.100","city":"Rosario","continent":"South America","country":"Argentina","version":"1.9.6","event_type":"PercentClear","event_subtype":"55","loaded_image":"http://www.koalastothemax.com/img/koalas2.jpg","adblock_list":"NoAdblock","forwarded_for":"181.13.41.82","language":[{},{}],"number":"16","os":"Windows 7","path":"http://www.koalastothemax.com/","platform":"Windows","referrer":"Direct","referrer_host":"Direct","region":"Santa Fe","remote_address":"172.31.57.89","screen":"1680x1050","session":"S56194838","session_length":76261,"timezone":"N/A","timezone_offset":"180","window":"1680x939"} -{"timestamp":"2019-08-25T00:00:00.059Z","agent_category":"Smartphone","agent_type":"Mobile Browser","browser":"Chrome Mobile","browser_version":"50.0.2661.89","city":"Nuevo Casas Grandes","continent":"North America","country":"Mexico","version":"1.9.6","event_type":"PercentClear","event_subtype":"85","loaded_image":"https://koalastothemax.com/img/koalas1.jpg","adblock_list":"NoAdblock","forwarded_for":"177.242.100.0","language":["en","es","es-419","es-MX"],"number":"24","os":"Android","path":"https://koalastothemax.com/","platform":"Android","referrer":"https://www.google.com/","referrer_host":"www.google.com","region":"Chihuahua","remote_address":"172.31.11.5","screen":"320x570","session":"S46093731","session_length":252689,"timezone":"CDT","timezone_offset":"300","window":"540x743"} -{"timestamp":"2019-08-25T00:00:00.178Z","agent_category":"Personal computer","agent_type":"Browser","browser":"Chrome","browser_version":"76.0.3809.100","city":"Luis Guillon","continent":"South America","country":"Argentina","version":"1.9.6","event_type":"PercentClear","event_subtype":"90","loaded_image":"http://www.koalastothemax.com/img/koalas.jpg","adblock_list":"NoAdblock","forwarded_for":"181.46.136.44","language":["en","es","es-419","es-US"],"number":"24","os":"Windows 7","path":"http://www.koalastothemax.com/","platform":"Windows","referrer":"Direct","referrer_host":"Direct","region":"Buenos Aires","remote_address":"172.31.11.5","screen":"1366x768","session":"S13352079","session_length":1753602,"timezone":"N/A","timezone_offset":"180","window":"1366x652"} diff --git a/processing/src/main/java/org/apache/druid/segment/RowBasedColumnSelectorFactory.java b/processing/src/main/java/org/apache/druid/segment/RowBasedColumnSelectorFactory.java index eb1f79937eae..8db55d1bd83a 100644 --- a/processing/src/main/java/org/apache/druid/segment/RowBasedColumnSelectorFactory.java +++ b/processing/src/main/java/org/apache/druid/segment/RowBasedColumnSelectorFactory.java @@ -20,7 +20,15 @@ package org.apache.druid.segment; import com.google.common.base.Preconditions; +import com.google.common.collect.Iterables; +import com.google.common.primitives.Doubles; +import org.apache.druid.common.guava.GuavaUtils; import org.apache.druid.data.input.Rows; +import org.apache.druid.java.util.common.parsers.ParseException; +import org.apache.druid.math.expr.ExprEval; +import org.apache.druid.math.expr.ExprType; +import org.apache.druid.math.expr.ExpressionType; +import org.apache.druid.query.dimension.DefaultDimensionSpec; import org.apache.druid.query.dimension.DimensionSpec; import org.apache.druid.query.extraction.ExtractionFn; import org.apache.druid.query.filter.DruidObjectPredicate; @@ -75,8 +83,7 @@ public RowBasedColumnSelectorFactory( this.rowSupplier = rowSupplier; this.rowIdSupplier = rowIdSupplier; this.adapter = adapter; - this.columnInspector = - Preconditions.checkNotNull(columnInspector, "columnInspector must be nonnull"); + this.columnInspector = Preconditions.checkNotNull(columnInspector, "columnInspector must be nonnull"); this.throwParseExceptions = throwParseExceptions; this.useStringValueOfNullInLists = useStringValueOfNullInLists; } @@ -336,6 +343,45 @@ public Class classOfObject() return Object.class; } + @Override + public boolean isNull() + { + updateCurrentValues(); + return DimensionHandlerUtils.isNumericNull(getObject()); + } + + @Override + public float getFloat() + { + updateCurrentValues(); + return (float) getDouble(); + } + + @Override + public double getDouble() + { + updateCurrentValues(); + + // Below is safe since isNull() returned true. + final String str = Iterables.getOnlyElement(dimensionValues); + return Doubles.tryParse(str); + } + + @Override + public long getLong() + { + updateCurrentValues(); + + // Below is safe since isNull() returned true. + final String str = Iterables.getOnlyElement(dimensionValues); + final Long n = GuavaUtils.tryParseLong(str); + if (n != null) { + return n; + } else { + return (long) getDouble(); + } + } + @Override public void inspectRuntimeShape(RuntimeShapeInspector inspector) { @@ -412,6 +458,8 @@ private void updateCurrentValues() @Override public ColumnValueSelector makeColumnValueSelector(String columnName) { + final ExpressionType expressionType = columnInspector.getType(columnName); + if (columnName.equals(ColumnHolder.TIME_COLUMN_NAME)) { final ToLongFunction timestampFunction = adapter.timestampFunction(); @@ -437,6 +485,8 @@ public void inspectRuntimeShape(RuntimeShapeInspector inspector) } } return new TimeLongColumnSelector(); + } else if (expressionType != null && expressionType.is(ExprType.STRING)) { + return makeDimensionSelector(DefaultDimensionSpec.of(columnName)); } else { final Function columnFunction = adapter.columnFunction(columnName); final ColumnCapabilities capabilities = columnInspector.getColumnCapabilities(columnName); @@ -488,7 +538,33 @@ public long getLong() public Object getObject() { updateCurrentValue(); - return currentValue; + + if (expressionType != null && !expressionType.is(ExprType.COMPLEX)) { + try { + final Object val = ExprEval.bestEffortOf(currentValue).castTo(expressionType).value(); + if (val != null && expressionType.is(ExprType.DOUBLE) && numberType == ValueType.FLOAT) { + // Adjustment for FLOAT. Expressions don't speak float, so we need to cast it ourselves. + return ((Number) val).floatValue(); + } else { + return val; + } + } + catch (Exception e) { + if (throwParseExceptions) { + throw new ParseException( + String.valueOf(currentValue), + "Error reading column[%s] as type[%s]", + columnName, + expressionType + ); + } else { + // if !throwParseExceptions, return the original uncasted value and hope for the best. + return currentValue; + } + } + } else { + return currentValue; + } } @Override @@ -557,4 +633,14 @@ public ColumnCapabilities getColumnCapabilities(String columnName) { return getColumnCapabilities(columnInspector, columnName); } + + /** + * Determines whether the provided object should be coerced using the provided type. Generally this is true, + * except for STRING type with List objects. This allows multi-value strings to be passed through without being + * coereced by the expression engine, which would turn arrays into nulls. + */ + private static boolean shouldCoerce(@Nullable final Object obj, @Nullable final ExpressionType expressionType) + { + return obj != null && expressionType != null && !(expressionType.is(ExprType.STRING) && obj instanceof List); + } } diff --git a/processing/src/test/java/org/apache/druid/query/IterableRowsCursorHelperTest.java b/processing/src/test/java/org/apache/druid/query/IterableRowsCursorHelperTest.java index 7628c3289dd1..6eac24e7f04e 100644 --- a/processing/src/test/java/org/apache/druid/query/IterableRowsCursorHelperTest.java +++ b/processing/src/test/java/org/apache/druid/query/IterableRowsCursorHelperTest.java @@ -36,9 +36,9 @@ public class IterableRowsCursorHelperTest { List rows = ImmutableList.of( - new Object[]{1, "a"}, - new Object[]{3, "b"}, - new Object[]{2, "b"} + new Object[]{1L, "a"}, + new Object[]{3L, "b"}, + new Object[]{2L, "b"} ); RowSignature rowSignature = RowSignature.builder() diff --git a/processing/src/test/java/org/apache/druid/query/scan/ScanQueryResultOrderingTest.java b/processing/src/test/java/org/apache/druid/query/scan/ScanQueryResultOrderingTest.java index 7ff0d26773ec..7004dec6565b 100644 --- a/processing/src/test/java/org/apache/druid/query/scan/ScanQueryResultOrderingTest.java +++ b/processing/src/test/java/org/apache/druid/query/scan/ScanQueryResultOrderingTest.java @@ -396,6 +396,6 @@ private List runQuery(final ScanQuery query, final QueryRunner (int) row[1]).boxed().collect(Collectors.toList()); + return results.stream().mapToInt(row -> ((Number) row[1]).intValue()).boxed().collect(Collectors.toList()); } } diff --git a/server/src/test/java/org/apache/druid/server/ClientQuerySegmentWalkerTest.java b/server/src/test/java/org/apache/druid/server/ClientQuerySegmentWalkerTest.java index 021227a3ca36..19fb82b4fcf1 100644 --- a/server/src/test/java/org/apache/druid/server/ClientQuerySegmentWalkerTest.java +++ b/server/src/test/java/org/apache/druid/server/ClientQuerySegmentWalkerTest.java @@ -717,10 +717,10 @@ public void testGroupByOnScanMultiValue() query.withDataSource( InlineDataSource.fromIterable( ImmutableList.of( - new Object[]{ImmutableList.of("a", "b"), 1}, - new Object[]{ImmutableList.of("a", "c"), 2}, - new Object[]{ImmutableList.of("b"), 3}, - new Object[]{ImmutableList.of("c"), 4} + new Object[]{List.of("a", "b"), 1L}, + new Object[]{List.of("a", "c"), 2L}, + new Object[]{"b", 3L}, + new Object[]{"c", 4L} ), RowSignature.builder().add("s", null).add("n", null).build() ) @@ -767,10 +767,10 @@ public void testTopNScanMultiValue() query.withDataSource( InlineDataSource.fromIterable( ImmutableList.of( - new Object[]{ImmutableList.of("a", "b"), 1}, - new Object[]{ImmutableList.of("a", "c"), 2}, - new Object[]{ImmutableList.of("b"), 3}, - new Object[]{ImmutableList.of("c"), 4} + new Object[]{List.of("a", "b"), 1L}, + new Object[]{List.of("a", "c"), 2L}, + new Object[]{"b", 3L}, + new Object[]{"c", 4L} ), RowSignature.builder().add("s", null).add("n", null).build() )