Merged
29 changes: 16 additions & 13 deletions docs/querying/segmentmetadataquery.md
@@ -30,15 +30,18 @@ sidebar_label: "SegmentMetadata"

Segment metadata queries return per-segment information about:

* Cardinality of all columns in the segment
* Min/max values of string type columns in the segment
* Estimated byte size for the segment columns if they were stored in a flat format
* Number of rows stored inside the segment
* Interval the segment covers
* Column type of all the columns in the segment
* Estimated total segment byte size if it was stored in a flat format
* Is the segment rolled up
* Estimated total segment byte size if it was stored in a 'flat format' (e.g. a csv file)
* Segment id
* Is the segment rolled up
* Detailed per column information such as:
- type
- cardinality
- min/max values
- presence of null values
- estimated 'flat format' byte size


```json
{
@@ -68,10 +68,10 @@ The format of the result is:
"id" : "some_id",
"intervals" : [ "2013-05-13T00:00:00.000Z/2013-05-14T00:00:00.000Z" ],
"columns" : {
"__time" : { "type" : "LONG", "hasMultipleValues" : false, "size" : 407240380, "cardinality" : null, "errorMessage" : null },
"dim1" : { "type" : "STRING", "hasMultipleValues" : false, "size" : 100000, "cardinality" : 1944, "errorMessage" : null },
"dim2" : { "type" : "STRING", "hasMultipleValues" : true, "size" : 100000, "cardinality" : 1504, "errorMessage" : null },
"metric1" : { "type" : "FLOAT", "hasMultipleValues" : false, "size" : 100000, "cardinality" : null, "errorMessage" : null }
"__time" : { "type" : "LONG", "hasMultipleValues" : false, "hasNulls": false, "size" : 407240380, "cardinality" : null, "errorMessage" : null },
"dim1" : { "type" : "STRING", "hasMultipleValues" : false, "hasNulls": false, "size" : 100000, "cardinality" : 1944, "errorMessage" : null },
"dim2" : { "type" : "STRING", "hasMultipleValues" : true, "hasNulls": true, "size" : 100000, "cardinality" : 1504, "errorMessage" : null },
"metric1" : { "type" : "FLOAT", "hasMultipleValues" : false, "hasNulls": false, "size" : 100000, "cardinality" : null, "errorMessage" : null }
},
"aggregators" : {
"metric1" : { "type" : "longSum", "name" : "metric1", "fieldName" : "metric1" }
@@ -84,14 +87,14 @@ The format of the result is:
} ]
```
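A segmentMetadata query can be issued with a plain HTTP POST to a Broker. Below is a minimal Python sketch under stated assumptions: the Broker address and datasource name are illustrative, and only documented query fields (`queryType`, `dataSource`, `intervals`, `analysisTypes`) are used.

```python
import json
import urllib.request

# Hypothetical Broker endpoint; adjust host/port for your deployment.
BROKER_URL = "http://localhost:8082/druid/v2"

def build_segment_metadata_query(datasource, intervals=None, analysis_types=None):
    """Build a segmentMetadata query body as a plain dict."""
    query = {"queryType": "segmentMetadata", "dataSource": datasource}
    if intervals is not None:
        query["intervals"] = intervals
    if analysis_types is not None:
        query["analysisTypes"] = analysis_types
    return query

query = build_segment_metadata_query(
    "wikipedia",  # illustrative datasource name
    intervals=["2013-01-01/2014-01-01"],
    analysis_types=["cardinality", "minmax", "size"],
)
payload = json.dumps(query).encode()

# To actually run it (requires a live Broker), something like:
# req = urllib.request.Request(
#     BROKER_URL, payload, {"Content-Type": "application/json"})
# result = json.load(urllib.request.urlopen(req))
```

Omitting `analysisTypes` asks the Broker for its default set of analyses; restricting it (as above) keeps the query cheap on large datasources.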

Dimension columns will have type `STRING`.
Metric columns will have type `FLOAT` or `LONG` or name of the underlying complex type such as `hyperUnique` in case of COMPLEX metric.
Dimension columns will have type `STRING`, `FLOAT`, `DOUBLE`, or `LONG`.
Metric columns will have type `FLOAT`, `DOUBLE`, or `LONG`, or the name of the underlying complex type, such as `hyperUnique`, in the case of a COMPLEX metric.
The timestamp column will have type `LONG`.

If the `errorMessage` field is non-null, you should not trust the other fields in the response. Their contents are
undefined.

Only columns which are dimensions (i.e., have type `STRING`) will have any cardinality. The rest of the columns (timestamp and metric columns) will show cardinality as `null`.
Only columns which are dictionary encoded (i.e., have type `STRING`) will have any cardinality. The rest of the columns (timestamp and metric columns) will show cardinality as `null`.
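These response conventions — trust nothing when `errorMessage` is non-null, and expect a cardinality only for dictionary-encoded columns — are easy to enforce on the consumer side. A small Python sketch over a trimmed response in the shape shown above (sample values only, not real data):

```python
import json

# A trimmed segmentMetadata response in the documented shape.
response = json.loads("""
[{
  "id": "some_id",
  "columns": {
    "__time": {"type": "LONG",   "hasNulls": false, "cardinality": null, "errorMessage": null},
    "dim1":   {"type": "STRING", "hasNulls": false, "cardinality": 1944, "errorMessage": null},
    "dim2":   {"type": "STRING", "hasNulls": true,  "cardinality": 1504, "errorMessage": null}
  }
}]
""")

def usable_columns(segment):
    """Yield (name, info) pairs, skipping columns whose analysis failed."""
    for name, info in segment["columns"].items():
        # A non-null errorMessage means the other fields are undefined.
        if info.get("errorMessage") is None:
            yield name, info

for name, info in usable_columns(response[0]):
    # Only dictionary-encoded (STRING) columns report a cardinality.
    if info["type"] == "STRING":
        assert info["cardinality"] is not None
    else:
        assert info["cardinality"] is None
```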

## intervals

2 changes: 1 addition & 1 deletion integration-tests/docker/druid.sh
Original file line number Diff line number Diff line change
@@ -90,7 +90,7 @@ setupData()
# below s3 credentials needed to access the pre-existing s3 bucket
setKey $DRUID_SERVICE druid.s3.accessKey AKIAJI7DG7CDECGBQ6NA
setKey $DRUID_SERVICE druid.s3.secretKey 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
if [[ "$DRUID_INTEGRATION_TEST_GROUP" = "query-retry" ]]; then
if [ "$DRUID_INTEGRATION_TEST_GROUP" = "query-retry" ]; then
setKey $DRUID_SERVICE druid.extensions.loadList [\"druid-s3-extensions\",\"druid-integration-tests\"]
else
setKey $DRUID_SERVICE druid.extensions.loadList [\"druid-s3-extensions\"]
@@ -592,66 +592,69 @@
},
"expectedResults": [
{
"id": "twitterstream_2013-01-01T00:00:00.000Z_2013-01-02T00:00:00.000Z_2013-01-02T04:13:41.980Z_v9",
"intervals": ["2013-01-01T00:00:00.000Z/2013-01-02T00:00:00.000Z"],
"columns": {
"has_links": {
"type": "STRING",
"hasMultipleValues": false,
"size": 0,
"cardinality": 2,
"id":"twitterstream_2013-01-01T00:00:00.000Z_2013-01-02T00:00:00.000Z_2013-01-02T04:13:41.980Z_v9",
"intervals":["2013-01-01T00:00:00.000Z/2013-01-02T00:00:00.000Z"],
"columns":{
"has_links":{
"type":"STRING",
"hasMultipleValues":false,
"size":0,
"cardinality":2,
"minValue":"No",
"maxValue":"Yes",
"errorMessage": null
"errorMessage":null,
"hasNulls":false
}
},
"size": 0,
"numRows": 3702583,
"aggregators": null,
"timestampSpec": null,
"queryGranularity": null,
"size":0,
"numRows":3702583,
"aggregators":null,
"timestampSpec":null,
"queryGranularity":null,
"rollup":null
},
{
"id": "twitterstream_2013-01-02T00:00:00.000Z_2013-01-03T00:00:00.000Z_2013-01-03T03:44:58.791Z_v9",
"intervals": ["2013-01-02T00:00:00.000Z/2013-01-03T00:00:00.000Z"],
"columns": {
"has_links": {
"type": "STRING",
"hasMultipleValues": false,
"size": 0,
"cardinality": 2,
"id":"twitterstream_2013-01-02T00:00:00.000Z_2013-01-03T00:00:00.000Z_2013-01-03T03:44:58.791Z_v9",
"intervals":["2013-01-02T00:00:00.000Z/2013-01-03T00:00:00.000Z"],
"columns":{
"has_links":{
"type":"STRING",
"hasMultipleValues":false,
"size":0,
"cardinality":2,
"minValue":"No",
"maxValue":"Yes",
"errorMessage": null
"errorMessage":null,
"hasNulls":false
}
},
"size": 0,
"numRows": 3743002,
"aggregators": null,
"timestampSpec": null,
"queryGranularity": null,
"size":0,
"numRows":3743002,
"aggregators":null,
"timestampSpec":null,
"queryGranularity":null,
"rollup":null
},
{
"id": "twitterstream_2013-01-03T00:00:00.000Z_2013-01-04T00:00:00.000Z_2013-01-04T04:09:13.590Z_v9",
"intervals": ["2013-01-03T00:00:00.000Z/2013-01-04T00:00:00.000Z"],
"columns": {
"has_links": {
"type": "STRING",
"hasMultipleValues": false,
"size": 0,
"cardinality": 2,
"id":"twitterstream_2013-01-03T00:00:00.000Z_2013-01-04T00:00:00.000Z_2013-01-04T04:09:13.590Z_v9",
"intervals":["2013-01-03T00:00:00.000Z/2013-01-04T00:00:00.000Z"],
"columns":{
"has_links":{
"type":"STRING",
"hasMultipleValues":false,
"size":0,
"cardinality":2,
"minValue":"No",
"maxValue":"Yes",
"errorMessage": null
"errorMessage":null,
"hasNulls":false
}
},
"size": 0,
"size":0,
"numRows":3502959,
"aggregators": null,
"timestampSpec": null,
"queryGranularity": null,
"aggregators":null,
"timestampSpec":null,
"queryGranularity":null,
"rollup":null
}
]
@@ -1398,33 +1398,35 @@
},
"expectedResults": [
{
"id": "wikipedia_editstream_2012-12-29T00:00:00.000Z_2013-01-10T08:00:00.000Z_2013-01-10T08:13:47.830Z_v9",
"intervals": ["2012-12-29T00:00:00.000Z/2013-01-10T08:00:00.000Z"],
"columns": {
"country_name": {
"type": "STRING",
"hasMultipleValues": false,
"size": 0,
"cardinality": 208,
"id":"wikipedia_editstream_2012-12-29T00:00:00.000Z_2013-01-10T08:00:00.000Z_2013-01-10T08:13:47.830Z_v9",
"intervals":["2012-12-29T00:00:00.000Z/2013-01-10T08:00:00.000Z"],
"columns":{
"country_name":{
"type":"STRING",
"hasMultipleValues":false,
"size":0,
"cardinality":208,
"minValue":"",
"maxValue":"mmx._unknown",
"errorMessage": null
"errorMessage":null,
"hasNulls":true
},
"language": {
"type": "STRING",
"hasMultipleValues": false,
"size": 0,
"cardinality": 36,
"language":{
"type":"STRING",
"hasMultipleValues":false,
"size":0,
"cardinality":36,
"minValue":"ar",
"maxValue":"zh",
"errorMessage": null
"errorMessage":null,
"hasNulls":false
}
},
"size": 0,
"numRows": 4462111,
"aggregators": null,
"timestampSpec": null,
"queryGranularity": null,
"size":0,
"numRows":4462111,
"aggregators":null,
"timestampSpec":null,
"queryGranularity":null,
"rollup":null
}
]
@@ -100,6 +100,11 @@ public Map<String, ColumnAnalysis> analyze(Segment segment)

Map<String, ColumnAnalysis> columns = new TreeMap<>();

Function<String, ColumnCapabilities> adapterCapabilitesFn =
Contributor:

I realize you just copied this code from somewhere else, but is there a way we can do this without instanceof? Maybe a new method on StorageAdapter with some nice javadocs? This code is pretty brittle otherwise.

Member Author:

eh, I think I introduced the ugly instanceof in the first place in a previous PR to fix a different bug. I have a plan to not need this, since it is only necessary for the incremental index adapter, so I hope this code is rather temporary, but the changes are too big to be part of this patch.

I would rather not introduce a new method to storage adapter specifically for incremental index adapter, and would prefer to address it when I eliminate the need for it, if that is cool with you. If you disagree and feel strongly about it though, I can introduce a temporary method for it now, and just remove it in the follow-up.

storageAdapter instanceof IncrementalIndexStorageAdapter
? ((IncrementalIndexStorageAdapter) storageAdapter)::getSnapshotColumnCapabilities
: storageAdapter::getColumnCapabilities;

for (String columnName : columnNames) {
final ColumnHolder columnHolder = index == null ? null : index.getColumnHolder(columnName);
final ColumnCapabilities capabilities;
@@ -108,11 +113,7 @@ public Map<String, ColumnAnalysis> analyze(Segment segment)
} else {
// this can be removed if we get to the point where IncrementalIndexStorageAdapter.getColumnCapabilities
// accurately reports the capabilities
if (storageAdapter instanceof IncrementalIndexStorageAdapter) {
capabilities = ((IncrementalIndexStorageAdapter) storageAdapter).getSnapshotColumnCapabilities(columnName);
} else {
capabilities = storageAdapter.getColumnCapabilities(columnName);
}
capabilities = adapterCapabilitesFn.apply(columnName);
}

final ColumnAnalysis analysis;
@@ -146,7 +147,7 @@ public Map<String, ColumnAnalysis> analyze(Segment segment)
}

// Add time column too
ColumnCapabilities timeCapabilities = storageAdapter.getColumnCapabilities(ColumnHolder.TIME_COLUMN_NAME);
ColumnCapabilities timeCapabilities = adapterCapabilitesFn.apply(ColumnHolder.TIME_COLUMN_NAME);
if (timeCapabilities == null) {
timeCapabilities = ColumnCapabilitiesImpl.createSimpleNumericColumnCapabilities(ValueType.LONG);
}
@@ -192,6 +193,7 @@ private ColumnAnalysis analyzeNumericColumn(
return new ColumnAnalysis(
capabilities.getType().name(),
capabilities.hasMultipleValues().isTrue(),
capabilities.hasNulls().isMaybeTrue(), // if we don't know for sure, then we should plan to check for nulls
Contributor:

This comment doesn't make a lot of sense in the context of SegmentAnalyzer. It shouldn't "know" what the fields are being used for, so it shouldn't "know" they're going to be used to check for nulls somewhere higher up the chain.

Maybe a better thing to do would be:

  1. Define "hasNulls" as meaning "definitely has nulls, or can't determine". This should be in the docs for the segmentMetadata query too.
  2. Now this comment could read: "If we don't know for sure, return true to adhere to the definition of hasNulls"
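The tri-state reading proposed here ("definitely has nulls, or can't determine") can be sketched in a few lines. The names below mirror the spirit of Druid's capabilities flags but are illustrative, not the actual API:

```python
from enum import Enum

class Capable(Enum):
    """Tri-state capability flag: true, false, or not determinable."""
    TRUE = "true"
    FALSE = "false"
    UNKNOWN = "unknown"

    def is_true(self):
        # Strict check: only a definite yes counts.
        return self is Capable.TRUE

    def is_maybe_true(self):
        # "Definitely true, or can't determine" -- everything except a
        # definite no. This matches the proposed definition of hasNulls.
        return self is not Capable.FALSE

# Reporting hasNulls via is_maybe_true() errs toward true when unknown,
# so downstream consumers plan for nulls unless they are provably absent.
assert Capable.TRUE.is_maybe_true()
assert Capable.UNKNOWN.is_maybe_true()
assert not Capable.FALSE.is_maybe_true()
```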

size,
null,
null,
@@ -242,6 +244,7 @@ private ColumnAnalysis analyzeStringColumn(
return new ColumnAnalysis(
capabilities.getType().name(),
capabilities.hasMultipleValues().isTrue(),
capabilities.hasNulls().isMaybeTrue(), // if we don't know for sure, then we should plan to check for nulls
size,
analyzingCardinality() ? cardinality : 0,
min,
Expand Down Expand Up @@ -319,6 +322,7 @@ public Long accumulate(Long accumulated, Cursor cursor)
return new ColumnAnalysis(
capabilities.getType().name(),
capabilities.hasMultipleValues().isTrue(),
capabilities.hasNulls().isMaybeTrue(), // if we don't know for sure, then we should plan to check for nulls
size,
cardinality,
min,
@@ -335,6 +339,7 @@ private ColumnAnalysis analyzeComplexColumn(
{
try (final ComplexColumn complexColumn = columnHolder != null ? (ComplexColumn) columnHolder.getColumn() : null) {
final boolean hasMultipleValues = capabilities != null && capabilities.hasMultipleValues().isTrue();
final boolean hasNulls = capabilities != null && capabilities.hasNulls().isMaybeTrue();
long size = 0;

if (analyzingSize() && complexColumn != null) {
Expand All @@ -345,7 +350,7 @@ private ColumnAnalysis analyzeComplexColumn(

final Function<Object, Long> inputSizeFn = serde.inputSizeFn();
if (inputSizeFn == null) {
return new ColumnAnalysis(typeName, hasMultipleValues, 0, null, null, null, null);
return new ColumnAnalysis(typeName, hasMultipleValues, hasNulls, 0, null, null, null, null);
}

final int length = complexColumn.getLength();
@@ -357,6 +362,7 @@
return new ColumnAnalysis(
typeName,
hasMultipleValues,
hasNulls,
size,
null,
null,