If you re-index a Druid datasource using the Druid input source and explicitly list the columns in the dimensionsSpec, the re-index job still ingests all columns. To exclude a column, it must be added manually to the dimensionExclusions field of the dimensionsSpec.
This ingestion spec, used in the integration tests, reproduces the issue. The field "robot" is included in the re-indexed datasource even though it is not explicitly specified in the ingestion spec.
{
"type": "index",
"spec": {
"ioConfig": {
"type": "index",
"inputSource": {
"type": "druid",
"dataSource": "%%DATASOURCE%%",
"interval": "2013-08-31/2013-09-01"
}
},
"tuningConfig": {
"type": "index",
"partitionsSpec": {
"type": "dynamic"
}
},
"dataSchema": {
"dataSource": "%%REINDEX_DATASOURCE%%",
"granularitySpec": {
"type": "uniform",
"queryGranularity": "SECOND",
"segmentGranularity": "DAY"
},
"timestampSpec": {
"column": "__time",
"format": "iso"
},
"dimensionsSpec": {
"dimensions": [
"page",
{"type": "string", "name": "language", "createBitmapIndex": false},
"user",
"unpatrolled",
"newPage",
"anonymous",
"namespace",
"country",
"region",
"city"
]
},
"transformSpec": {
"transforms": [
{
"type": "expression",
"name": "newPage",
"expression": "page"
},
{
"type": "expression",
"name": "one-plus-triple-added",
"expression": "\"triple-added\" + 1"
},
{
"type": "expression",
"name": "double-deleted",
"expression": "deleted * 2"
}
]
},
"metricsSpec": [
{
"type": "doubleSum",
"name": "added",
"fieldName": "added"
},
{
"type": "doubleSum",
"name": "triple-added",
"fieldName": "triple-added"
},
{
"type": "doubleSum",
"name": "one-plus-triple-added",
"fieldName": "one-plus-triple-added"
},
{
"type": "doubleSum",
"name": "deleted",
"fieldName": "deleted"
},
{
"type": "doubleSum",
"name": "double-deleted",
"fieldName": "double-deleted"
},
{
"type": "doubleSum",
"name": "delta",
"fieldName": "delta"
}
]
}
}
}
Affected Version
0.18
Description
If you re-index a Druid datasource using the Druid input source and explicitly list the columns in the dimensionsSpec, the re-index job still ingests all columns. To exclude a column, it must be added manually to the dimensionExclusions field of the dimensionsSpec.
This ingestion spec, used in the integration tests, reproduces the issue. The field "robot" is included in the re-indexed datasource even though it is not explicitly specified in the ingestion spec.