Affected Version
0.13.0-incubating
Description
One of our clusters running on 0.13.0 is seeing data corruption issues with theta sketches. There are two datasources used in this test case: basefact and slice. slice is essentially the same as basefact except with fewer dimensions, so as to improve the rollup ratio.
Ingestion Spec for basefact datasource:
{
"type" : "index_hadoop",
"id" : "index_base",
"spec" : {
"dataSchema" : {
"dataSource" : "basefact",
"parser" : {
"type" : "avro_hadoop",
"parseSpec" : {
"format" : "avro",
"timestampSpec" : {
"column" : "date",
"format" : "yyyyMMdd"
},
"dimensionsSpec" : {
"dimensions" : [ "src_pty_id" ]
}
}
},
"metricsSpec" : [ {
"type" : "thetaSketch",
"name" : "test_sketch",
"fieldName" : "test_sketch",
"size" : 1048576,
"shouldFinalize" : true,
"isInputThetaSketch" : true,
"errorBoundsStdDev" : null
} ],
"granularitySpec" : {
"type" : "uniform",
"segmentGranularity" : "DAY",
"queryGranularity" : {
"type" : "none"
},
"rollup" : true,
"intervals" : [ "2019-05-22T00:00:00.000Z/2019-05-23T00:00:00.000Z" ]
},
"transformSpec" : {
"filter" : null,
"transforms" : [ ]
}
},
"ioConfig" : {
"type" : "hadoop",
"inputSpec" : {
"type" : "static",
"paths" : "///projects/indexData",
"inputFormat" : "org.apache.druid.data.input.avro.AvroValueInputFormat"
},
"metadataUpdateSpec" : null,
"segmentOutputPath" : null
},
"tuningConfig" : {
"type" : "hadoop",
"workingPath" : null,
"version" : "2019-05-23T08:33:19.990Z",
"partitionsSpec" : {
"type" : "hashed",
"targetPartitionSize" : 1050000,
"maxPartitionSize" : 1575000,
"assumeGrouped" : true,
"numShards" : -1,
"partitionDimensions" : [ ]
},
"shardSpecs" : { },
"indexSpec" : {
"bitmap" : {
"type" : "concise"
},
"dimensionCompression" : "lz4",
"metricCompression" : "lz4",
"longEncoding" : "longs"
},
"maxRowsInMemory" : 150000,
"maxBytesInMemory" : -1,
"leaveIntermediate" : false,
"cleanupOnFailure" : true,
"overwriteFiles" : false,
"ignoreInvalidRows" : false,
"jobProperties" : {
"fs.permissions.umask-mode" : "027"
},
"combineText" : false,
"useCombiner" : false,
"buildV9Directly" : true,
"numBackgroundPersistThreads" : 0,
"forceExtendableShardSpecs" : false,
"useExplicitVersion" : false,
"allowedHadoopPrefix" : [ ],
"logParseExceptions" : false,
"maxParseExceptions" : 0
}
},
"hadoopDependencyCoordinates" : null,
"classpathPrefix" : null,
"context" : { }
}
Ingestion spec for slice datasource:
{
"type" : "index_hadoop",
"id" : "index_slice",
"spec" : {
"dataSchema" : {
"dataSource" : "slice",
"parser" : {
"type" : "avro_hadoop",
"parseSpec" : {
"format" : "avro",
"timestampSpec" : {
"column" : "date",
"format" : "yyyyMMdd"
},
"dimensionsSpec" : {
"dimensions" : [ "src_pty_id" ]
}
}
},
"metricsSpec" : [ {
"type" : "thetaSketch",
"name" : "test_sketch",
"fieldName" : "test_sketch",
"size" : 131072,
"shouldFinalize" : true,
"isInputThetaSketch" : true,
"errorBoundsStdDev" : null
} ],
"granularitySpec" : {
"type" : "uniform",
"segmentGranularity" : "DAY",
"queryGranularity" : {
"type" : "none"
},
"rollup" : true,
"intervals" : [ "2019-05-22T00:00:00.000Z/2019-05-23T00:00:00.000Z" ]
},
"transformSpec" : {
"filter" : null,
"transforms" : [ ]
}
},
"ioConfig" : {
"type" : "hadoop",
"inputSpec" : {
"type" : "dataSource",
"ingestionSpec" : {
"dataSource" : "basefact",
"intervals" : [ "2019-05-22T00:00:00Z/P1D" ]
}
},
"metadataUpdateSpec" : null,
"segmentOutputPath" : null
},
"tuningConfig" : {
"type" : "hadoop",
"workingPath" : null,
"version" : "2019-05-23T08:54:27.227Z",
"partitionsSpec" : {
"type" : "hashed",
"targetPartitionSize" : 133000,
"maxPartitionSize" : 199500,
"assumeGrouped" : true,
"numShards" : -1,
"partitionDimensions" : [ ]
},
"shardSpecs" : { },
"indexSpec" : {
"bitmap" : {
"type" : "concise"
},
"dimensionCompression" : "lz4",
"metricCompression" : "lz4",
"longEncoding" : "longs"
},
"maxRowsInMemory" : 10000,
"maxBytesInMemory" : -1,
"leaveIntermediate" : false,
"cleanupOnFailure" : true,
"overwriteFiles" : false,
"ignoreInvalidRows" : false,
"jobProperties" : {
"fs.permissions.umask-mode" : "027"
},
"combineText" : false,
"useCombiner" : false,
"buildV9Directly" : true,
"numBackgroundPersistThreads" : 0,
"forceExtendableShardSpecs" : false,
"useExplicitVersion" : false,
"allowedHadoopPrefix" : [ ],
"logParseExceptions" : false,
"maxParseExceptions" : 0
}
},
"hadoopDependencyCoordinates" : null,
"classpathPrefix" : null,
"context" : { }
}
Querying the basefact datasource gives the result "test_sketch":43672556.4879819, while querying the slice datasource gives "test_sketch":43676771.06402646 (a difference of about 4,215, or roughly 0.01%).
Still investigating the issue, but has anyone observed similar behavior?
Affected Version
0.13.0-incubating
Description
One of our clusters running on 0.13.0 is seeing data corruption issues with theta sketches. There are two datasources used in this test case: basefact and slice. slice is essentially the same as basefact except with fewer dimensions, so as to improve the rollup ratio.
Ingestion Spec for basefact datasource:
Ingestion spec for slice datasource:
Querying the basefact datasource provides the result as:
"test_sketch":43672556.4879819 while the slice datasource results in "test_sketch":43676771.06402646
Still investigating the issue, but has anyone observed similar behavior?