Skip to content

Indexing tasks containing thetaSketches results in incorrect sketch values #7741

@a2l007

Description

@a2l007

Affected Version

0.13.0-incubating

Description

One of our clusters running 0.13.0 is seeing data corruption issues with thetaSketches. Two datasources are used in this test case: basefact and slice. slice is essentially the same as basefact, except that it has fewer dimensions in order to improve the rollup ratio.
Ingestion spec for the basefact datasource:

{
  "type" : "index_hadoop",
  "id" : "index_base",
  "spec" : {
    "dataSchema" : {
      "dataSource" : "basefact",
      "parser" : {
        "type" : "avro_hadoop",
        "parseSpec" : {
          "format" : "avro",
          "timestampSpec" : {
            "column" : "date",
            "format" : "yyyyMMdd"
          },
          "dimensionsSpec" : {
            "dimensions" : [ "src_pty_id" ]
          }
        }
      },
      "metricsSpec" : [ {
        "type" : "thetaSketch",
        "name" : "test_sketch",
        "fieldName" : "test_sketch",
        "size" : 1048576,
        "shouldFinalize" : true,
        "isInputThetaSketch" : true,
        "errorBoundsStdDev" : null
      } ],
      "granularitySpec" : {
        "type" : "uniform",
        "segmentGranularity" : "DAY",
        "queryGranularity" : {
          "type" : "none"
        },
        "rollup" : true,
        "intervals" : [ "2019-05-22T00:00:00.000Z/2019-05-23T00:00:00.000Z" ]
      },
      "transformSpec" : {
        "filter" : null,
        "transforms" : [ ]
      }
    },
    "ioConfig" : {
      "type" : "hadoop",
      "inputSpec" : {
        "type" : "static",
        "paths" : "///projects/indexData",
        "inputFormat" : "org.apache.druid.data.input.avro.AvroValueInputFormat"
      },
      "metadataUpdateSpec" : null,
      "segmentOutputPath" : null
    },
    "tuningConfig" : {
      "type" : "hadoop",
      "workingPath" : null,
      "version" : "2019-05-23T08:33:19.990Z",
      "partitionsSpec" : {
        "type" : "hashed",
        "targetPartitionSize" : 1050000,
        "maxPartitionSize" : 1575000,
        "assumeGrouped" : true,
        "numShards" : -1,
        "partitionDimensions" : [ ]
      },
      "shardSpecs" : { },
      "indexSpec" : {
        "bitmap" : {
          "type" : "concise"
        },
        "dimensionCompression" : "lz4",
        "metricCompression" : "lz4",
        "longEncoding" : "longs"
      },
      "maxRowsInMemory" : 150000,
      "maxBytesInMemory" : -1,
      "leaveIntermediate" : false,
      "cleanupOnFailure" : true,
      "overwriteFiles" : false,
      "ignoreInvalidRows" : false,
      "jobProperties" : {
        "fs.permissions.umask-mode" : "027"
      },
      "combineText" : false,
      "useCombiner" : false,
      "buildV9Directly" : true,
      "numBackgroundPersistThreads" : 0,
      "forceExtendableShardSpecs" : false,
      "useExplicitVersion" : false,
      "allowedHadoopPrefix" : [ ],
      "logParseExceptions" : false,
      "maxParseExceptions" : 0
    }
  },
  "hadoopDependencyCoordinates" : null,
  "classpathPrefix" : null,
  "context" : { }
}

Ingestion spec for slice datasource:

{
  "type" : "index_hadoop",
  "id" : "index_slice",
  "spec" : {
    "dataSchema" : {
      "dataSource" : "slice",
      "parser" : {
        "type" : "avro_hadoop",
        "parseSpec" : {
          "format" : "avro",
          "timestampSpec" : {
            "column" : "date",
            "format" : "yyyyMMdd"
          },
          "dimensionsSpec" : {
            "dimensions" : [ "src_pty_id" ]
          }
        }
      },
      "metricsSpec" : [ {
      
        "type" : "thetaSketch",
        "name" : "test_sketch",
        "fieldName" : "test_sketch",
        "size" : 131072,
        "shouldFinalize" : true,
        "isInputThetaSketch" : true,
        "errorBoundsStdDev" : null
      } ],
      "granularitySpec" : {
        "type" : "uniform",
        "segmentGranularity" : "DAY",
        "queryGranularity" : {
          "type" : "none"
        },
        "rollup" : true,
        "intervals" : [ "2019-05-22T00:00:00.000Z/2019-05-23T00:00:00.000Z" ]
      },
      "transformSpec" : {
        "filter" : null,
        "transforms" : [ ]
      }
    },
    "ioConfig" : {
      "type" : "hadoop",
      "inputSpec" : {
        "type" : "dataSource",
        "ingestionSpec" : {
          "dataSource" : "basefact",
          "intervals" : [ "2019-05-22T00:00:00Z/P1D" ]
        }
      },
      "metadataUpdateSpec" : null,
      "segmentOutputPath" : null
    },
    "tuningConfig" : {
      "type" : "hadoop",
      "workingPath" : null,
      "version" : "2019-05-23T08:54:27.227Z",
      "partitionsSpec" : {
        "type" : "hashed",
        "targetPartitionSize" : 133000,
        "maxPartitionSize" : 199500,
        "assumeGrouped" : true,
        "numShards" : -1,
        "partitionDimensions" : [ ]
      },
      "shardSpecs" : { },
      "indexSpec" : {
        "bitmap" : {
          "type" : "concise"
        },
        "dimensionCompression" : "lz4",
        "metricCompression" : "lz4",
        "longEncoding" : "longs"
      },
      "maxRowsInMemory" : 10000,
      "maxBytesInMemory" : -1,
      "leaveIntermediate" : false,
      "cleanupOnFailure" : true,
      "overwriteFiles" : false,
      "ignoreInvalidRows" : false,
      "jobProperties" : {
        "fs.permissions.umask-mode" : "027"
      },
      "combineText" : false,
      "useCombiner" : false,
      "buildV9Directly" : true,
      "numBackgroundPersistThreads" : 0,
      "forceExtendableShardSpecs" : false,
      "useExplicitVersion" : false,
      "allowedHadoopPrefix" : [ ],
      "logParseExceptions" : false,
      "maxParseExceptions" : 0
    }
  },
  "hadoopDependencyCoordinates" : null,
  "classpathPrefix" : null,
  "context" : { }
}

Querying the basefact datasource returns "test_sketch":43672556.4879819, while querying the slice datasource (which is re-ingested from basefact) returns "test_sketch":43676771.06402646.

Still investigating the issue, but has anyone observed similar behavior?

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions