Skip to content

horrible histogram post-aggregation  #6853

@quenlang

Description

@quenlang

Hello. @jon-wei
I am confused with perform a druid histogram query.
The request body of histogram query like this :

  "queryType": "timeseries",
  "dataSource": {
    "type": "table",
    "name": "BRS_PAGE_HOUR"
  },
  "intervals": {
    "type": "intervals",
    "intervals": [
      "2018-11-14T11:42:00/2019-01-14T11:42:00"
    ]
  },
  "descending": false,
  "virtualColumns": [],
  "filter": {
    "type": "and",
    "fields": [
      {
        "type": "selector",
        "dimension": "browser_application_id",
        "value": "2",
        "extractionFn": null
      },
      {
        "type": "selector",
        "dimension": "host",
        "value": "10.111.243.10:7001",
        "extractionFn": null
      },
      {
        "type": "selector",
        "dimension": "uri",
        "value": "/PMS/framework/popWindow.html",
        "extractionFn": null
      },
      {
        "type": "bound",
        "dimension": "count",
        "lower": "0",
        "upper": null,
        "lowerStrict": true,
        "upperStrict": false,
        "extractionFn": null,
        "ordering": {
          "type": "lexicographic"
        }
      }
    ]
  },
  "granularity": {
    "type": "all"
  },
  "aggregations": [
    {
      "type": "longSum",
      "name": "count_total",
      "fieldName": "count",
      "expression": null
    },
    {
      "type": "approxHistogramFold",
      "name": "first_paint_time_histogram_total",
      "fieldName": "first_paint_time_histogram",
      "resolution": 50,
      "numBuckets": 9,
      "lowerLimit": 0,
      "upperLimit": "Infinity"
    }
  ],
  "postAggregations": [
    {
      "type": "customBuckets",
      "name": "performance_histogram",
      "fieldName": "first_paint_time_histogram_total",
      "breaks": [
        0,
        200,
        400,
        600,
        800,
        1000,
        1200,
        1400,
        "Infinity"
      ]
    },
    { "type" : "min", "name" : "min_value", "fieldName" : "first_paint_time_histogram_total"},
    { "type" : "max", "name" : "max_value", "fieldName" : "first_paint_time_histogram_total"}
  ],
  "context": {
    "skipEmptyBuckets": "true"
  }
}

The first query result like this :

[ {
  "timestamp" : "2018-12-14T02:00:00.000Z",
  "result" : {
    "performance_histogram" : {
      "breaks" : [ 0.0, 200.0, 400.0, 600.0, 800.0, 1000.0, 1200.0, 1400.0, "Infinity" ],
      "counts" : [ 339.6748046875, 96.61546325683594, 70.65382385253906, 44.69218063354492, 18.730541229248047, 0.9215729236602783, 0.43618109822273254, 9.275420188903809 ]
    },
    "min_value" : 0.0,
    "count_total" : 581,
    "first_paint_time_histogram_total" : {
      "breaks" : [ -4443.75, 0.0, 4443.75, 8887.5, 13331.25, 17775.0, 22218.75, 26662.5, 31106.25, 35550.0 ],
      "counts" : [ 0.0, 576.6718139648438, 1.3282045125961304, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0 ]
    },
    "max_value" : 35550.0
  }
} ]

The second query result like this :

[ {
  "timestamp" : "2018-12-14T02:00:00.000Z",
  "result" : {
    "performance_histogram" : {
      "breaks" : [ 0.0, 200.0, 400.0, 600.0, 800.0, 1000.0, 1200.0, 1400.0, "Infinity" ],
      "counts" : [ 550.0, 4.094977378845215, 6.259572505950928, 2.6454498767852783, 1.0, 1.0, 0.0, 9.0 ]
    },
    "min_value" : 0.0,
    "count_total" : 581,
    "first_paint_time_histogram_total" : {
      "breaks" : [ -4443.75, 0.0, 4443.75, 8887.5, 13331.25, 17775.0, 22218.75, 26662.5, 31106.25, 35550.0 ],
      "counts" : [ 7.0, 570.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0 ]
    },
    "max_value" : 35550.0
  } 
} ]

Then i increase the resolution to 500 and the result always like this :

[ {
  "timestamp" : "2018-12-14T02:00:00.000Z",
  "result" : {
    "performance_histogram" : {
      "breaks" : [ 0.0, 200.0, 400.0, 600.0, 800.0, 1000.0, 1200.0, 1400.0, "Infinity" ],
      "counts" : [ 318.0, 79.0, 14.0, 2.0, 1.0, 1.0, 0.0, 9.0 ]
    },
    "min_value" : 0.0,
    "count_total" : 581,
    "first_paint_time_histogram_total" : {
      "breaks" : [ -4443.75, 0.0, 4443.75, 8887.5, 13331.25, 17775.0, 22218.75, 26662.5, 31106.25, 35550.0 ],
      "counts" : [ 157.0, 420.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0 ]
    },
    "max_value" : 35550.0
  }
} ]

Even though i have known results can vary for the same query from docs, but i am confused why the first bucket of first_paint_time_histogram_total was [negative number, 0] and its count also>0. The min value of the column was 0. And the [0.0, 200.0] bucket which is the first bucket of custom buckets looks like not include count of the first bucket of first_paint_time_histogram_total.

My ingestion spce file define the histogram column like this:

            ...
            {
                "name": "first_paint_time_histogram",
                "fieldName": "first_paint_time",
                "type": "approxHistogramFold",
                "resolution": 50,
                "numBuckets": 7,
                "lowerLimit": 0
            },
            ...

And the data of the column like this:

mysql> select first_paint_time from BRS_PAGE_HOUR where __time>='2019-01-13 00:00:00' and browser_application_id='9' limit 100;
+------------------+
| first_paint_time |
+------------------+
|              325 |
|              147 |
|              653 |
|              189 |
|              147 |
|              422 |
|              413 |
|               17 |
|              238 |
|              423 |
|              150 |
|              449 |
|              149 |
|              345 |
|             1661 |
|             2600 |
|              427 |
|              209 |
|             1622 |
|                0 |
|                0 |
|               12 |
|              153 |
|              144 |
|              572 |
|              112 |
|               48 |
|             2831 |
|              493 |
|              577 |
|              164 |
|              360 |
|              234 |
|              368 |
|              408 |
|              108 |
|              241 |
|             2903 |
|             2923 |
|             1921 |
|              343 |
|            12363 |
|              616 |
|              356 |
|              564 |
|              266 |
|              309 |
|              418 |
|              284 |
|              247 |
|             1858 |
|             1488 |
|               15 |
|             1710 |
|              185 |
|              128 |
|              267 |
|              379 |
|              526 |
|              121 |
|              490 |
|              540 |
|              132 |
|              251 |
|              119 |
|              505 |
|              200 |
|             3715 |
|              347 |
|               33 |
|               18 |
|             1140 |
|              371 |
|              232 |
|              300 |
|              304 |
|              189 |
|              373 |
|              297 |
|              302 |
|              257 |
|              401 |
|             2058 |
|              389 |
|              432 |
|              359 |
|              822 |
|              409 |
|             2992 |
|             1045 |
|              386 |
|             1469 |
|             6715 |
|             1046 |
|              389 |
|             1052 |
|              196 |
|             1050 |
|              390 |
|              454 |
+------------------+
100 rows in set (0.07 sec)

Do you have any advices ? Thank you so much.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions