From dfe9911c1ddf85bf08019e1c2cb3d59d8a8baa91 Mon Sep 17 00:00:00 2001 From: Maytas Monsereenusorn Date: Tue, 28 Jan 2020 16:15:45 -0800 Subject: [PATCH 1/2] add datasketch integration test --- .../queries/wikipedia_editstream_queries.json | 36 +++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/integration-tests/src/test/resources/queries/wikipedia_editstream_queries.json b/integration-tests/src/test/resources/queries/wikipedia_editstream_queries.json index 846fcd2ae35b..6ecdaa238849 100644 --- a/integration-tests/src/test/resources/queries/wikipedia_editstream_queries.json +++ b/integration-tests/src/test/resources/queries/wikipedia_editstream_queries.json @@ -27,6 +27,38 @@ } ] }, + { + "description": "timeseries, 1 datasketch agg, all", + "query": { + "queryType": "timeseries", + "dataSource": "wikipedia_editstream", + "intervals": ["2013-01-01T00:00:00.000/2013-01-08T00:00:00.000"], + "granularity": "all", + "aggregations": [ + { + "type": "HLLSketchBuild", + "name": "sketch", + "fieldName": "user", + "lgK": 12, + "tgtHllType": "HLL_4", + "round": true + } + ], + "context": { + "useCache": "true", + "populateCache": "true", + "timeout": 360000 + } + }, + "expectedResults": [ + { + "timestamp": "2013-01-01T00:00:00.000Z", + "result": { + "sketch": 216700 + } + } + ] + }, { "description": "timeseries, all aggs, all", "query": { @@ -1174,12 +1206,12 @@ { "dimension": "page", "value": "League_of_Legends", - "count":21 + "count":21 }, { "dimension": "page", "value": "The_best_ADs_in_The_League_of_legends", - "count":2 + "count":2 } ] } From b62551657802fcdd935566b9326fb02c83334c92 Mon Sep 17 00:00:00 2001 From: Maytas Monsereenusorn Date: Wed, 29 Jan 2020 14:47:16 -0800 Subject: [PATCH 2/2] added datasketch integration tests --- .../indexer/wikipedia_index_queries.json | 48 +++- .../indexer/wikipedia_index_task.json | 15 ++ .../queries/wikipedia_editstream_queries.json | 214 +++++++++++++++--- 3 files changed, 243 insertions(+), 34 deletions(-) diff --git a/integration-tests/src/test/resources/indexer/wikipedia_index_queries.json b/integration-tests/src/test/resources/indexer/wikipedia_index_queries.json index 9618ba9e9b6d..bf2a70b687ac 100644 --- a/integration-tests/src/test/resources/indexer/wikipedia_index_queries.json +++ b/integration-tests/src/test/resources/indexer/wikipedia_index_queries.json @@ -15,7 +15,53 @@ } ] }, - + { + "description": "timeseries, datasketch aggs, all", + "query":{ + "queryType" : "timeseries", + "dataSource": "%%DATASOURCE%%", + "granularity":"day", + "intervals":[ + "2013-08-31T00:00/2013-09-01T00:00" + ], + "filter":null, + "aggregations":[ + { + "type": "HLLSketchMerge", + "name": "approxCountHLL", + "fieldName": "HLLSketchBuild", + "lgK": 12, + "tgtHllType": "HLL_4", + "round": true + }, + { + "type":"thetaSketch", + "name":"approxCountTheta", + "fieldName":"thetaSketch", + "size":16384, + "shouldFinalize":true, + "isInputThetaSketch":false, + "errorBoundsStdDev":null + }, + { + "type":"quantilesDoublesSketch", + "name":"quantilesSketch", + "fieldName":"quantilesDoublesSketch", + "k":128 + } + ] + }, + "expectedResults":[ + { + "timestamp" : "2013-08-31T00:00:00.000Z", + "result" : { + "quantilesSketch":5, + "approxCountTheta":5.0, + "approxCountHLL":5 + } + } + ] + }, { "description":"having spec on post aggregation", "query":{ diff --git a/integration-tests/src/test/resources/indexer/wikipedia_index_task.json b/integration-tests/src/test/resources/indexer/wikipedia_index_task.json index 23532e55942e..c41bee228be1 100644 --- a/integration-tests/src/test/resources/indexer/wikipedia_index_task.json +++ b/integration-tests/src/test/resources/indexer/wikipedia_index_task.json @@ -22,6 +22,21 @@ "type": "doubleSum", "name": "delta", "fieldName": "delta" + }, + { + "name": "thetaSketch", + "type": "thetaSketch", + "fieldName": "user" + }, + { + "name": "quantilesDoublesSketch", + "type": "quantilesDoublesSketch", + "fieldName": "delta" + }, + { + "name": "HLLSketchBuild", + "type": "HLLSketchBuild", + "fieldName": "user" } ], "granularitySpec": { diff --git a/integration-tests/src/test/resources/queries/wikipedia_editstream_queries.json b/integration-tests/src/test/resources/queries/wikipedia_editstream_queries.json index 6ecdaa238849..2ed00e9bba4f 100644 --- a/integration-tests/src/test/resources/queries/wikipedia_editstream_queries.json +++ b/integration-tests/src/test/resources/queries/wikipedia_editstream_queries.json @@ -27,38 +27,6 @@ } ] }, - { - "description": "timeseries, 1 datasketch agg, all", - "query": { - "queryType": "timeseries", - "dataSource": "wikipedia_editstream", - "intervals": ["2013-01-01T00:00:00.000/2013-01-08T00:00:00.000"], - "granularity": "all", - "aggregations": [ - { - "type": "HLLSketchBuild", - "name": "sketch", - "fieldName": "user", - "lgK": 12, - "tgtHllType": "HLL_4", - "round": true - } - ], - "context": { - "useCache": "true", - "populateCache": "true", - "timeout": 360000 - } - }, - "expectedResults": [ - { - "timestamp": "2013-01-01T00:00:00.000Z", - "result": { - "sketch": 216700 - } - } - ] - }, { "description": "timeseries, all aggs, all", "query": { @@ -125,6 +93,29 @@ "type" : "longLast", "name" : "lastCount", "fieldName" : "count" + }, + { + "type":"HLLSketchBuild", + "name":"approxCountHLL", + "fieldName":"user", + "lgK":12, + "tgtHllType":"HLL_4", + "round":true + }, + { + "type":"thetaSketch", + "name":"approxCountTheta", + "fieldName":"user", + "size":16384, + "shouldFinalize":true, + "isInputThetaSketch":false, + "errorBoundsStdDev":null + }, + { + "type":"quantilesDoublesSketch", + "name":"quantilesDoublesSketch", + "fieldName":"user", + "k":2 } ], "context": { @@ -143,6 +134,9 @@ "lastAdded": 210.0, "firstCount": 1, "lastCount": 1, + "quantilesDoublesSketch":2390950, + "approxCountTheta":219483.4076460526, + "approxCountHLL":216700, "delta": 5.48967603E8, "variation": 1.274085073E9, "delta_hist": { @@ -250,6 +244,29 @@ "type" : "longLast", "name" : "lastCount", "fieldName" : "count" + }, + { + "type":"HLLSketchBuild", + "name":"approxCountHLL", + "fieldName":"user", + "lgK":12, + "tgtHllType":"HLL_4", + "round":true + }, + { + "type":"thetaSketch", + "name":"approxCountTheta", + "fieldName":"user", + "size":16384, + "shouldFinalize":true, + "isInputThetaSketch":false, + "errorBoundsStdDev":null + }, + { + "type":"quantilesDoublesSketch", + "name":"quantilesDoublesSketch", + "fieldName":"user", + "k":2 } ], "context": { @@ -268,6 +285,9 @@ "lastAdded": 210.0, "firstCount": 1, "lastCount": 1, + "quantilesDoublesSketch":1556534, + "approxCountTheta":157226.06680543753, + "approxCountHLL":158502, "delta": 2.24089868E8, "variation": 4.74698118E8, "delta_hist": { @@ -465,6 +485,29 @@ "type" : "longLast", "name" : "lastCount", "fieldName" : "count" + }, + { + "type":"HLLSketchBuild", + "name":"approxCountHLL", + "fieldName":"user", + "lgK":12, + "tgtHllType":"HLL_4", + "round":true + }, + { + "type":"thetaSketch", + "name":"approxCountTheta", + "fieldName":"user", + "size":16384, + "shouldFinalize":true, + "isInputThetaSketch":false, + "errorBoundsStdDev":null + }, + { + "type":"quantilesDoublesSketch", + "name":"quantilesDoublesSketch", + "fieldName":"user", + "k":2 } ], "dimension": "page", @@ -485,6 +528,9 @@ "count": 1697, "firstCount": 2, "lastCount": 3, + "quantilesDoublesSketch":990, + "approxCountTheta":330.0, + "approxCountHLL":330, "firstAdded": 462.0, "lastAdded": 1871.0, "page": "Wikipedia:Administrators'_noticeboard/Incidents", @@ -499,6 +545,9 @@ "count": 967, "firstCount": 1, "lastCount": 1, + "quantilesDoublesSketch":773, + "approxCountTheta":309.0, + "approxCountHLL":309, "firstAdded": 12.0, "lastAdded": 129.0, "page": "2013", @@ -513,6 +562,9 @@ "count": 1700, "firstCount": 1, "lastCount": 5, + "quantilesDoublesSketch":991, + "approxCountTheta":302.0, + "approxCountHLL":302, "firstAdded": 0.0, "lastAdded": 2399.0, "page": "Wikipedia:Vandalismusmeldung", @@ -612,6 +664,29 @@ "type" : "longLast", "name" : "lastCount", "fieldName" : "count" + }, + { + "type":"HLLSketchBuild", + "name":"approxCountHLL", + "fieldName":"user", + "lgK":12, + "tgtHllType":"HLL_4", + "round":true + }, + { + "type":"thetaSketch", + "name":"approxCountTheta", + "fieldName":"user", + "size":16384, + "shouldFinalize":true, + "isInputThetaSketch":false, + "errorBoundsStdDev":null + }, + { + "type":"quantilesDoublesSketch", + "name":"quantilesDoublesSketch", + "fieldName":"user", + "k":2 } ], "dimension": "page", @@ -634,6 +709,9 @@ "lastCount": 1, "firstAdded": 12.0, "lastAdded": 129.0, + "quantilesDoublesSketch":692, + "approxCountTheta":251.0, + "approxCountHLL":251, "page": "2013", "delta": 35313.0, "variation": 88165.0, @@ -648,6 +726,9 @@ "lastCount": 1, "firstAdded": 29.0, "lastAdded": 37.0, + "quantilesDoublesSketch":398, + "approxCountTheta":203.0, + "approxCountHLL":203, "page": "Gérard_Depardieu", "delta": 7027.0, "variation": 49549.0, @@ -662,6 +743,9 @@ "lastCount": 1, "firstAdded": 29.0, "lastAdded": 35.0, + "quantilesDoublesSketch":447, + "approxCountTheta":13.0, + "approxCountHLL":13, "page": "Zichyújfalu", "delta": 9030.0, "variation": 12872.0, @@ -734,6 +818,29 @@ "type" : "longLast", "name" : "lastCount", "fieldName" : "count" + }, + { + "type":"HLLSketchBuild", + "name":"approxCountHLL", + "fieldName":"user", + "lgK":12, + "tgtHllType":"HLL_4", + "round":true + }, + { + "type":"thetaSketch", + "name":"approxCountTheta", + "fieldName":"user", + "size":16384, + "shouldFinalize":true, + "isInputThetaSketch":false, + "errorBoundsStdDev":null + }, + { + "type":"quantilesDoublesSketch", + "name":"quantilesDoublesSketch", + "fieldName":"user", + "k":2 } ], "postAggregations": [ @@ -787,6 +894,9 @@ "lastCount": 9, "firstAdded": 1612.0, "lastAdded": 560.0, + "quantilesDoublesSketch":168, + "approxCountTheta":1.0, + "approxCountHLL":1, "page": "User:Cyde/List_of_candidates_for_speedy_deletion/Subpage", "delta": 670.0, "variation": 302148.0, @@ -802,6 +912,9 @@ "lastCount": 5, "firstAdded": 0.0, "lastAdded": 2399.0, + "quantilesDoublesSketch":991, + "approxCountTheta":302.0, + "approxCountHLL":302, "page": "Wikipedia:Vandalismusmeldung", "delta": -5446.0, "variation": 1043750.0, @@ -817,6 +930,9 @@ "lastCount": 3, "firstAdded": 462.0, "lastAdded": 1871.0, + "quantilesDoublesSketch":990, + "approxCountTheta":330.0, + "approxCountHLL":330, "page": "Wikipedia:Administrators'_noticeboard/Incidents", "delta": 770071.0, "variation": 2855849.0, @@ -1041,7 +1157,7 @@ ] }, { - "description": "groupBy, six aggs, namespace + robot dim, postAggs", + "description": "groupBy, nine aggs, namespace + robot dim, postAggs", "query": { "queryType": "groupBy", "dataSource": "wikipedia_editstream", @@ -1076,6 +1192,29 @@ "type" : "longLast", "name" : "lastCount", "fieldName" : "count" + }, + { + "type":"HLLSketchBuild", + "name":"approxCountHLL", + "fieldName":"user", + "lgK":12, + "tgtHllType":"HLL_4", + "round":true + }, + { + "type":"thetaSketch", + "name":"approxCountTheta", + "fieldName":"user", + "size":16384, + "shouldFinalize":true, + "isInputThetaSketch":false, + "errorBoundsStdDev":null + }, + { + "type":"quantilesDoublesSketch", + "name":"quantilesDoublesSketch", + "fieldName":"user", + "k":2 } ], "postAggregations": [ @@ -1116,6 +1255,9 @@ "event": { "sumOfRowsAndCount": 2268154.0, "count": 1286354, + "quantilesDoublesSketch":981800, + "approxCountTheta":196257.61632104203, + "approxCountHLL":194323, "firstCount": 1, "lastCount": 1, "firstAdded": 70.0, @@ -1131,6 +1273,9 @@ "event": { "sumOfRowsAndCount": 1385233.0, "count": 693711, + "quantilesDoublesSketch":691522, + "approxCountTheta":256.0, + "approxCountHLL":256, "firstCount": 1, "lastCount": 1, "firstAdded": 39.0, @@ -1146,6 +1291,9 @@ "event": { "sumOfRowsAndCount": 878393.0, "count": 492643, + "quantilesDoublesSketch":385750, + "approxCountTheta":48129.087284782676, + "approxCountHLL":47963, "firstCount": 2, "lastCount": 1, "firstAdded": 431.0,