From 72207058135b38e885e9859cf9bd0f097c832a9c Mon Sep 17 00:00:00 2001 From: Zac Blanco Date: Sat, 27 Apr 2024 13:18:54 -0400 Subject: [PATCH] Add rough memory size tracking with KllItemsSketch When generating KllSketches, systems may need to know roughly how much memory a particular sketch is using. For sketches with fixed-width types the answer can be computed efficiently. With the KllItemsSketch, this is more difficult because the sketch can support variable-width types such as String. This commit adds implementation support to expose the `getTotalItemsNumBytes` method so that external systems can roughly track the memory utilization of a particular sketch. The change accomplishes this by intercepting the code where a new item is added to the items array, or where a new array is generated entirely. This adds a slight overhead because the sketch now needs to compute the size of each input. For fixed-width types the overhead is low. For strings, this requires a call to encode the string as UTF-8/16 before adding it to the array. For fixed-width types, the calculations have little effective overhead as the computation is a single array-access lookup + multiplication with the type width.
--- .../common/ArrayOfBooleansSerDe.java | 6 ++ .../common/ArrayOfDoublesSerDe.java | 6 ++ .../common/ArrayOfItemsSerDe.java | 5 + .../common/ArrayOfLongsSerDe.java | 6 ++ .../common/ArrayOfNumbersSerDe.java | 6 ++ .../common/ArrayOfStringsSerDe.java | 6 ++ .../common/ArrayOfUtf16StringsSerDe.java | 6 ++ .../kll/KllDirectCompactItemsSketch.java | 6 ++ .../datasketches/kll/KllDoublesSketch.java | 2 +- .../datasketches/kll/KllFloatsSketch.java | 2 +- .../datasketches/kll/KllHeapItemsSketch.java | 26 +++++ .../datasketches/kll/KllItemsSketch.java | 13 ++- .../apache/datasketches/kll/KllSketch.java | 5 +- .../datasketches/kll/KllItemsSketchTest.java | 1 - .../kll/KllTotalItemsBytesTest.java | 99 +++++++++++++++++++ 15 files changed, 186 insertions(+), 9 deletions(-) create mode 100644 src/test/java/org/apache/datasketches/kll/KllTotalItemsBytesTest.java diff --git a/src/main/java/org/apache/datasketches/common/ArrayOfBooleansSerDe.java b/src/main/java/org/apache/datasketches/common/ArrayOfBooleansSerDe.java index d4e2cc5d6..e970b47f8 100644 --- a/src/main/java/org/apache/datasketches/common/ArrayOfBooleansSerDe.java +++ b/src/main/java/org/apache/datasketches/common/ArrayOfBooleansSerDe.java @@ -122,4 +122,10 @@ public String toString(final Boolean item) { @Override public Class getClassOfT() { return Boolean.class; } + + @Override + public boolean isFixedWidth() + { + return true; + } } diff --git a/src/main/java/org/apache/datasketches/common/ArrayOfDoublesSerDe.java b/src/main/java/org/apache/datasketches/common/ArrayOfDoublesSerDe.java index d07ecc838..7456fb1e6 100644 --- a/src/main/java/org/apache/datasketches/common/ArrayOfDoublesSerDe.java +++ b/src/main/java/org/apache/datasketches/common/ArrayOfDoublesSerDe.java @@ -101,4 +101,10 @@ public String toString(final Double item) { @Override public Class getClassOfT() { return Double.class; } + + @Override + public boolean isFixedWidth() + { + return true; + } } diff --git 
a/src/main/java/org/apache/datasketches/common/ArrayOfItemsSerDe.java b/src/main/java/org/apache/datasketches/common/ArrayOfItemsSerDe.java index c9438b616..b30a28a7a 100644 --- a/src/main/java/org/apache/datasketches/common/ArrayOfItemsSerDe.java +++ b/src/main/java/org/apache/datasketches/common/ArrayOfItemsSerDe.java @@ -114,4 +114,9 @@ public int sizeOf(final T[] items) { * @return the concrete class of type T */ public abstract Class getClassOfT(); + + /** + * @return if this class serializes all types to a fixed width. + */ + public abstract boolean isFixedWidth(); } diff --git a/src/main/java/org/apache/datasketches/common/ArrayOfLongsSerDe.java b/src/main/java/org/apache/datasketches/common/ArrayOfLongsSerDe.java index 4ef1231ad..7b4c022b2 100644 --- a/src/main/java/org/apache/datasketches/common/ArrayOfLongsSerDe.java +++ b/src/main/java/org/apache/datasketches/common/ArrayOfLongsSerDe.java @@ -100,4 +100,10 @@ public String toString(final Long item) { @Override public Class getClassOfT() { return Long.class; } + + @Override + public boolean isFixedWidth() + { + return true; + } } diff --git a/src/main/java/org/apache/datasketches/common/ArrayOfNumbersSerDe.java b/src/main/java/org/apache/datasketches/common/ArrayOfNumbersSerDe.java index de6346578..de8203520 100644 --- a/src/main/java/org/apache/datasketches/common/ArrayOfNumbersSerDe.java +++ b/src/main/java/org/apache/datasketches/common/ArrayOfNumbersSerDe.java @@ -240,4 +240,10 @@ public String toString(final Number item) { @Override public Class getClassOfT() { return Number.class; } + + @Override + public boolean isFixedWidth() + { + return false; + } } diff --git a/src/main/java/org/apache/datasketches/common/ArrayOfStringsSerDe.java b/src/main/java/org/apache/datasketches/common/ArrayOfStringsSerDe.java index 1afba2a3c..5addbc879 100644 --- a/src/main/java/org/apache/datasketches/common/ArrayOfStringsSerDe.java +++ b/src/main/java/org/apache/datasketches/common/ArrayOfStringsSerDe.java @@ -130,4 
+130,10 @@ public String toString(final String item) { @Override public Class getClassOfT() { return String.class; } + + @Override + public boolean isFixedWidth() + { + return false; + } } diff --git a/src/main/java/org/apache/datasketches/common/ArrayOfUtf16StringsSerDe.java b/src/main/java/org/apache/datasketches/common/ArrayOfUtf16StringsSerDe.java index 2ed1b316e..2c854de4f 100644 --- a/src/main/java/org/apache/datasketches/common/ArrayOfUtf16StringsSerDe.java +++ b/src/main/java/org/apache/datasketches/common/ArrayOfUtf16StringsSerDe.java @@ -124,4 +124,10 @@ public String toString(final String item) { @Override public Class getClassOfT() { return String.class; } + + @Override + public boolean isFixedWidth() + { + return false; + } } diff --git a/src/main/java/org/apache/datasketches/kll/KllDirectCompactItemsSketch.java b/src/main/java/org/apache/datasketches/kll/KllDirectCompactItemsSketch.java index 05324d28f..8cf5740e9 100644 --- a/src/main/java/org/apache/datasketches/kll/KllDirectCompactItemsSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllDirectCompactItemsSketch.java @@ -233,6 +233,12 @@ T[] getTotalItemsArray() { return capItems; } + @Override + int getTotalItemsNumBytesInternal() + { + return getRetainedItemsSizeBytes(); + } + @Override WritableMemory getWritableMemory() { return (WritableMemory)mem; diff --git a/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java b/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java index 59cda435f..1757c208c 100644 --- a/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java @@ -514,7 +514,7 @@ int getSingleItemSizeBytes() { abstract byte[] getTotalItemsByteArr(); @Override - int getTotalItemsNumBytes() { + public int getTotalItemsNumBytes() { return levelsArr[getNumLevels()] * Double.BYTES; } diff --git a/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java 
b/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java index 48781631d..00110aca1 100644 --- a/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java @@ -514,7 +514,7 @@ int getSingleItemSizeBytes() { abstract byte[] getTotalItemsByteArr(); @Override - int getTotalItemsNumBytes() { + public int getTotalItemsNumBytes() { return levelsArr[getNumLevels()] * Float.BYTES; } diff --git a/src/main/java/org/apache/datasketches/kll/KllHeapItemsSketch.java b/src/main/java/org/apache/datasketches/kll/KllHeapItemsSketch.java index e58516fd3..23102ae94 100644 --- a/src/main/java/org/apache/datasketches/kll/KllHeapItemsSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllHeapItemsSketch.java @@ -53,6 +53,10 @@ final class KllHeapItemsSketch extends KllItemsSketch { private T minItem; private T maxItem; private Object[] itemsArr; + /** + * Roughly tracks the total stored size of items in-memory within the sketch. + */ + private int totalItemsSize = 0; /** * New instance heap constructor. 
@@ -131,6 +135,7 @@ final class KllHeapItemsSketch extends KllItemsSketch { this.minItem = item; this.maxItem = item; itemsArr[k - 1] = item; + totalItemsSize = serDe.sizeOf(item); } else if (memStruct == COMPACT_FULL) { int offset = DATA_START_ADR + memVal.numLevels * Integer.BYTES; this.minItem = serDe.deserializeFromMemory(srcMem, offset, 1)[0]; @@ -140,6 +145,7 @@ final class KllHeapItemsSketch extends KllItemsSketch { final int numRetained = levelsArr[memVal.numLevels] - levelsArr[0]; final Object[] retItems = serDe.deserializeFromMemory(srcMem, offset, numRetained); System.arraycopy(retItems, 0, itemsArr, levelsArr[0], numRetained); + totalItemsSize = serDe.sizeOf((T[]) retItems); } else { //memStruct == UPDATABLE throw new SketchesArgumentException(UNSUPPORTED_MSG + "UPDATABLE"); } @@ -260,6 +266,12 @@ T[] getTotalItemsArray() { return outArr; } + @Override + int getTotalItemsNumBytesInternal() + { + return totalItemsSize; + } + @Override WritableMemory getWritableMemory() { return null; @@ -290,12 +302,26 @@ void setN(final long n) { @Override void setItemsArray(final Object[] itemsArr) { + int sumItemSize = 0; + for (Object o : itemsArr) { + if (o == null) { + continue; + } + sumItemSize += serDe.sizeOf((T) o); + } + this.totalItemsSize = sumItemSize; this.itemsArr = itemsArr; } @Override void setItemsArrayAt(final int index, final Object item) { + if (itemsArr[index] != null) { + this.totalItemsSize -= serDe.sizeOf((T) itemsArr[index]); + } this.itemsArr[index] = item; + if (item != null) { + this.totalItemsSize += serDe.sizeOf((T) itemsArr[index]); + } } @Override diff --git a/src/main/java/org/apache/datasketches/kll/KllItemsSketch.java b/src/main/java/org/apache/datasketches/kll/KllItemsSketch.java index 6fb9772fb..54727f41e 100644 --- a/src/main/java/org/apache/datasketches/kll/KllItemsSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllItemsSketch.java @@ -353,8 +353,6 @@ MemoryRequestServer getMemoryRequestServer() { @Override abstract 
int getRetainedItemsSizeBytes(); - //abstract Object[] getRetainedItemsArray(); - @Override ArrayOfItemsSerDe getSerDe() { return serDe; } @@ -378,10 +376,17 @@ byte[] getTotalItemsByteArr() { } @Override - int getTotalItemsNumBytes() { - throw new SketchesArgumentException(UNSUPPORTED_MSG); + public int getTotalItemsNumBytes() { + // if empty, exception is thrown. + if (serDe.isFixedWidth() && !isEmpty()) { + return levelsArr[getNumLevels()] * serDe.sizeOf(getMinItem()); + } else { + return getTotalItemsNumBytesInternal(); + } } + abstract int getTotalItemsNumBytesInternal(); + @Override void incNumLevels() { //this is not used and must be a no-op. diff --git a/src/main/java/org/apache/datasketches/kll/KllSketch.java b/src/main/java/org/apache/datasketches/kll/KllSketch.java index c398ed8ce..4889f6eda 100644 --- a/src/main/java/org/apache/datasketches/kll/KllSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllSketch.java @@ -426,10 +426,11 @@ final int getNumLevels() { /** * Gets the size in bytes of the entire internal items hypothetical structure. * It does not include the preamble, the levels array, or minimum or maximum items. - * It may include empty or free space. + * It may include empty or free space. This value is useful to estimate + * the in-memory usage of the sketch. * @return the size of the retained data in bytes.
*/ - abstract int getTotalItemsNumBytes(); + public abstract int getTotalItemsNumBytes(); /** * This returns the WritableMemory for Direct type sketches, diff --git a/src/test/java/org/apache/datasketches/kll/KllItemsSketchTest.java b/src/test/java/org/apache/datasketches/kll/KllItemsSketchTest.java index 00028e341..17b4e9e42 100644 --- a/src/test/java/org/apache/datasketches/kll/KllItemsSketchTest.java +++ b/src/test/java/org/apache/datasketches/kll/KllItemsSketchTest.java @@ -716,7 +716,6 @@ public void checkWrapCausingLevelsCompaction() { public void checkExceptions() { final KllItemsSketch sk = KllItemsSketch.newHeapInstance(20, Comparator.naturalOrder(), serDe); try { sk.getTotalItemsByteArr(); fail(); } catch (SketchesArgumentException e) { } - try { sk.getTotalItemsNumBytes(); fail(); } catch (SketchesArgumentException e) { } try { sk.setWritableMemory(null); fail(); } catch (SketchesArgumentException e) { } byte[] byteArr = sk.toByteArray(); final KllItemsSketch sk2 = KllItemsSketch.wrap(Memory.wrap(byteArr), Comparator.naturalOrder(), serDe); diff --git a/src/test/java/org/apache/datasketches/kll/KllTotalItemsBytesTest.java b/src/test/java/org/apache/datasketches/kll/KllTotalItemsBytesTest.java new file mode 100644 index 000000000..f7f984952 --- /dev/null +++ b/src/test/java/org/apache/datasketches/kll/KllTotalItemsBytesTest.java @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.kll; + +import org.apache.datasketches.common.*; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.function.BiConsumer; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertTrue; + +public class KllTotalItemsBytesTest { + + @DataProvider(name = "emptySketchesItems") + public Object[][] emptySketches() { + return new Object[][]{ + new Object[]{KllItemsSketch.newHeapInstance(Double::compareTo, new ArrayOfDoublesSerDe())}, + new Object[]{KllItemsSketch.newHeapInstance(Long::compareTo, new ArrayOfLongsSerDe())}, + new Object[]{KllItemsSketch.newHeapInstance(String::compareTo, new ArrayOfStringsSerDe())}, + new Object[]{KllItemsSketch.newHeapInstance(String::compareTo, new ArrayOfUtf16StringsSerDe())}, + new Object[]{KllItemsSketch.newHeapInstance(Boolean::compareTo, new ArrayOfBooleansSerDe())}, + }; + } + + @Test(dataProvider = "emptySketchesItems") + public void testEmptySketches(KllSketch sketch) { + assertEquals(sketch.getTotalItemsNumBytes(), 0); + } + + @DataProvider(name = "emptySketchesNative") + public Object[][] emptySketchesNative() { + return new Object[][]{ + new Object[]{KllFloatsSketch.newHeapInstance(), Float.BYTES}, + new Object[]{KllDoublesSketch.newHeapInstance(), Double.BYTES}, + }; + } + + @Test(dataProvider = "emptySketchesNative") + public void testEmptySketchesNative(KllSketch sketch, int singleItemSize) { + assertEquals(sketch.getTotalItemsNumBytes(), singleItemSize * sketch.getK()); + } + + + 
@SuppressWarnings({"rawtypes", "unchecked"}) + @DataProvider(name = "sketchSingleItem") + public Object[][] sketchSingleItem() { + return new Object[][]{ + new Object[]{KllItemsSketch.newHeapInstance(Double::compareTo, new ArrayOfDoublesSerDe()), 1.0d, (BiConsumer) KllItemsSketch::update}, + new Object[]{KllItemsSketch.newHeapInstance(Long::compareTo, new ArrayOfLongsSerDe()), 1L, (BiConsumer) KllItemsSketch::update}, + new Object[]{KllItemsSketch.newHeapInstance(Boolean::compareTo, new ArrayOfBooleansSerDe()), true, (BiConsumer) KllItemsSketch::update}, + new Object[]{KllItemsSketch.newHeapInstance(String::compareTo, new ArrayOfStringsSerDe()), "1", (BiConsumer) KllItemsSketch::update}, + new Object[]{KllItemsSketch.newHeapInstance(String::compareTo, new ArrayOfUtf16StringsSerDe()), "1", (BiConsumer) KllItemsSketch::update}, + }; + } + + @SuppressWarnings("unchecked") + @Test(dataProvider = "sketchSingleItem") + public void testSingleItemIncreases(KllSketch sketch, Object item, BiConsumer update) { + assertEquals(sketch.getTotalItemsNumBytes(), 0); + assertEquals(sketch.getNumRetained(), 0); + update.accept(sketch, item); + assertTrue(sketch.getNumRetained() > 0); + if (sketch.getSerDe().isFixedWidth()) { + assertEquals(((ArrayOfItemsSerDe) sketch.getSerDe()).sizeOf(item) * sketch.getK(), sketch.getTotalItemsNumBytes()); + } else { + assertEquals(((ArrayOfItemsSerDe) sketch.getSerDe()).sizeOf(item), sketch.getTotalItemsNumBytes()); + } + } + + @Test(dataProvider = "sketchSingleItem") + public void testManyItemIncreases(KllSketch sketch, Object item, BiConsumer update) { + assertEquals(sketch.getTotalItemsNumBytes(), 0); + assertEquals(sketch.getNumRetained(), 0); + for (int i = 0; i < 4096; i++) { + update.accept(sketch, item); + } + assertTrue(sketch.getNumRetained() > 0); + assertTrue(sketch.getTotalItemsNumBytes() > 0); + } +}