From c5a3c701a324482fd4eeb53e76a73fc081ae9be1 Mon Sep 17 00:00:00 2001
From: MacVincent Agha-Oko <amacvincent@meta.com>
Date: Thu, 20 Nov 2025 12:38:28 -0800
Subject: [PATCH 1/2] Introduce JK to Disable Memory Reallocation Optimization
 (#325)

Summary:

When implementing the stream chunker, we anticipated that stream buffers after chunking will end up growing to the size that previously triggered chunking. As a tradeoff between minimizing reallocations (for performance) and actually releasing memory (to relieve memory pressure), we heuristically determine the new buffer capacity for each stream to be larger that required. The issue with this optimization is that it conflicts with the rest of our memory tracking logic since we now have retained memory in the memory pool that is not accounted for. We now know through local testing that disabling this optimization leads to better memory pressure relief.

We performed local DISCO tests with and without this optimization for two tables and included the comparison in https://docs.google.com/spreadsheets/d/1kZvBwhVHZRyB7tg-qT2Et4V_7mDWyNr-VrtDzjG_FKY/view?gid=1209014630#gid=1209014630. It shows that for these two tables we save an average of **15%** improvement in average write memory. We also see **4%** write CPU improvement on average. Though we saw **-6%** regression in CPU for a single 256MB stripe experiment. These results indicate that more testing is required in order to understand its impact on a larger sample size.


In this diff, we introduce a JK, [dwio/nimble_chunking:disable_memory_reallocation_optimization](https://www.internalfb.com/intern/justknobs/?name=dwio%2Fnimble_chunking#disable_memory_reallocation_optimization), that will be enabled just for DISCO experiments. This will help us understand the full impact of this optimization and whether it should be retained.

Differential Revision: D87494427
---
 CMakeLists.txt                    |  3 +++
 dwio/nimble/velox/StreamChunker.h | 11 +++++++++++
 2 files changed, 14 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e366a8c1..795d646e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -20,6 +20,9 @@ set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_STANDARD_REQUIRED True)
 
 add_compile_definitions(DISABLE_META_INTERNAL_COMPRESSOR=1)
+add_compile_definitions(
+  DISABLE_META_INTERNAL_MEMORY_REALLOCATION_OPTIMIZATION=1
+)
 
 # Sets new behavior for CMP0135, which controls how timestamps are extracted
 # when using ExternalProject_Add():
diff --git a/dwio/nimble/velox/StreamChunker.h b/dwio/nimble/velox/StreamChunker.h
index 7b8120f7..e0cdab1a 100644
--- a/dwio/nimble/velox/StreamChunker.h
+++ b/dwio/nimble/velox/StreamChunker.h
@@ -17,6 +17,9 @@
 #pragma once
 
 #include "dwio/nimble/velox/StreamData.h"
+#ifndef DISABLE_META_INTERNAL_MEMORY_REALLOCATION_OPTIMIZATION
+#include "justknobs/JustKnobProxy.h"
+#endif
 
 namespace facebook::nimble {
 namespace detail {
@@ -32,6 +35,14 @@ uint64_t getNewBufferCapacity(
     uint64_t maxChunkSize,
     uint64_t currentCapacityCount,
     uint64_t requiredCapacityCount) {
+#ifndef DISABLE_META_INTERNAL_MEMORY_REALLOCATION_OPTIMIZATION
+  static facebook::jk::BooleanKnob disableMemoryReallocationOptimization(
+      "dwio/nimble_chunking:disable_memory_reallocation_optimization");
+  if (disableMemoryReallocationOptimization()) {
+    return requiredCapacityCount;
+  }
+#endif
+
   const auto maxChunkElementCount = maxChunkSize / sizeof(T);
   if (currentCapacityCount < maxChunkElementCount) {
     return std::max<uint64_t>(

From 9c6749af177cfd1f3832a8b62d7332cb57eadffa Mon Sep 17 00:00:00 2001
From: MacVincent Agha-Oko <amacvincent@meta.com>
Date: Thu, 20 Nov 2025 12:38:28 -0800
Subject: [PATCH 2/2] Remove TODO about Replacing Sorting with Bucketing (#330)

Summary:

We initially planned to improve the performance of the hard chunking stage by bucketing the streams by size (by most significant bit) instead of sorting them. However, after running benchmarks in D87571945. Sorting always performs better for large stream counts.

Differential Revision: D87573831
---
 dwio/nimble/velox/VeloxWriter.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/dwio/nimble/velox/VeloxWriter.cpp b/dwio/nimble/velox/VeloxWriter.cpp
index dd716ce5..45e0b2a5 100644
--- a/dwio/nimble/velox/VeloxWriter.cpp
+++ b/dwio/nimble/velox/VeloxWriter.cpp
@@ -1144,8 +1144,6 @@ bool VeloxWriter::evalauateFlushPolicy() {
     if (continueChunking) {
       // Relieve memory pressure by chunking small streams.
       // Sort streams for chunking based on raw memory usage.
-      // TODO(T240072104): Improve performance by bucketing the streams
-      // by size (by most significant bit) instead of sorting them.
       streamIndices.resize(streams.size());
       std::iota(streamIndices.begin(), streamIndices.end(), 0);
       std::sort(