From ae4bec3972891e60906637cb230079c3b6a8b78b Mon Sep 17 00:00:00 2001
From: Ziqi Liu <ziqi.liu@databricks.com>
Date: Wed, 7 Aug 2024 11:52:13 -0700
Subject: [PATCH 1/3] [SPARK-48628][CORE] Add task peak on/off heap memory
 metrics

### What changes were proposed in this pull request?

Add task on/off heap execution memory in `TaskMetrics`, tracked in `TaskMemoryManager`, **assuming `acquireExecutionMemory` is the only one narrow waist for acquiring execution memory.**

### Why are the changes needed?

Currently there is no task on/off heap execution memory metrics.

There is a [peakExecutionMemory](https://github.com/apache/spark/blob/3cd35f8cb6462051c621cf49de54b9c5692aae1d/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala#L114)  metrics, however, the semantic is a confusing: it only cover the execution memory used by shuffle/join/aggregate/sort, which is accumulated in specific operators and thus not really reflect the real execution memory.

Therefore it's necessary to add these two metrics.

Also I created two followup sub tickets:

- https://issues.apache.org/jira/browse/SPARK-48788 : accumulate task metrics in stage, and display in Spark UI
- https://issues.apache.org/jira/browse/SPARK-48789 : deprecate `peakExecutionMemory` once we have replacement for it.

The ultimate goal is to have these two metrics ready (as accumulated stage metrics in Spark UI as well) and deprecate `peakExecutionMemory`.

### Does this PR introduce _any_ user-facing change?

Supposedly no. But two followup sub tickets will have user-facing change: new metrics exposed to Spark UI, and old metrics deprecation.

### How was this patch tested?
new test

### Was this patch authored or co-authored using generative AI tooling?
NO

Closes #47192 from liuzqt/SPARK-48628.

Authored-by: Ziqi Liu <ziqi.liu@databricks.com>
Signed-off-by: Xingbo Jiang <xingbo.jiang@databricks.com>
---
 .../spark/memory/TaskMemoryManager.java       | 36 ++++++++++
 .../apache/spark/InternalAccumulator.scala    |  2 +
 .../org/apache/spark/executor/Executor.scala  |  2 +
 .../apache/spark/executor/TaskMetrics.scala   | 21 ++++++
 .../org/apache/spark/util/JsonProtocol.scala  |  6 ++
 .../spark/memory/MemoryManagerSuite.scala     | 18 +++++
 .../apache/spark/util/JsonProtocolSuite.scala | 72 ++++++++++++-------
 7 files changed, 132 insertions(+), 25 deletions(-)

diff --git a/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java b/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java
index fe798e40a6ad7..541ed7e2350b2 100644
--- a/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java
+++ b/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java
@@ -122,6 +122,16 @@ public class TaskMemoryManager {
    */
   private volatile long acquiredButNotUsed = 0L;
 
+  /**
+   * Peak off heap memory usage by this task.
+   */
+  private volatile long peakOffHeapMemory = 0L;
+
+  /**
+   * Peak on heap memory usage by this task.
+   */
+  private volatile long peakOnHeapMemory = 0L;
+
   /**
    * Construct a new TaskMemoryManager.
    */
@@ -202,6 +212,17 @@ public long acquireExecutionMemory(long required, MemoryConsumer requestingConsu
         logger.debug("Task {} acquired {} for {}", taskAttemptId, Utils.bytesToString(got),
           requestingConsumer);
       }
+
+      // Consumer will update its used memory after acquireExecutionMemory, so we need to add `got`
+      // to compute current peak
+      long currentPeak = consumers.stream().filter(c -> c.getMode() == mode)
+        .mapToLong(MemoryConsumer::getUsed).sum() + got;
+      if (mode == MemoryMode.OFF_HEAP) {
+        peakOffHeapMemory = Math.max(peakOffHeapMemory, currentPeak);
+      } else {
+        peakOnHeapMemory = Math.max(peakOnHeapMemory, currentPeak);
+      }
+
       return got;
     }
   }
@@ -507,4 +528,19 @@ public long getMemoryConsumptionForThisTask() {
   public MemoryMode getTungstenMemoryMode() {
     return tungstenMemoryMode;
   }
+
+  /**
+   * Returns peak task-level off-heap memory usage in bytes.
+   *
+   */
+  public long getPeakOnHeapExecutionMemory() {
+    return peakOnHeapMemory;
+  }
+
+  /**
+   * Returns peak task-level on-heap memory usage in bytes.
+   */
+  public long getPeakOffHeapExecutionMemory() {
+    return peakOffHeapMemory;
+  }
 }
diff --git a/core/src/main/scala/org/apache/spark/InternalAccumulator.scala b/core/src/main/scala/org/apache/spark/InternalAccumulator.scala
index ef4609e6d6456..505634d5bb048 100644
--- a/core/src/main/scala/org/apache/spark/InternalAccumulator.scala
+++ b/core/src/main/scala/org/apache/spark/InternalAccumulator.scala
@@ -41,6 +41,8 @@ private[spark] object InternalAccumulator {
   val MEMORY_BYTES_SPILLED = METRICS_PREFIX + "memoryBytesSpilled"
   val DISK_BYTES_SPILLED = METRICS_PREFIX + "diskBytesSpilled"
   val PEAK_EXECUTION_MEMORY = METRICS_PREFIX + "peakExecutionMemory"
+  val PEAK_ON_HEAP_EXECUTION_MEMORY = METRICS_PREFIX + "peakOnHeapExecutionMemory"
+  val PEAK_OFF_HEAP_EXECUTION_MEMORY = METRICS_PREFIX + "peakOffHeapExecutionMemory"
   val UPDATED_BLOCK_STATUSES = METRICS_PREFIX + "updatedBlockStatuses"
   val TEST_ACCUM = METRICS_PREFIX + "testAccumulator"
 
diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala
index 9f1382ac8b1c2..0f0e7cb3d8974 100644
--- a/core/src/main/scala/org/apache/spark/executor/Executor.scala
+++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala
@@ -706,6 +706,8 @@ private[spark] class Executor(
         task.metrics.setJvmGCTime(computeTotalGcTime() - startGCTime)
         task.metrics.setResultSerializationTime(TimeUnit.NANOSECONDS.toMillis(
           afterSerializationNs - beforeSerializationNs))
+        task.metrics.setPeakOnHeapExecutionMemory(taskMemoryManager.getPeakOnHeapExecutionMemory)
+        task.metrics.setPeakOffHeapExecutionMemory(taskMemoryManager.getPeakOffHeapExecutionMemory)
         // Expose task metrics using the Dropwizard metrics system.
         // Update task metrics counters
         executorSource.METRIC_CPU_TIME.inc(task.metrics.executorCpuTime)
diff --git a/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala b/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala
index 8540e8c330a09..582c93007f4f5 100644
--- a/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala
+++ b/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala
@@ -56,6 +56,8 @@ class TaskMetrics private[spark] () extends Serializable {
   private val _memoryBytesSpilled = new LongAccumulator
   private val _diskBytesSpilled = new LongAccumulator
   private val _peakExecutionMemory = new LongAccumulator
+  private val _peakOnHeapExecutionMemory = new LongAccumulator
+  private val _peakOffHeapExecutionMemory = new LongAccumulator
   private val _updatedBlockStatuses = new CollectionAccumulator[(BlockId, BlockStatus)]
 
   /**
@@ -109,9 +111,22 @@ class TaskMetrics private[spark] () extends Serializable {
    * joins. The value of this accumulator should be approximately the sum of the peak sizes
    * across all such data structures created in this task. For SQL jobs, this only tracks all
    * unsafe operators and ExternalSort.
+   * This is not equal to peakOnHeapExecutionMemory + peakOffHeapExecutionMemory
    */
+  // TODO: SPARK-48789: the naming is confusing since this does not really reflect the whole
+  //  execution memory. We'd better deprecate this once we have a replacement.
   def peakExecutionMemory: Long = _peakExecutionMemory.sum
 
+  /**
+   * Peak on heap execution memory as tracked by TaskMemoryManager.
+   */
+  def peakOnHeapExecutionMemory: Long = _peakOnHeapExecutionMemory.sum
+
+  /**
+   * Peak off heap execution memory as tracked by TaskMemoryManager.
+   */
+  def peakOffHeapExecutionMemory: Long = _peakOffHeapExecutionMemory.sum
+
   /**
    * Storage statuses of any blocks that have been updated as a result of this task.
    *
@@ -139,6 +154,10 @@ class TaskMetrics private[spark] () extends Serializable {
   private[spark] def setResultSerializationTime(v: Long): Unit =
     _resultSerializationTime.setValue(v)
   private[spark] def setPeakExecutionMemory(v: Long): Unit = _peakExecutionMemory.setValue(v)
+  private[spark] def setPeakOnHeapExecutionMemory(v: Long): Unit =
+    _peakOnHeapExecutionMemory.setValue(v)
+  private[spark] def setPeakOffHeapExecutionMemory(v: Long): Unit =
+    _peakOffHeapExecutionMemory.setValue(v)
   private[spark] def incMemoryBytesSpilled(v: Long): Unit = _memoryBytesSpilled.add(v)
   private[spark] def incDiskBytesSpilled(v: Long): Unit = _diskBytesSpilled.add(v)
   private[spark] def incPeakExecutionMemory(v: Long): Unit = _peakExecutionMemory.add(v)
@@ -225,6 +244,8 @@ class TaskMetrics private[spark] () extends Serializable {
     MEMORY_BYTES_SPILLED -> _memoryBytesSpilled,
     DISK_BYTES_SPILLED -> _diskBytesSpilled,
     PEAK_EXECUTION_MEMORY -> _peakExecutionMemory,
+    PEAK_ON_HEAP_EXECUTION_MEMORY -> _peakOnHeapExecutionMemory,
+    PEAK_OFF_HEAP_EXECUTION_MEMORY -> _peakOffHeapExecutionMemory,
     UPDATED_BLOCK_STATUSES -> _updatedBlockStatuses,
     shuffleRead.REMOTE_BLOCKS_FETCHED -> shuffleReadMetrics._remoteBlocksFetched,
     shuffleRead.LOCAL_BLOCKS_FETCHED -> shuffleReadMetrics._localBlocksFetched,
diff --git a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala
index 4c7de70adafd6..19cefbc0479a9 100644
--- a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala
@@ -597,6 +597,8 @@ private[spark] object JsonProtocol extends JsonUtils {
     g.writeNumberField("Executor Run Time", taskMetrics.executorRunTime)
     g.writeNumberField("Executor CPU Time", taskMetrics.executorCpuTime)
     g.writeNumberField("Peak Execution Memory", taskMetrics.peakExecutionMemory)
+    g.writeNumberField("Peak On Heap Execution Memory", taskMetrics.peakOnHeapExecutionMemory)
+    g.writeNumberField("Peak Off Heap Execution Memory", taskMetrics.peakOffHeapExecutionMemory)
     g.writeNumberField("Result Size", taskMetrics.resultSize)
     g.writeNumberField("JVM GC Time", taskMetrics.jvmGCTime)
     g.writeNumberField("Result Serialization Time", taskMetrics.resultSerializationTime)
@@ -1254,6 +1256,10 @@ private[spark] object JsonProtocol extends JsonUtils {
     // The "Peak Execution Memory" field was added in Spark 3.0.0:
     metrics.setPeakExecutionMemory(
       jsonOption(json.get("Peak Execution Memory")).map(_.extractLong).getOrElse(0))
+    metrics.setPeakOnHeapExecutionMemory(
+      jsonOption(json.get("Peak On Heap Execution Memory")).map(_.extractLong).getOrElse(0))
+    metrics.setPeakOffHeapExecutionMemory(
+      jsonOption(json.get("Peak Off Heap Execution Memory")).map(_.extractLong).getOrElse(0))
     metrics.setResultSize(json.get("Result Size").extractLong)
     metrics.setJvmGCTime(json.get("JVM GC Time").extractLong)
     metrics.setResultSerializationTime(json.get("Result Serialization Time").extractLong)
diff --git a/core/src/test/scala/org/apache/spark/memory/MemoryManagerSuite.scala b/core/src/test/scala/org/apache/spark/memory/MemoryManagerSuite.scala
index a6f1707a1aabf..1f40ef944a843 100644
--- a/core/src/test/scala/org/apache/spark/memory/MemoryManagerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/memory/MemoryManagerSuite.scala
@@ -335,6 +335,24 @@ private[memory] trait MemoryManagerSuite extends SparkFunSuite {
     tMemManager.releaseExecutionMemory(500L, c)
     assert(tMemManager.getMemoryConsumptionForThisTask === 0L)
   }
+
+  test("task peak execution memory usage") {
+    val memoryManager = createMemoryManager(
+      maxOnHeapExecutionMemory = 1000L,
+      maxOffHeapExecutionMemory = 1000L)
+
+    val tMemManager = new TaskMemoryManager(memoryManager, 1)
+    val offHeapConsumer = new TestMemoryConsumer(tMemManager, MemoryMode.OFF_HEAP)
+    val onHeapConsumer = new TestMemoryConsumer(tMemManager, MemoryMode.ON_HEAP)
+
+    val result1 = tMemManager.acquireExecutionMemory(500L, offHeapConsumer)
+    val result2 = tMemManager.acquireExecutionMemory(400L, onHeapConsumer)
+    assert(result1 === 500L)
+    assert(result2 === 400L)
+    assert(tMemManager.getMemoryConsumptionForThisTask === 900L)
+    assert(tMemManager.getPeakOnHeapExecutionMemory === 400L)
+    assert(tMemManager.getPeakOffHeapExecutionMemory === 500L)
+  }
 }
 
 private object MemoryManagerSuite {
diff --git a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala
index 3eae1b3278e74..cdee6ccda706e 100644
--- a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala
@@ -1453,6 +1453,8 @@ private[spark] object JsonProtocolSuite extends Assertions {
     t.setExecutorRunTime(b)
     t.setExecutorCpuTime(b)
     t.setPeakExecutionMemory(c)
+    t.setPeakOnHeapExecutionMemory(c)
+    t.setPeakOffHeapExecutionMemory(c)
     t.setResultSize(c)
     t.setJvmGCTime(d)
     t.setResultSerializationTime(a + b)
@@ -1731,6 +1733,8 @@ private[spark] object JsonProtocolSuite extends Assertions {
       |    "Executor Run Time": 400,
       |    "Executor CPU Time": 400,
       |    "Peak Execution Memory": 500,
+      |    "Peak On Heap Execution Memory": 500,
+      |    "Peak Off Heap Execution Memory": 500,
       |    "Result Size": 500,
       |    "JVM GC Time": 600,
       |    "Result Serialization Time": 700,
@@ -1872,6 +1876,8 @@ private[spark] object JsonProtocolSuite extends Assertions {
       |    "Executor Run Time": 400,
       |    "Executor CPU Time": 400,
       |    "Peak Execution Memory": 500,
+      |    "Peak On Heap Execution Memory": 500,
+      |    "Peak Off Heap Execution Memory": 500,
       |    "Result Size": 500,
       |    "JVM GC Time": 600,
       |    "Result Serialization Time": 700,
@@ -2013,6 +2019,8 @@ private[spark] object JsonProtocolSuite extends Assertions {
       |    "Executor Run Time": 400,
       |    "Executor CPU Time": 400,
       |    "Peak Execution Memory": 500,
+      |    "Peak On Heap Execution Memory": 500,
+      |    "Peak Off Heap Execution Memory": 500,
       |    "Result Size": 500,
       |    "JVM GC Time": 600,
       |    "Result Serialization Time": 700,
@@ -2683,6 +2691,20 @@ private[spark] object JsonProtocolSuite extends Assertions {
       |        },
       |        {
       |          "ID": 10,
+      |          "Name": "$PEAK_ON_HEAP_EXECUTION_MEMORY",
+      |          "Update": 500,
+      |          "Internal": true,
+      |          "Count Failed Values": true
+      |        },
+      |        {
+      |          "ID": 11,
+      |          "Name": "$PEAK_OFF_HEAP_EXECUTION_MEMORY",
+      |          "Update": 500,
+      |          "Internal": true,
+      |          "Count Failed Values": true
+      |        },
+      |        {
+      |          "ID": 12,
       |          "Name": "$UPDATED_BLOCK_STATUSES",
       |          "Update": [
       |            {
@@ -2704,175 +2726,175 @@ private[spark] object JsonProtocolSuite extends Assertions {
       |          "Count Failed Values": true
       |        },
       |        {
-      |          "ID": 11,
+      |          "ID": 13,
       |          "Name": "${shuffleRead.REMOTE_BLOCKS_FETCHED}",
       |          "Update": 0,
       |          "Internal": true,
       |          "Count Failed Values": true
       |        },
       |        {
-      |          "ID": 12,
+      |          "ID": 14,
       |          "Name": "${shuffleRead.LOCAL_BLOCKS_FETCHED}",
       |          "Update": 0,
       |          "Internal": true,
       |          "Count Failed Values": true
       |        },
       |        {
-      |          "ID": 13,
+      |          "ID": 15,
       |          "Name": "${shuffleRead.REMOTE_BYTES_READ}",
       |          "Update": 0,
       |          "Internal": true,
       |          "Count Failed Values": true
       |        },
       |        {
-      |          "ID": 14,
+      |          "ID": 16,
       |          "Name": "${shuffleRead.REMOTE_BYTES_READ_TO_DISK}",
       |          "Update": 0,
       |          "Internal": true,
       |          "Count Failed Values": true
       |        },
       |        {
-      |          "ID": 15,
+      |          "ID": 17,
       |          "Name": "${shuffleRead.LOCAL_BYTES_READ}",
       |          "Update": 0,
       |          "Internal": true,
       |          "Count Failed Values": true
       |        },
       |        {
-      |          "ID": 16,
+      |          "ID": 18,
       |          "Name": "${shuffleRead.FETCH_WAIT_TIME}",
       |          "Update": 0,
       |          "Internal": true,
       |          "Count Failed Values": true
       |        },
       |        {
-      |          "ID": 17,
+      |          "ID": 19,
       |          "Name": "${shuffleRead.RECORDS_READ}",
       |          "Update": 0,
       |          "Internal": true,
       |          "Count Failed Values": true
       |        },
       |        {
-      |          "ID": 18,
+      |          "ID": 20,
       |          "Name": "${shuffleRead.CORRUPT_MERGED_BLOCK_CHUNKS}",
       |          "Update": 0,
       |          "Internal": true,
       |          "Count Failed Values": true
       |        },
       |        {
-      |          "ID": 19,
+      |          "ID": 21,
       |          "Name": "${shuffleRead.MERGED_FETCH_FALLBACK_COUNT}",
       |          "Update": 0,
       |          "Internal": true,
       |          "Count Failed Values": true
       |        },
       |        {
-      |          "ID" : 20,
+      |          "ID" : 22,
       |          "Name" : "${shuffleRead.REMOTE_MERGED_BLOCKS_FETCHED}",
       |          "Update" : 0,
       |          "Internal" : true,
       |          "Count Failed Values" : true
       |        },
       |        {
-      |          "ID" : 21,
+      |          "ID" : 23,
       |          "Name" : "${shuffleRead.LOCAL_MERGED_BLOCKS_FETCHED}",
       |          "Update" : 0,
       |          "Internal" : true,
       |          "Count Failed Values" : true
       |        },
       |        {
-      |          "ID" : 22,
+      |          "ID" : 24,
       |          "Name" : "${shuffleRead.REMOTE_MERGED_CHUNKS_FETCHED}",
       |          "Update" : 0,
       |          "Internal" : true,
       |          "Count Failed Values" : true
       |        },
       |        {
-      |          "ID" : 23,
+      |          "ID" : 25,
       |          "Name" : "${shuffleRead.LOCAL_MERGED_CHUNKS_FETCHED}",
       |          "Update" : 0,
       |          "Internal" : true,
       |          "Count Failed Values" : true
       |        },
       |        {
-      |          "ID" : 24,
+      |          "ID" : 26,
       |          "Name" : "${shuffleRead.REMOTE_MERGED_BYTES_READ}",
       |          "Update" : 0,
       |          "Internal" : true,
       |          "Count Failed Values" : true
       |        },
       |        {
-      |          "ID" : 25,
+      |          "ID" : 27,
       |          "Name" : "${shuffleRead.LOCAL_MERGED_BYTES_READ}",
       |          "Update" : 0,
       |          "Internal" : true,
       |          "Count Failed Values" : true
       |        },
       |        {
-      |          "ID" : 26,
+      |          "ID" : 28,
       |          "Name" : "${shuffleRead.REMOTE_REQS_DURATION}",
       |          "Update" : 0,
       |          "Internal" : true,
       |          "Count Failed Values" : true
       |        },
       |        {
-      |          "ID" : 27,
+      |          "ID" : 29,
       |          "Name" : "${shuffleRead.REMOTE_MERGED_REQS_DURATION}",
       |          "Update" : 0,
       |          "Internal" : true,
       |          "Count Failed Values" : true
       |        },
       |        {
-      |          "ID": 28,
+      |          "ID": 30,
       |          "Name": "${shuffleWrite.BYTES_WRITTEN}",
       |          "Update": 0,
       |          "Internal": true,
       |          "Count Failed Values": true
       |        },
       |        {
-      |          "ID": 29,
+      |          "ID": 31,
       |          "Name": "${shuffleWrite.RECORDS_WRITTEN}",
       |          "Update": 0,
       |          "Internal": true,
       |          "Count Failed Values": true
       |        },
       |        {
-      |          "ID": 30,
+      |          "ID": 32,
       |          "Name": "${shuffleWrite.WRITE_TIME}",
       |          "Update": 0,
       |          "Internal": true,
       |          "Count Failed Values": true
       |        },
       |        {
-      |          "ID": 31,
+      |          "ID": 33,
       |          "Name": "${input.BYTES_READ}",
       |          "Update": 2100,
       |          "Internal": true,
       |          "Count Failed Values": true
       |        },
       |        {
-      |          "ID": 32,
+      |          "ID": 34,
       |          "Name": "${input.RECORDS_READ}",
       |          "Update": 21,
       |          "Internal": true,
       |          "Count Failed Values": true
       |        },
       |        {
-      |          "ID": 33,
+      |          "ID": 35,
       |          "Name": "${output.BYTES_WRITTEN}",
       |          "Update": 1200,
       |          "Internal": true,
       |          "Count Failed Values": true
       |        },
       |        {
-      |          "ID": 34,
+      |          "ID": 36,
       |          "Name": "${output.RECORDS_WRITTEN}",
       |          "Update": 12,
       |          "Internal": true,
       |          "Count Failed Values": true
       |        },
       |        {
-      |          "ID": 35,
+      |          "ID": 37,
       |          "Name": "$TEST_ACCUM",
       |          "Update": 0,
       |          "Internal": true,

From 4afd26ed57b2b31d9e499f3c400ca58c315dcb60 Mon Sep 17 00:00:00 2001
From: Ziqi Liu <ziqi.liu@databricks.com>
Date: Fri, 16 Aug 2024 00:38:43 +0800
Subject: [PATCH 2/3] fix

---
 .../spark/memory/TaskMemoryManager.java       | 37 ++++++++++++++++---
 1 file changed, 31 insertions(+), 6 deletions(-)

diff --git a/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java b/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java
index 541ed7e2350b2..df224bc902bff 100644
--- a/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java
+++ b/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java
@@ -122,6 +122,20 @@ public class TaskMemoryManager {
    */
   private volatile long acquiredButNotUsed = 0L;
 
+  /**
+   * Current off heap memory usage by this task.
+   */
+  private long currentOffHeapMemory = 0L;
+
+  private final Object offHeapMemoryLock = new Object();
+
+  /*
+   * Current on heap memory usage by this task.
+   */
+  private long currentOnHeapMemory = 0L;
+
+  private final Object onHeapMemoryLock = new Object();
+
   /**
    * Peak off heap memory usage by this task.
    */
@@ -213,14 +227,16 @@ public long acquireExecutionMemory(long required, MemoryConsumer requestingConsu
           requestingConsumer);
       }
 
-      // Consumer will update its used memory after acquireExecutionMemory, so we need to add `got`
-      // to compute current peak
-      long currentPeak = consumers.stream().filter(c -> c.getMode() == mode)
-        .mapToLong(MemoryConsumer::getUsed).sum() + got;
       if (mode == MemoryMode.OFF_HEAP) {
-        peakOffHeapMemory = Math.max(peakOffHeapMemory, currentPeak);
+        synchronized (offHeapMemoryLock) {
+          currentOffHeapMemory += got;
+          peakOffHeapMemory = Math.max(peakOffHeapMemory, currentOffHeapMemory);
+        }
       } else {
-        peakOnHeapMemory = Math.max(peakOnHeapMemory, currentPeak);
+        synchronized (onHeapMemoryLock) {
+          currentOnHeapMemory += got;
+          peakOnHeapMemory = Math.max(peakOnHeapMemory, currentOnHeapMemory);
+        }
       }
 
       return got;
@@ -290,6 +306,15 @@ public void releaseExecutionMemory(long size, MemoryConsumer consumer) {
         consumer);
     }
     memoryManager.releaseExecutionMemory(size, taskAttemptId, consumer.getMode());
+    if (consumer.getMode() == MemoryMode.OFF_HEAP) {
+      synchronized (offHeapMemoryLock) {
+        currentOffHeapMemory -= size;
+      }
+    } else {
+      synchronized (onHeapMemoryLock) {
+        currentOnHeapMemory -= size;
+      }
+    }
   }
 
   /**

From 0c10aaaff7fe2b36dde8600298829c5b35151a7a Mon Sep 17 00:00:00 2001
From: Ziqi Liu <ziqi.liu@databricks.com>
Date: Fri, 16 Aug 2024 14:03:43 +0800
Subject: [PATCH 3/3] update benchmark result

---
 ...yUnsafeRowArrayBenchmark-jdk21-results.txt | 30 +++++++++----------
 ...endOnlyUnsafeRowArrayBenchmark-results.txt | 30 +++++++++----------
 2 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/sql/core/benchmarks/ExternalAppendOnlyUnsafeRowArrayBenchmark-jdk21-results.txt b/sql/core/benchmarks/ExternalAppendOnlyUnsafeRowArrayBenchmark-jdk21-results.txt
index 08cd0f2c47f86..08f3d54f5ae81 100644
--- a/sql/core/benchmarks/ExternalAppendOnlyUnsafeRowArrayBenchmark-jdk21-results.txt
+++ b/sql/core/benchmarks/ExternalAppendOnlyUnsafeRowArrayBenchmark-jdk21-results.txt
@@ -2,44 +2,44 @@
 WITHOUT SPILL
 ================================================================================================
 
-OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure
+OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure
 AMD EPYC 7763 64-Core Processor
 Array with 100000 rows:                   Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-ArrayBuffer                                        2445           2451          10         41.9          23.9       1.0X
-ExternalAppendOnlyUnsafeRowArray                   3464           3489          36         29.6          33.8       0.7X
+ArrayBuffer                                        2456           2456           0         41.7          24.0       1.0X
+ExternalAppendOnlyUnsafeRowArray                   3572           3595          33         28.7          34.9       0.7X
 
-OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure
+OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure
 AMD EPYC 7763 64-Core Processor
 Array with 1000 rows:                     Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-ArrayBuffer                                        5292           5328          50         49.5          20.2       1.0X
-ExternalAppendOnlyUnsafeRowArray                  11921          11927           9         22.0          45.5       0.4X
+ArrayBuffer                                        5511           5519          11         47.6          21.0       1.0X
+ExternalAppendOnlyUnsafeRowArray                  12331          12382          73         21.3          47.0       0.4X
 
-OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure
+OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure
 AMD EPYC 7763 64-Core Processor
 Array with 30000 rows:                    Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-ArrayBuffer                                       10418          10422           6         47.2          21.2       1.0X
-ExternalAppendOnlyUnsafeRowArray                  16589          16692         145         29.6          33.8       0.6X
+ArrayBuffer                                       10731          10759          39         45.8          21.8       1.0X
+ExternalAppendOnlyUnsafeRowArray                  18516          18568          72         26.5          37.7       0.6X
 
 
 ================================================================================================
 WITH SPILL
 ================================================================================================
 
-OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure
+OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure
 AMD EPYC 7763 64-Core Processor
 Spilling with 1000 rows:                  Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-UnsafeExternalSorter                               8507           8542          50         30.8          32.5       1.0X
-ExternalAppendOnlyUnsafeRowArray                   6301           6314          18         41.6          24.0       1.4X
+UnsafeExternalSorter                               8284           8328          63         31.6          31.6       1.0X
+ExternalAppendOnlyUnsafeRowArray                   6615           6624          14         39.6          25.2       1.3X
 
-OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure
+OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure
 AMD EPYC 7763 64-Core Processor
 Spilling with 10000 rows:                 Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-UnsafeExternalSorter                                  5              5           0         33.0          30.3       1.0X
-ExternalAppendOnlyUnsafeRowArray                      4              4           0         40.1          24.9       1.2X
+UnsafeExternalSorter                                  5              5           0         32.8          30.5       1.0X
+ExternalAppendOnlyUnsafeRowArray                      4              4           0         38.5          26.0       1.2X
 
 
diff --git a/sql/core/benchmarks/ExternalAppendOnlyUnsafeRowArrayBenchmark-results.txt b/sql/core/benchmarks/ExternalAppendOnlyUnsafeRowArrayBenchmark-results.txt
index 10af77fdd8bb2..ca447f9e97dbc 100644
--- a/sql/core/benchmarks/ExternalAppendOnlyUnsafeRowArrayBenchmark-results.txt
+++ b/sql/core/benchmarks/ExternalAppendOnlyUnsafeRowArrayBenchmark-results.txt
@@ -2,44 +2,44 @@
 WITHOUT SPILL
 ================================================================================================
 
-OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure
+OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure
 AMD EPYC 7763 64-Core Processor
 Array with 100000 rows:                   Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-ArrayBuffer                                        2330           2333           4         44.0          22.8       1.0X
-ExternalAppendOnlyUnsafeRowArray                   3306           3317          15         31.0          32.3       0.7X
+ArrayBuffer                                        2496           2499           4         41.0          24.4       1.0X
+ExternalAppendOnlyUnsafeRowArray                   3495           3513          24         29.3          34.1       0.7X
 
-OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure
+OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure
 AMD EPYC 7763 64-Core Processor
 Array with 1000 rows:                     Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-ArrayBuffer                                        5594           5598           6         46.9          21.3       1.0X
-ExternalAppendOnlyUnsafeRowArray                  12278          12332          75         21.4          46.8       0.5X
+ArrayBuffer                                        5277           5284          10         49.7          20.1       1.0X
+ExternalAppendOnlyUnsafeRowArray                  12169          12171           3         21.5          46.4       0.4X
 
-OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure
+OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure
 AMD EPYC 7763 64-Core Processor
 Array with 30000 rows:                    Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-ArrayBuffer                                       10249          10252           4         48.0          20.9       1.0X
-ExternalAppendOnlyUnsafeRowArray                  16386          16397          16         30.0          33.3       0.6X
+ArrayBuffer                                       10107          10110           4         48.6          20.6       1.0X
+ExternalAppendOnlyUnsafeRowArray                  17021          17035          20         28.9          34.6       0.6X
 
 
 ================================================================================================
 WITH SPILL
 ================================================================================================
 
-OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure
+OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure
 AMD EPYC 7763 64-Core Processor
 Spilling with 1000 rows:                  Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-UnsafeExternalSorter                               8294           8315          30         31.6          31.6       1.0X
-ExternalAppendOnlyUnsafeRowArray                   6767           6797          42         38.7          25.8       1.2X
+UnsafeExternalSorter                               8435           8499          89         31.1          32.2       1.0X
+ExternalAppendOnlyUnsafeRowArray                   7126           7131           6         36.8          27.2       1.2X
 
-OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure
+OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure
 AMD EPYC 7763 64-Core Processor
 Spilling with 10000 rows:                 Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-UnsafeExternalSorter                                  5              5           0         34.2          29.2       1.0X
-ExternalAppendOnlyUnsafeRowArray                      4              4           0         38.8          25.8       1.1X
+UnsafeExternalSorter                                  5              5           0         34.5          29.0       1.0X
+ExternalAppendOnlyUnsafeRowArray                      4              4           0         36.6          27.3       1.1X