From 39c1bcd2280f450cabad046a63bc1a80a36a3add Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Tue, 13 Nov 2018 13:47:25 +0800 Subject: [PATCH] [SPARK-26003] Improve SQLAppStatusListener.aggregateMetrics performance ## What changes were proposed in this pull request? In `SQLAppStatusListener.aggregateMetrics`, `metricIds` is used only to filter the relevant metrics, yet it is built as a sorted `Seq`, so every `contains` check on it is a linear scan. When there are many metrics involved, this can be pretty inefficient. This PR removes `metricIds` and performs the membership check against the keys of the existing `metricTypes` map instead. ## How was this patch tested? NA Closes #23002 from mgaido91/SPARK-26003. Authored-by: Marco Gaido Signed-off-by: Wenchen Fan --- .../apache/spark/sql/execution/ui/SQLAppStatusListener.scala | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala index 6069da861310c..111ac3111f7cd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala @@ -181,7 +181,6 @@ class SQLAppStatusListener( } private def aggregateMetrics(exec: LiveExecutionData): Map[Long, String] = { - val metricIds = exec.metrics.map(_.accumulatorId).sorted val metricTypes = exec.metrics.map { m => (m.accumulatorId, m.metricType) }.toMap val metrics = exec.stages.toSeq .flatMap { stageId => Option(stageMetrics.get(stageId)) } @@ -189,10 +188,10 @@ class SQLAppStatusListener( .flatMap { metrics => metrics.ids.zip(metrics.values) } val aggregatedMetrics = (metrics ++ exec.driverAccumUpdates.toSeq) - .filter { case (id, _) => metricIds.contains(id) } + .filter { case (id, _) => metricTypes.contains(id) } .groupBy(_._1) .map { case (id, values) => - id -> SQLMetrics.stringValue(metricTypes(id), values.map(_._2).toSeq) + id -> SQLMetrics.stringValue(metricTypes(id), values.map(_._2)) } 
// Check the execution again for whether the aggregated metrics data has been calculated.