From b9033f8e7b576b5486a34fbf760ad44622f9b054 Mon Sep 17 00:00:00 2001 From: Stamatis Zampetakis Date: Wed, 20 Apr 2022 18:48:23 +0200 Subject: [PATCH 1/4] TEZ-4407: Misleading split info in TezSplitGrouper logs when adjusting small splits --- .../tez/mapreduce/grouper/TezSplitGrouper.java | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/tez-mapreduce/src/main/java/org/apache/tez/mapreduce/grouper/TezSplitGrouper.java b/tez-mapreduce/src/main/java/org/apache/tez/mapreduce/grouper/TezSplitGrouper.java index 3b2f17d1ff..7c81f9222c 100644 --- a/tez-mapreduce/src/main/java/org/apache/tez/mapreduce/grouper/TezSplitGrouper.java +++ b/tez-mapreduce/src/main/java/org/apache/tez/mapreduce/grouper/TezSplitGrouper.java @@ -260,23 +260,20 @@ public List getGroupedSplits(Configuration conf, desiredNumSplits = newDesiredNumSplits; } else if (lengthPerGroup < minLengthPerGroup) { // splits too small to work. Need to override with size. - int newDesiredNumSplits = (int)(totalLength/minLengthPerGroup) + 1; /** * This is a workaround for systems like S3 that pass the same * fake hostname for all splits. */ if (!allSplitsHaveLocalhost) { + int newDesiredNumSplits = (int)(totalLength/minLengthPerGroup) + 1; + LOG.info("Desired splits: " + desiredNumSplits + " too large. " + + " Desired splitLength: " + lengthPerGroup + + " Min splitLength: " + minLengthPerGroup + + " New desired splits: " + newDesiredNumSplits + + " Total length: " + totalLength + + " Original splits: " + originalSplits.size()); desiredNumSplits = newDesiredNumSplits; } - - LOG.info("Desired splits: " + desiredNumSplits + " too large. " + - " Desired splitLength: " + lengthPerGroup + - " Min splitLength: " + minLengthPerGroup + - " New desired splits: " + newDesiredNumSplits + - " Final desired splits: " + desiredNumSplits + - " All splits have localhost: " + allSplitsHaveLocalhost + - " Total length: " + totalLength + - " Original splits: " + originalSplits.size()); } } From c715c0b3e3f54b191124216789ee469ca2d257b7 Mon Sep 17 00:00:00 2001 From: Stamatis Zampetakis Date: Mon, 26 Jun 2023 10:23:55 +0200 Subject: [PATCH 2/4] Clarify what happens when all spits are in localhost with log message --- .../apache/tez/mapreduce/grouper/TezSplitGrouper.java | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tez-mapreduce/src/main/java/org/apache/tez/mapreduce/grouper/TezSplitGrouper.java b/tez-mapreduce/src/main/java/org/apache/tez/mapreduce/grouper/TezSplitGrouper.java index 7c81f9222c..c52630bd92 100644 --- a/tez-mapreduce/src/main/java/org/apache/tez/mapreduce/grouper/TezSplitGrouper.java +++ b/tez-mapreduce/src/main/java/org/apache/tez/mapreduce/grouper/TezSplitGrouper.java @@ -260,11 +260,10 @@ public List getGroupedSplits(Configuration conf, desiredNumSplits = newDesiredNumSplits; } else if (lengthPerGroup < minLengthPerGroup) { // splits too small to work. Need to override with size. - /** - * This is a workaround for systems like S3 that pass the same - * fake hostname for all splits. - */ - if (!allSplitsHaveLocalhost) { + if (allSplitsHaveLocalhost) { + // Workaround for systems like S3 that pass the same fake hostname for all splits. + LOG.info("Ignore {} configuration cause all splits seem to be on localhost.", TEZ_GROUPING_SPLIT_MIN_SIZE); + } else { int newDesiredNumSplits = (int)(totalLength/minLengthPerGroup) + 1; LOG.info("Desired splits: " + desiredNumSplits + " too large. " + " Desired splitLength: " + lengthPerGroup + From 8f7797ab45ad248de50d76935a0295728a740e79 Mon Sep 17 00:00:00 2001 From: Stamatis Zampetakis Date: Mon, 26 Jun 2023 12:03:41 +0200 Subject: [PATCH 3/4] Group log messages and always print when bounds exceeded --- .../mapreduce/grouper/TezSplitGrouper.java | 32 +++++++++---------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/tez-mapreduce/src/main/java/org/apache/tez/mapreduce/grouper/TezSplitGrouper.java b/tez-mapreduce/src/main/java/org/apache/tez/mapreduce/grouper/TezSplitGrouper.java index c52630bd92..5f217edbfe 100644 --- a/tez-mapreduce/src/main/java/org/apache/tez/mapreduce/grouper/TezSplitGrouper.java +++ b/tez-mapreduce/src/main/java/org/apache/tez/mapreduce/grouper/TezSplitGrouper.java @@ -247,30 +247,28 @@ public List getGroupedSplits(Configuration conf, "Invalid max/min group lengths. Required min>0, max>=min. " + " max: " + maxLengthPerGroup + " min: " + minLengthPerGroup); } + int newDesiredNumSplits = -1; if (lengthPerGroup > maxLengthPerGroup) { // splits too big to work. Need to override with max size. - int newDesiredNumSplits = (int)(totalLength/maxLengthPerGroup) + 1; - LOG.info("Desired splits: " + desiredNumSplits + " too small. " + - " Desired splitLength: " + lengthPerGroup + - " Max splitLength: " + maxLengthPerGroup + - " New desired splits: " + newDesiredNumSplits + - " Total length: " + totalLength + - " Original splits: " + originalSplits.size()); - - desiredNumSplits = newDesiredNumSplits; + newDesiredNumSplits = (int)(totalLength/maxLengthPerGroup) + 1; } else if (lengthPerGroup < minLengthPerGroup) { // splits too small to work. Need to override with size. + newDesiredNumSplits = (int)(totalLength/minLengthPerGroup) + 1; if (allSplitsHaveLocalhost) { // Workaround for systems like S3 that pass the same fake hostname for all splits. LOG.info("Ignore {} configuration cause all splits seem to be on localhost.", TEZ_GROUPING_SPLIT_MIN_SIZE); - } else { - int newDesiredNumSplits = (int)(totalLength/minLengthPerGroup) + 1; - LOG.info("Desired splits: " + desiredNumSplits + " too large. " + - " Desired splitLength: " + lengthPerGroup + - " Min splitLength: " + minLengthPerGroup + - " New desired splits: " + newDesiredNumSplits + - " Total length: " + totalLength + - " Original splits: " + originalSplits.size()); + newDesiredNumSplits = desiredNumSplits; + } + } + if (newDesiredNumSplits != -1) { + LOG.info("Desired splitLength " + lengthPerGroup + " exceeds min/max bounds. " + + " Min splitLength: " + minLengthPerGroup + + " Max splitLength: " + maxLengthPerGroup + + " Desired splits: " + desiredNumSplits + + " Total length: " + totalLength + + " Original splits: " + originalSplits.size()); + if (desiredNumSplits != newDesiredNumSplits) { + LOG.info("Desired splits will change from {} to {}", desiredNumSplits, newDesiredNumSplits); desiredNumSplits = newDesiredNumSplits; } } From d0384d43620cf256cfc08f976ac8b0d9dc0a0817 Mon Sep 17 00:00:00 2001 From: Stamatis Zampetakis Date: Mon, 26 Jun 2023 12:18:06 +0200 Subject: [PATCH 4/4] Always print new desired splits even if not changed. --- .../org/apache/tez/mapreduce/grouper/TezSplitGrouper.java | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tez-mapreduce/src/main/java/org/apache/tez/mapreduce/grouper/TezSplitGrouper.java b/tez-mapreduce/src/main/java/org/apache/tez/mapreduce/grouper/TezSplitGrouper.java index 5f217edbfe..d47122dd8c 100644 --- a/tez-mapreduce/src/main/java/org/apache/tez/mapreduce/grouper/TezSplitGrouper.java +++ b/tez-mapreduce/src/main/java/org/apache/tez/mapreduce/grouper/TezSplitGrouper.java @@ -265,12 +265,10 @@ public List getGroupedSplits(Configuration conf, " Min splitLength: " + minLengthPerGroup + " Max splitLength: " + maxLengthPerGroup + " Desired splits: " + desiredNumSplits + + " New Desired splits: " + newDesiredNumSplits + " Total length: " + totalLength + " Original splits: " + originalSplits.size()); - if (desiredNumSplits != newDesiredNumSplits) { - LOG.info("Desired splits will change from {} to {}", desiredNumSplits, newDesiredNumSplits); - desiredNumSplits = newDesiredNumSplits; - } + desiredNumSplits = newDesiredNumSplits; } }