From 192b02eeb6b2a451fef5e58313d5445c253c3e31 Mon Sep 17 00:00:00 2001 From: Mark Bathori Date: Tue, 1 Mar 2022 15:54:06 +0100 Subject: [PATCH 1/2] TEZ-4393: ShuffleScheduler.hasFailedAcrossNodes log message to have more context --- .../orderedgrouped/ShuffleScheduler.java | 25 ++++++------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/tez-runtime-library/src/main/java/org/apache/tez/runtime/library/common/shuffle/orderedgrouped/ShuffleScheduler.java b/tez-runtime-library/src/main/java/org/apache/tez/runtime/library/common/shuffle/orderedgrouped/ShuffleScheduler.java index 470b04cc5f..d90373fb5c 100644 --- a/tez-runtime-library/src/main/java/org/apache/tez/runtime/library/common/shuffle/orderedgrouped/ShuffleScheduler.java +++ b/tez-runtime-library/src/main/java/org/apache/tez/runtime/library/common/shuffle/orderedgrouped/ShuffleScheduler.java @@ -902,29 +902,20 @@ private void informAM(InputAttemptFetchFailure fetchFailure) { private boolean hasFailedAcrossNodes(String logContext) { int numUniqueHosts = uniqueHosts.size(); Preconditions.checkArgument(numUniqueHosts > 0, "No values in unique hosts"); - int threshold = Math.max(3, - (int) Math.ceil(numUniqueHosts * hostFailureFraction)); - int total = 0; - boolean failedAcrossNodes = false; + int threshold = Math.max(3, (int) Math.ceil(numUniqueHosts * hostFailureFraction)); + int totalFailures = 0; + int totalThreshold = threshold * minFailurePerHost; for(HostPort host : uniqueHosts) { IntWritable failures = hostFailures.get(host); if (failures != null && failures.get() > minFailurePerHost) { - total++; - failedAcrossNodes = (total > (threshold * minFailurePerHost)); - if (failedAcrossNodes) { - break; + totalFailures++; + if (totalFailures > totalThreshold) { + LOG.info("Number of failures across nodes ({}) has exceeded the total threshold limit ({}) for InputAttemptIdentifier: {}", totalFailures, totalThreshold, logContext); + return true; } } } - - LOG.info(logContext + ", numUniqueHosts=" + numUniqueHosts - + ", hostFailureThreshold=" + threshold - + ", hostFailuresCount=" + hostFailures.size() - + ", hosts crossing threshold=" + total - + ", reducerFetchIssues=" + failedAcrossNodes - ); - - return failedAcrossNodes; + return false; } private boolean allEventsReceived() { From 0db15be743b14e11b7e7aeee8741ae6df2b0d7ef Mon Sep 17 00:00:00 2001 From: Mark Bathori Date: Tue, 1 Mar 2022 16:29:06 +0100 Subject: [PATCH 2/2] Fix checkstyle issue --- .../common/shuffle/orderedgrouped/ShuffleScheduler.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tez-runtime-library/src/main/java/org/apache/tez/runtime/library/common/shuffle/orderedgrouped/ShuffleScheduler.java b/tez-runtime-library/src/main/java/org/apache/tez/runtime/library/common/shuffle/orderedgrouped/ShuffleScheduler.java index d90373fb5c..0812c6a610 100644 --- a/tez-runtime-library/src/main/java/org/apache/tez/runtime/library/common/shuffle/orderedgrouped/ShuffleScheduler.java +++ b/tez-runtime-library/src/main/java/org/apache/tez/runtime/library/common/shuffle/orderedgrouped/ShuffleScheduler.java @@ -910,7 +910,8 @@ private boolean hasFailedAcrossNodes(String logContext) { if (failures != null && failures.get() > minFailurePerHost) { totalFailures++; if (totalFailures > totalThreshold) { - LOG.info("Number of failures across nodes ({}) has exceeded the total threshold limit ({}) for InputAttemptIdentifier: {}", totalFailures, totalThreshold, logContext); + LOG.info("Number of failures across nodes ({}) has exceeded the total threshold limit ({}) " + + "for InputAttemptIdentifier: {}", totalFailures, totalThreshold, logContext); return true; } }