From 6c3cb1e736eb5e32b101cab7b2e7e3f5eff88227 Mon Sep 17 00:00:00 2001 From: zkofiro Date: Thu, 18 Dec 2025 18:12:34 -0800 Subject: [PATCH 01/14] fix: prevent job stuck at Starting when AgentWatchdog replaces failed agents --- .../tank/vm/vmManager/models/VMStatus.java | 3 +- .../rest/mvc/rest/cloud/JobEventSender.java | 13 ++++++- .../intuit/tank/vmManager/AgentWatchdog.java | 14 ++++---- .../intuit/tank/vmManager/VMTrackerImpl.java | 34 +++++++++++++++++-- 4 files changed, 51 insertions(+), 13 deletions(-) diff --git a/api/src/main/java/com/intuit/tank/vm/vmManager/models/VMStatus.java b/api/src/main/java/com/intuit/tank/vm/vmManager/models/VMStatus.java index 6d2b873f1..7dc4312b1 100644 --- a/api/src/main/java/com/intuit/tank/vm/vmManager/models/VMStatus.java +++ b/api/src/main/java/com/intuit/tank/vm/vmManager/models/VMStatus.java @@ -26,7 +26,8 @@ public enum VMStatus implements Serializable { stopping, stopped, shutting_down, - terminated; + terminated, + replaced; // replaced by AgentWatchdog due to failure to report back public static final VMStatus fromString(String value) { VMStatus ret = null; diff --git a/rest-mvc/impl/src/main/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSender.java b/rest-mvc/impl/src/main/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSender.java index 5aeaf0852..3e997aa09 100644 --- a/rest-mvc/impl/src/main/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSender.java +++ b/rest-mvc/impl/src/main/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSender.java @@ -247,7 +247,18 @@ private List getInstancesForJob(String jobId) { List instanceIds = new ArrayList(); CloudVmStatusContainer statuses = vmTracker.getVmStatusForJob(jobId); if (statuses != null) { - instanceIds = statuses.getStatuses().stream().map(CloudVmStatus::getInstanceId).collect(Collectors.toList()); + instanceIds = statuses.getStatuses().stream() + .filter(s -> { + VMStatus vmStatus = s.getVmStatus(); + // skip unreachable instances - they can't receive commands + return vmStatus != VMStatus.terminated && + vmStatus != VMStatus.replaced && + vmStatus != VMStatus.stopped && + vmStatus != VMStatus.shutting_down && + vmStatus != VMStatus.stopping; + }) + .map(CloudVmStatus::getInstanceId) + .collect(Collectors.toList()); } return instanceIds; } diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java index 1bc20e99a..e728c0e35 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java @@ -222,12 +222,16 @@ private void relaunch(ArrayList instances) { VMImageDao dao = new VMImageDao(); for (VMInformation info : instances) { vmInfo.remove(info); - vmTracker.setStatus(createTerminatedVmStatus(info)); + vmTracker.removeStatusForInstance(info.getInstanceId()); // remove from tracker to prevent blocking job status VMInstance image = dao.getImageByInstanceId(info.getInstanceId()); if (image != null) { - image.setStatus(VMStatus.terminated.name()); + image.setStatus(VMStatus.replaced.name()); // mark as replaced (not terminated) for audit trail dao.saveOrUpdate(image); } + + LOG.info(new ObjectMessage(Map.of("Message", + "Replaced and removed instance " + info.getInstanceId() + + " from tracker for job " + jobId))); } LOG.info(new ObjectMessage(Map.of("Message","Setting number of instances to relaunch to: " + instances.size() + " for job " + jobId))); instanceRequest.setNumberOfInstances(instances.size()); @@ -265,12 +269,6 @@ private CloudVmStatus createCloudStatus(VMInstanceRequest req, VMInformation inf VMImageType.AGENT, req.getRegion(), VMStatus.pending, new ValidationStatus(), 0, 0, null, null); } - private CloudVmStatus createTerminatedVmStatus(VMInformation info) { - return new CloudVmStatus(info.getInstanceId(), instanceRequest.getJobId(), "unknown", - JobStatus.Stopped, VMImageType.AGENT, instanceRequest.getRegion(), - VMStatus.terminated, new ValidationStatus(), 0, 0, null, null); - } - private boolean shouldRelaunchInstances() { return startTime + maxWaitForResponse < System.currentTimeMillis(); } diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java index 6ccff1254..e7d44b0ca 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java @@ -224,7 +224,7 @@ private JobQueueStatus getQueueStatus(JobQueueStatus oldStatus, JobStatus jobSta } /** - * If the vm is shutting down or terminated, don't update the status to something else. + * If the vm is shutting down, terminated, or replaced, don't update the status to something else. * @param currentStatus * @return */ @@ -234,7 +234,8 @@ private boolean shouldUpdateStatus(CloudVmStatus currentStatus) { return (status != VMStatus.shutting_down && status != VMStatus.stopped && status != VMStatus.stopping - && status != VMStatus.terminated); + && status != VMStatus.terminated + && status != VMStatus.replaced); } return true; } @@ -245,7 +246,24 @@ private boolean shouldUpdateStatus(CloudVmStatus currentStatus) { */ @Override public void removeStatusForInstance(String instanceId) { - statusMap.remove(instanceId); + CloudVmStatus status = statusMap.remove(instanceId); + + // also remove from the job's container to keep counts accurate + if (status != null) { + String jobId = status.getJobId(); + // Synchronize on the same lock used by setStatusThread to prevent + // ConcurrentModificationException when iterating over statuses + synchronized (getCacheSyncObject(jobId)) { + CloudVmStatusContainer container = jobMap.get(jobId); + if (container != null) { + boolean removed = container.getStatuses().remove(status); + if (removed) { + LOG.info(new ObjectMessage(Map.of("Message", + "Removed instance " + instanceId + " from container for job " + jobId))); + } + } + } + } } /** @@ -315,6 +333,16 @@ private void addStatusToJobContainer(CloudVmStatus status, CloudVmStatusContaine // look up the job JobInstance job = jobInstanceDao.get().findById(Integer.parseInt(status.getJobId())); for (CloudVmStatus s : cloudVmStatusContainer.getStatuses()) { + VMStatus vmStatus = s.getVmStatus(); + // skip replaced instances - these were replaced by AgentWatchdog due to failure + // but do NOT skip terminated/stopping/stopped/shutting_down - these are active agents in transition + if (vmStatus == VMStatus.replaced) { + LOG.info(new ObjectMessage(Map.of("Message", + "Skipping replaced instance " + s.getInstanceId() + + " in job status calculation for job " + status.getJobId()))); + continue; + } + JobStatus jobStatus = s.getJobStatus(); if (jobStatus != JobStatus.Completed) { // If no VMs are Completed isFinished = false; From 8f893fbfb0be4b41e5faa075a64af8d26f3c5acb Mon Sep 17 00:00:00 2001 From: zkofiro Date: Thu, 18 Dec 2025 21:43:51 -0800 Subject: [PATCH 02/14] add logging to understand it all --- .../com/intuit/tank/harness/APIMonitor.java | 17 ++++++++-- .../intuit/tank/harness/CommandListener.java | 2 ++ .../services/agent/AgentServiceV2Impl.java | 5 +++ .../perfManager/workLoads/JobManager.java | 13 +++++++- .../intuit/tank/vmManager/VMTrackerImpl.java | 33 +++++++++++++++++++ 5 files changed, 67 insertions(+), 3 deletions(-) diff --git a/agent/apiharness/src/main/java/com/intuit/tank/harness/APIMonitor.java b/agent/apiharness/src/main/java/com/intuit/tank/harness/APIMonitor.java index 804d76037..526ef00ea 100644 --- a/agent/apiharness/src/main/java/com/intuit/tank/harness/APIMonitor.java +++ b/agent/apiharness/src/main/java/com/intuit/tank/harness/APIMonitor.java @@ -83,7 +83,13 @@ private void updateInstanceStatus() { newStatus.setTotalTps(tpsInfo.getTotalTps()); sendTps(tpsInfo); } - if (!isLocal) setInstanceStatus(newStatus.getInstanceId(), newStatus); + + if (!isLocal) { + LOG.debug(LogUtil.getLogMessage("Sending status update - VMStatus: " + newStatus.getVmStatus() + + ", JobStatus: " + newStatus.getJobStatus() + ", Users: " + newStatus.getCurrentUsers() + + "/" + newStatus.getTotalUsers())); + setInstanceStatus(newStatus.getInstanceId(), newStatus); + } APITestHarness.getInstance().checkAgentThreads(); } catch (Exception t) { LOG.error(LogUtil.getLogMessage("Unable to send status metrics | " + t.getMessage()), t); @@ -120,13 +126,20 @@ private CloudVmStatus createStatus(WatsAgentStatusResponse agentStatus) { */ private JobStatus calculateJobStatus(WatsAgentStatusResponse agentStatus, JobStatus currentStatus) { AgentCommand cmd = APITestHarness.getInstance().getCmd(); - return cmd == AgentCommand.pause ? JobStatus.Paused + JobStatus newStatus = cmd == AgentCommand.pause ? JobStatus.Paused : cmd == AgentCommand.stop ? JobStatus.Stopped : cmd == AgentCommand.pause_ramp ? JobStatus.RampPaused : currentStatus == JobStatus.Unknown || currentStatus == JobStatus.Starting && agentStatus.getCurrentNumberUsers() > 0 ? JobStatus.Running : currentStatus; + + if (newStatus != currentStatus) { + LOG.info(LogUtil.getLogMessage("Agent JobStatus transition: " + currentStatus + " -> " + newStatus + + " (cmd=" + cmd + ", currentUsers=" + agentStatus.getCurrentNumberUsers() + ")")); + } + + return newStatus; } public static void setDoMonitor(boolean monitor) { diff --git a/agent/apiharness/src/main/java/com/intuit/tank/harness/CommandListener.java b/agent/apiharness/src/main/java/com/intuit/tank/harness/CommandListener.java index e7386ae60..cff991a2b 100644 --- a/agent/apiharness/src/main/java/com/intuit/tank/harness/CommandListener.java +++ b/agent/apiharness/src/main/java/com/intuit/tank/harness/CommandListener.java @@ -78,6 +78,8 @@ private static void handleRequest(HttpExchange exchange) { String path = exchange.getRequestURI().getPath(); if (path.equals(AgentCommand.start.getPath()) || path.equals(AgentCommand.run.getPath())) { response = "Received command " + path + ", Starting Test JobId=" + APITestHarness.getInstance().getAgentRunData().getJobId(); + LOG.info(LogUtil.getLogMessage("Received START command - launching test threads for job " + + APITestHarness.getInstance().getAgentRunData().getJobId())); startTest(); } else if (path.startsWith(AgentCommand.stop.getPath())) { response = "Received command " + path + ", Stopping Test JobId=" + APITestHarness.getInstance().getAgentRunData().getJobId(); diff --git a/rest-mvc/impl/src/main/java/com/intuit/tank/rest/mvc/rest/services/agent/AgentServiceV2Impl.java b/rest-mvc/impl/src/main/java/com/intuit/tank/rest/mvc/rest/services/agent/AgentServiceV2Impl.java index beaebe37a..9abba85f0 100644 --- a/rest-mvc/impl/src/main/java/com/intuit/tank/rest/mvc/rest/services/agent/AgentServiceV2Impl.java +++ b/rest-mvc/impl/src/main/java/com/intuit/tank/rest/mvc/rest/services/agent/AgentServiceV2Impl.java @@ -220,6 +220,11 @@ public void setInstanceStatus(String instanceId, CloudVmStatus status) { segment.putAnnotation("currentUsers", status.getCurrentUsers()); segment.putAnnotation("TotalUsers", status.getTotalUsers()); segment.putAnnotation("totalTps", status.getTotalTps()); + + LOGGER.info("Agent " + instanceId + " reporting status - VMStatus: " + status.getVmStatus() + + ", JobStatus: " + status.getJobStatus() + ", Users: " + status.getCurrentUsers() + + "/" + status.getTotalUsers() + ", Job: " + status.getJobId()); + try { JobEventSender controller = new ServletInjector().getManagedBean( servletContext, JobEventSender.class); diff --git a/tank_vmManager/src/main/java/com/intuit/tank/perfManager/workLoads/JobManager.java b/tank_vmManager/src/main/java/com/intuit/tank/perfManager/workLoads/JobManager.java index 887fa39b0..ef6bd5e9b 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/perfManager/workLoads/JobManager.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/perfManager/workLoads/JobManager.java @@ -172,12 +172,17 @@ public AgentTestStartData registerAgentForJob(AgentData agentData) { ret.setUserIntervalIncrement(jobInfo.jobRequest.getUserIntervalIncrement()); ret.setTargetRampRate(jobInfo.jobRequest.getTargetRatePerAgent()); jobInfo.agentData.add(agentData); + LOG.info(new ObjectMessage(Map.of("Message", "Agent " + agentData.getInstanceId() + + " added to job " + agentData.getJobId() + ". Total agents now: " + jobInfo.agentData.size() + + "/" + jobInfo.numberOfMachines + ", isFilled: " + jobInfo.isFilled()))); CloudVmStatus status = vmTracker.getStatus(agentData.getInstanceId()); if(status != null) { status.setVmStatus(VMStatus.pending); vmTracker.setStatus(status); } if (jobInfo.isFilled()) { + LOG.info(new ObjectMessage(Map.of("Message", "All " + jobInfo.numberOfMachines + + " agents registered for job " + agentData.getJobId() + " - starting test thread"))); new Thread( () -> { startTest(jobInfo); }).start(); } } @@ -209,8 +214,14 @@ private void startTest(final JobInfo info) { LOG.info(new ObjectMessage(Map.of("Message", "Start agents command received - Sending start commands for job " + jobId + " asynchronously to following agents: " + info.agentData.stream().collect(Collectors.toMap(AgentData::getInstanceId, AgentData::getInstanceUrl))))); } + LOG.info(new ObjectMessage(Map.of("Message", "Sending START commands to " + info.agentData.size() + + " agents for job " + jobId))); info.agentData.parallelStream() - .map(agentData -> agentData.getInstanceUrl() + AgentCommand.start.getPath()) + .map(agentData -> { + String url = agentData.getInstanceUrl() + AgentCommand.start.getPath(); + LOG.info(new ObjectMessage(Map.of("Message", "Sending command to url " + url))); + return url; + }) .map(URI::create) .map(uri -> sendCommand(uri, MAX_RETRIES)) .forEach(future -> { diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java index e7d44b0ca..3e205dd17 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java @@ -149,6 +149,12 @@ private void setStatusThread(@Nonnull final CloudVmStatus status) { synchronized (getCacheSyncObject(status.getJobId())) { status.setReportTime(new Date()); CloudVmStatus currentStatus = getStatus(status.getInstanceId()); + + LOG.debug(new ObjectMessage(Map.of("Message", + "Status update for instance " + status.getInstanceId() + + " - VMStatus: " + status.getVmStatus() + ", JobStatus: " + status.getJobStatus() + + ", Job: " + status.getJobId()))); + if (shouldUpdateStatus(currentStatus)) { statusMap.put(status.getInstanceId(), status); if (status.getVmStatus() == VMStatus.running @@ -157,6 +163,10 @@ private void setStatusThread(@Nonnull final CloudVmStatus status) { AmazonInstance amzInstance = new AmazonInstance(status.getVmRegion()); amzInstance.killInstances(Collections.singletonList(status.getInstanceId())); } + } else { + LOG.debug(new ObjectMessage(Map.of("Message", + "Skipping status update for instance " + status.getInstanceId() + + " - current status is " + (currentStatus != null ? currentStatus.getVmStatus() : "null")))); } String jobId = status.getJobId(); CloudVmStatusContainer cloudVmStatusContainer = jobMap.get(jobId); @@ -332,6 +342,12 @@ private void addStatusToJobContainer(CloudVmStatus status, CloudVmStatusContaine // look up the job JobInstance job = jobInstanceDao.get().findById(Integer.parseInt(status.getJobId())); + + // Log all statuses in container for debugging + LOG.debug(new ObjectMessage(Map.of("Message", + "Status calculation for job " + status.getJobId() + " - Container has " + + cloudVmStatusContainer.getStatuses().size() + " statuses"))); + for (CloudVmStatus s : cloudVmStatusContainer.getStatuses()) { VMStatus vmStatus = s.getVmStatus(); // skip replaced instances - these were replaced by AgentWatchdog due to failure @@ -343,6 +359,10 @@ private void addStatusToJobContainer(CloudVmStatus status, CloudVmStatusContaine continue; } + LOG.trace(new ObjectMessage(Map.of("Message", + "Checking instance " + s.getInstanceId() + + " - VMStatus: " + vmStatus + ", JobStatus: " + s.getJobStatus()))); + JobStatus jobStatus = s.getJobStatus(); if (jobStatus != JobStatus.Completed) { // If no VMs are Completed isFinished = false; @@ -360,6 +380,12 @@ private void addStatusToJobContainer(CloudVmStatus status, CloudVmStatusContaine running = false; } } + + LOG.debug(new ObjectMessage(Map.of("Message", + "Status calc complete for job " + status.getJobId() + + " - isFinished=" + isFinished + ", paused=" + paused + + ", rampPaused=" + rampPaused + ", stopped=" + stopped + ", running=" + running))); + if (isFinished) { LOG.info(new ObjectMessage(Map.of("Message","Setting end time on container " + cloudVmStatusContainer.getJobId()))); if (cloudVmStatusContainer.getEndTime() == null) { @@ -371,6 +397,7 @@ private void addStatusToJobContainer(CloudVmStatus status, CloudVmStatusContaine } if (job != null) { job.setEndTime(cloudVmStatusContainer.getEndTime()); + JobQueueStatus oldStatus = job.getStatus(); JobQueueStatus newStatus = job.getStatus(); if (isFinished) { newStatus = JobQueueStatus.Completed; @@ -389,6 +416,12 @@ private void addStatusToJobContainer(CloudVmStatus status, CloudVmStatusContaine } } + if (oldStatus != newStatus) { + LOG.info(new ObjectMessage(Map.of("Message", + "Job " + status.getJobId() + " status transition: " + oldStatus + " -> " + newStatus + + " (isFinished=" + isFinished + ", paused=" + paused + ", rampPaused=" + rampPaused + + ", stopped=" + stopped + ", running=" + running + ")"))); + } LOG.trace("Setting Container for job=" + status.getJobId() + " newStatus to " + newStatus); job.setStatus(newStatus); jobInstanceDao.get().saveOrUpdate(job); From e2e4be231dced02868372c7d4e7421b6277c78e2 Mon Sep 17 00:00:00 2001 From: zkofiro Date: Fri, 19 Dec 2025 13:50:45 -0800 Subject: [PATCH 03/14] replaced agents stay visible in the UI --- .../java/com/intuit/tank/harness/APIMonitor.java | 2 +- .../com/intuit/tank/vmManager/AgentWatchdog.java | 14 +++++++++++--- .../com/intuit/tank/vmManager/VMTrackerImpl.java | 6 +++--- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/agent/apiharness/src/main/java/com/intuit/tank/harness/APIMonitor.java b/agent/apiharness/src/main/java/com/intuit/tank/harness/APIMonitor.java index 526ef00ea..d4f202b59 100644 --- a/agent/apiharness/src/main/java/com/intuit/tank/harness/APIMonitor.java +++ b/agent/apiharness/src/main/java/com/intuit/tank/harness/APIMonitor.java @@ -85,7 +85,7 @@ private void updateInstanceStatus() { } if (!isLocal) { - LOG.debug(LogUtil.getLogMessage("Sending status update - VMStatus: " + newStatus.getVmStatus() + + LOG.info(LogUtil.getLogMessage("Sending status update - VMStatus: " + newStatus.getVmStatus() + ", JobStatus: " + newStatus.getJobStatus() + ", Users: " + newStatus.getCurrentUsers() + "/" + newStatus.getTotalUsers())); setInstanceStatus(newStatus.getInstanceId(), newStatus); diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java index e728c0e35..cd08967ee 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java @@ -222,7 +222,15 @@ private void relaunch(ArrayList instances) { VMImageDao dao = new VMImageDao(); for (VMInformation info : instances) { vmInfo.remove(info); - vmTracker.removeStatusForInstance(info.getInstanceId()); // remove from tracker to prevent blocking job status + + // Update status to 'replaced' in the tracker (keeps visible in UI, but filtered out of calculations) + CloudVmStatus replacedStatus = vmTracker.getStatus(info.getInstanceId()); + if (replacedStatus != null) { + replacedStatus.setVmStatus(VMStatus.replaced); + vmTracker.setStatus(replacedStatus); + } + + // Also update in the database for persistence VMInstance image = dao.getImageByInstanceId(info.getInstanceId()); if (image != null) { image.setStatus(VMStatus.replaced.name()); // mark as replaced (not terminated) for audit trail @@ -230,8 +238,8 @@ private void relaunch(ArrayList instances) { } LOG.info(new ObjectMessage(Map.of("Message", - "Replaced and removed instance " + info.getInstanceId() + - " from tracker for job " + jobId))); + "Marked instance " + info.getInstanceId() + + " as REPLACED (visible in UI but filtered from job status) for job " + jobId))); } LOG.info(new ObjectMessage(Map.of("Message","Setting number of instances to relaunch to: " + instances.size() + " for job " + jobId))); instanceRequest.setNumberOfInstances(instances.size()); diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java index 3e205dd17..01b3ee569 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java @@ -150,7 +150,7 @@ private void setStatusThread(@Nonnull final CloudVmStatus status) { status.setReportTime(new Date()); CloudVmStatus currentStatus = getStatus(status.getInstanceId()); - LOG.debug(new ObjectMessage(Map.of("Message", + LOG.info(new ObjectMessage(Map.of("Message", "Status update for instance " + status.getInstanceId() + " - VMStatus: " + status.getVmStatus() + ", JobStatus: " + status.getJobStatus() + ", Job: " + status.getJobId()))); @@ -344,7 +344,7 @@ private void addStatusToJobContainer(CloudVmStatus status, CloudVmStatusContaine JobInstance job = jobInstanceDao.get().findById(Integer.parseInt(status.getJobId())); // Log all statuses in container for debugging - LOG.debug(new ObjectMessage(Map.of("Message", + LOG.info(new ObjectMessage(Map.of("Message", "Status calculation for job " + status.getJobId() + " - Container has " + cloudVmStatusContainer.getStatuses().size() + " statuses"))); @@ -359,7 +359,7 @@ private void addStatusToJobContainer(CloudVmStatus status, CloudVmStatusContaine continue; } - LOG.trace(new ObjectMessage(Map.of("Message", + LOG.info(new ObjectMessage(Map.of("Message", "Checking instance " + s.getInstanceId() + " - VMStatus: " + vmStatus + ", JobStatus: " + s.getJobStatus()))); From 1e4bd64ed1b762dc5e98831d1817301d9aab0e47 Mon Sep 17 00:00:00 2001 From: zkofiro Date: Fri, 19 Dec 2025 13:54:28 -0800 Subject: [PATCH 04/14] TEMP CHANGE: test public/private ip logging w/ fix --- .../tank/vm/vmManager/VMInformation.java | 40 +++++++++++++++++++ .../intuit/tank/vmManager/AgentWatchdog.java | 9 ++++- .../vmManager/environment/CreateInstance.java | 6 +++ .../vmManager/environment/JobRequest.java | 9 ++++- .../amazon/AmazonDataConverter.java | 2 + 5 files changed, 64 insertions(+), 2 deletions(-) diff --git a/api/src/main/java/com/intuit/tank/vm/vmManager/VMInformation.java b/api/src/main/java/com/intuit/tank/vm/vmManager/VMInformation.java index da175fbd5..996609ccb 100644 --- a/api/src/main/java/com/intuit/tank/vm/vmManager/VMInformation.java +++ b/api/src/main/java/com/intuit/tank/vm/vmManager/VMInformation.java @@ -96,6 +96,46 @@ public String getPrivateDNS() { return (String) this.items.get("privateDns"); } + // TEMP_IP_LOGGING - START + /** + * Set the virtual machine private IP address + * + * @param data + * The virtual machine's private IP address + */ + public void setPrivateIp(String data) { + this.items.put("privateIp", data); + } + + /** + * Get the virtual machine private IP address + * + * @return The virtual machine's private IP address + */ + public String getPrivateIp() { + return (String) this.items.get("privateIp"); + } + + /** + * Set the virtual machine public IP address + * + * @param data + * The virtual machine's public IP address + */ + public void setPublicIp(String data) { + this.items.put("publicIp", data); + } + + /** + * Get the virtual machine public IP address + * + * @return The virtual machine's public IP address + */ + public String getPublicIp() { + return (String) this.items.get("publicIp"); + } + // TEMP_IP_LOGGING - END + public void setLaunchTime(Calendar data) { this.items.put("launchTime", data); } diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java index cd08967ee..793d60e5e 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java @@ -252,7 +252,14 @@ private void relaunch(ArrayList instances) { // Add directly to started instances since these are restarted from scratch startedInstances.add(newInfo); vmTracker.setStatus(createCloudStatus(instanceRequest, newInfo)); - LOG.info(new ObjectMessage(Map.of("Message","Added image (" + newInfo.getInstanceId() + ") to VMImage table for job " + jobId))); + // TEMP_IP_LOGGING - START + LOG.info(new ObjectMessage(Map.of( + "Message", "Added relaunched image to VMImage table for job " + jobId, + "instanceId", newInfo.getInstanceId(), + "publicIp", newInfo.getPublicIp() != null ? newInfo.getPublicIp() : "N/A", + "privateIp", newInfo.getPrivateIp() != null ? newInfo.getPrivateIp() : "N/A" + ))); + // TEMP_IP_LOGGING - END try { dao.addImageFromInfo(instanceRequest.getJobId(), newInfo, instanceRequest.getRegion()); diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/CreateInstance.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/CreateInstance.java index 9da50ee5c..7fcf6f0aa 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/CreateInstance.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/CreateInstance.java @@ -53,6 +53,12 @@ public void run() { for (VMInformation info : vmInfo) { vmTracker.setStatus(createCloudStatus(request, info)); dao.addImageFromInfo(request.getJobId(), info, request.getRegion()); + // TEMP_IP_LOGGING - START + logger.info("Added image (" + info.getInstanceId() + + ") with publicIp=" + (info.getPublicIp() != null ? info.getPublicIp() : "N/A") + + ", privateIp=" + (info.getPrivateIp() != null ? info.getPrivateIp() : "N/A") + + " to VMImage table"); + // TEMP_IP_LOGGING - END } } diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/JobRequest.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/JobRequest.java index 105790a43..3c381637a 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/JobRequest.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/JobRequest.java @@ -89,7 +89,14 @@ private void persistInstances(VMInstanceRequest instanceRequest, List Date: Fri, 19 Dec 2025 15:29:35 -0800 Subject: [PATCH 05/14] fixes agent count + test publicIp logging --- .../environment/amazon/AmazonInstance.java | 26 +++++++++++++++++++ .../com/intuit/tank/job/ActJobNodeBean.java | 7 ++++- 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/amazon/AmazonInstance.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/amazon/AmazonInstance.java index 0feb610ea..195327182 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/amazon/AmazonInstance.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/amazon/AmazonInstance.java @@ -293,6 +293,32 @@ public List create(VMRequest request) { // reboot(result); } + // Wait for instances to get public IPs assigned and refresh the details + if (!result.isEmpty()) { + try { + // Brief wait to allow AWS to assign public IPs + Thread.sleep(3000); + List instanceIds = result.stream() + .map(VMInformation::getInstanceId) + .collect(Collectors.toList()); + DescribeInstancesResponse described = ec2AsyncClient.describeInstances( + DescribeInstancesRequest.builder().instanceIds(instanceIds).build() + ).get(); + // Update result with fresh instance data that includes public IPs + List updated = described.reservations().stream() + .flatMap(reservation -> reservation.instances().stream() + .map(instance -> AmazonDataConverter.instanceToVmInformation( + reservation.requesterId(), instance, vmRegion))) + .collect(Collectors.toList()); + result.clear(); + result.addAll(updated); + LOG.debug("Refreshed {} instance details with public IP information", updated.size()); + } catch (InterruptedException e) { + LOG.warn("Interrupted while waiting for public IP assignment", e); + Thread.currentThread().interrupt(); + } + } + } catch (SdkException ae) { LOG.error("Amazon issue starting instances: {} : {}", vmRegion, ae.getMessage(), ae); throw new RuntimeException(ae); diff --git a/web/web_support/src/main/java/com/intuit/tank/job/ActJobNodeBean.java b/web/web_support/src/main/java/com/intuit/tank/job/ActJobNodeBean.java index 99947207d..5f58b39fb 100644 --- a/web/web_support/src/main/java/com/intuit/tank/job/ActJobNodeBean.java +++ b/web/web_support/src/main/java/com/intuit/tank/job/ActJobNodeBean.java @@ -129,7 +129,12 @@ public List getSubNodes() { @Override public List getCurrentSubNodes() { - return vmBeans.stream().filter(vm -> !vm.getStatus().equals(VMStatus.terminated.toString())).collect(Collectors.toList()); + // Filter out both terminated and replaced agents from the count + // terminated = normal shutdown, replaced = watchdog replaced a failed agent + return vmBeans.stream() + .filter(vm -> !vm.getStatus().equals(VMStatus.terminated.toString()) + && !vm.getStatus().equals(VMStatus.replaced.toString())) + .collect(Collectors.toList()); } @Override From eabf716e13d4a271b873adadd09d42c82bae5cc4 Mon Sep 17 00:00:00 2001 From: zkofiro Date: Fri, 19 Dec 2025 16:15:27 -0800 Subject: [PATCH 06/14] fixes tests --- .../environment/amazon/AmazonInstance.java | 28 +++++++++++-------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/amazon/AmazonInstance.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/amazon/AmazonInstance.java index 195327182..ad226044e 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/amazon/AmazonInstance.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/amazon/AmazonInstance.java @@ -301,21 +301,25 @@ public List create(VMRequest request) { List instanceIds = result.stream() .map(VMInformation::getInstanceId) .collect(Collectors.toList()); - DescribeInstancesResponse described = ec2AsyncClient.describeInstances( - DescribeInstancesRequest.builder().instanceIds(instanceIds).build() - ).get(); - // Update result with fresh instance data that includes public IPs - List updated = described.reservations().stream() - .flatMap(reservation -> reservation.instances().stream() - .map(instance -> AmazonDataConverter.instanceToVmInformation( - reservation.requesterId(), instance, vmRegion))) - .collect(Collectors.toList()); - result.clear(); - result.addAll(updated); - LOG.debug("Refreshed {} instance details with public IP information", updated.size()); + CompletableFuture future = ec2AsyncClient.describeInstances( + DescribeInstancesRequest.builder().instanceIds(instanceIds).build()); + if (future != null) { + DescribeInstancesResponse described = future.get(); + // Update result with fresh instance data that includes public IPs + List updated = described.reservations().stream() + .flatMap(reservation -> reservation.instances().stream() + .map(instance -> AmazonDataConverter.instanceToVmInformation( + reservation.requesterId(), instance, vmRegion))) + .collect(Collectors.toList()); + result.clear(); + result.addAll(updated); + LOG.debug("Refreshed {} instance details with public IP information", updated.size()); + } } catch (InterruptedException e) { LOG.warn("Interrupted while waiting for public IP assignment", e); Thread.currentThread().interrupt(); + } catch (ExecutionException e) { + LOG.warn("Failed to refresh instance details for public IPs: {}", e.getMessage()); } } From 07066e6bf51a37f9205b7b97201c68902642135c Mon Sep 17 00:00:00 2001 From: zkofiro Date: Mon, 5 Jan 2026 15:44:40 -0800 Subject: [PATCH 07/14] Add VMStatus.replaced for AgentWatchdog visibility + thread safety fixes --- .../intuit/tank/agent/models/VMStatus.java | 15 +- .../tank/vm/vmManager/models/VMStatus.java | 15 +- .../rest/mvc/rest/cloud/JobEventSender.java | 6 +- .../mvc/rest/cloud/JobEventSenderTest.java | 264 ++++++++++++++++++ .../intuit/tank/vmManager/VMTrackerImpl.java | 56 +++- .../tank/vmManager/VMTrackerImplTest.java | 105 +++++++ 6 files changed, 437 insertions(+), 24 deletions(-) create mode 100644 rest-mvc/impl/src/test/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSenderTest.java create mode 100644 tank_vmManager/src/test/java/com/intuit/tank/vmManager/VMTrackerImplTest.java diff --git a/api/src/main/java/com/intuit/tank/agent/models/VMStatus.java b/api/src/main/java/com/intuit/tank/agent/models/VMStatus.java index 7ccd68212..7727ccdd1 100644 --- a/api/src/main/java/com/intuit/tank/agent/models/VMStatus.java +++ b/api/src/main/java/com/intuit/tank/agent/models/VMStatus.java @@ -22,13 +22,18 @@ public enum VMStatus implements Serializable { terminated; public static final VMStatus fromString(String value) { - VMStatus ret = null; + if (value == null || value.isEmpty()) { + return VMStatus.unknown; + } if ("shutting-down".equals(value)) { - ret = VMStatus.shutting_down; - } else { - ret = VMStatus.valueOf(value); + return VMStatus.shutting_down; + } + try { + return VMStatus.valueOf(value); + } catch (IllegalArgumentException e) { + // Gracefully handle unknown values (e.g., 'replaced' from controller which agent doesn't need) + return VMStatus.unknown; } - return ret != null ? ret : VMStatus.unknown; } } diff --git a/api/src/main/java/com/intuit/tank/vm/vmManager/models/VMStatus.java b/api/src/main/java/com/intuit/tank/vm/vmManager/models/VMStatus.java index 7dc4312b1..23f0a1c3a 100644 --- a/api/src/main/java/com/intuit/tank/vm/vmManager/models/VMStatus.java +++ b/api/src/main/java/com/intuit/tank/vm/vmManager/models/VMStatus.java @@ -30,13 +30,18 @@ public enum VMStatus implements Serializable { replaced; // replaced by AgentWatchdog due to failure to report back public static final VMStatus fromString(String value) { - VMStatus ret = null; + if (value == null || value.isEmpty()) { + return VMStatus.unknown; + } if ("shutting-down".equals(value)) { - ret = VMStatus.shutting_down; - } else { - ret = VMStatus.valueOf(value); + return VMStatus.shutting_down; + } + try { + return VMStatus.valueOf(value); + } catch (IllegalArgumentException e) { + // Gracefully handle unknown values (e.g., from older/newer clients with different enum versions) + return VMStatus.unknown; } - return ret != null ? ret : VMStatus.unknown; } } diff --git a/rest-mvc/impl/src/main/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSender.java b/rest-mvc/impl/src/main/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSender.java index 3e997aa09..9eecb9ec3 100644 --- a/rest-mvc/impl/src/main/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSender.java +++ b/rest-mvc/impl/src/main/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSender.java @@ -271,8 +271,10 @@ public CloudVmStatus getVmStatus(String instanceId) { public void setVmStatus(final String instanceId, final CloudVmStatus status) { vmTracker.setStatus(status); - if (status.getJobStatus() == JobStatus.Completed || status.getVmStatus() == VMStatus.terminated) { - // will terrminate instance after waiting for some cleanup time + if (status.getJobStatus() == JobStatus.Completed + || status.getVmStatus() == VMStatus.terminated + || status.getVmStatus() == VMStatus.replaced) { + // will terminate instance after waiting for some cleanup time terminator.terminate(status.getInstanceId()); // check job status and kill off instances appropriately checkJobStatus(status.getJobId()); diff --git a/rest-mvc/impl/src/test/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSenderTest.java b/rest-mvc/impl/src/test/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSenderTest.java new file mode 100644 index 000000000..fb1693fbd --- /dev/null +++ b/rest-mvc/impl/src/test/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSenderTest.java @@ -0,0 +1,264 @@ +package com.intuit.tank.rest.mvc.rest.cloud; + +import com.intuit.tank.vm.vmManager.VMTracker; +import com.intuit.tank.vm.vmManager.models.CloudVmStatus; +import com.intuit.tank.vm.vmManager.models.CloudVmStatusContainer; +import com.intuit.tank.vm.vmManager.models.VMStatus; +import com.intuit.tank.vm.vmManager.models.ValidationStatus; +import com.intuit.tank.vm.api.enumerated.JobStatus; +import com.intuit.tank.vm.api.enumerated.VMImageType; +import com.intuit.tank.vm.api.enumerated.VMRegion; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.InjectMocks; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; + +import java.lang.reflect.Method; +import java.util.Date; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.*; +import static org.mockito.Mockito.*; + +/** + * Unit tests for JobEventSender + */ +@ExtendWith(MockitoExtension.class) +public class JobEventSenderTest { + + @Mock + private VMTracker vmTracker; + + @InjectMocks + private JobEventSender jobEventSender; + + private CloudVmStatusContainer container; + + @BeforeEach + void setUp() { + container = new CloudVmStatusContainer(); + container.setJobId("123"); + } + + // Helper method to create CloudVmStatus + private CloudVmStatus createStatus(String instanceId, JobStatus jobStatus, VMStatus vmStatus) { + return new CloudVmStatus( + instanceId, + "123", + "sg-test", + jobStatus, + VMImageType.AGENT, + VMRegion.US_WEST_2, + vmStatus, + new ValidationStatus(), + 100, + 50, + new Date(), + null + ); + } + + /** + * Use reflection to test the private getInstancesForJob method. + */ + private List invokeGetInstancesForJob(String jobId) throws Exception { + Method method = JobEventSender.class.getDeclaredMethod("getInstancesForJob", String.class); + method.setAccessible(true); + @SuppressWarnings("unchecked") + List result = (List) method.invoke(jobEventSender, jobId); + return result; + } + + @Test + @DisplayName("getInstancesForJob excludes terminated instances") + void getInstancesForJob_excludesTerminatedInstances() throws Exception { + // Given: Mix of running and terminated instances + container.getStatuses().add(createStatus("i-running1", JobStatus.Running, VMStatus.running)); + container.getStatuses().add(createStatus("i-terminated", JobStatus.Stopped, VMStatus.terminated)); + container.getStatuses().add(createStatus("i-running2", JobStatus.Running, VMStatus.running)); + + when(vmTracker.getVmStatusForJob("123")).thenReturn(container); + + // When + List instances = invokeGetInstancesForJob("123"); + + // Then: Only running instances should be returned + assertEquals(2, instances.size()); + assertTrue(instances.contains("i-running1")); + assertTrue(instances.contains("i-running2")); + assertFalse(instances.contains("i-terminated")); + } + + @Test + @DisplayName("getInstancesForJob excludes stopped instances") + void getInstancesForJob_excludesStoppedInstances() throws Exception { + // Given + container.getStatuses().add(createStatus("i-running", JobStatus.Running, VMStatus.running)); + container.getStatuses().add(createStatus("i-stopped", JobStatus.Stopped, VMStatus.stopped)); + + when(vmTracker.getVmStatusForJob("123")).thenReturn(container); + + // When + List instances = invokeGetInstancesForJob("123"); + + // Then + assertEquals(1, instances.size()); + assertTrue(instances.contains("i-running")); + assertFalse(instances.contains("i-stopped")); + } + + @Test + @DisplayName("getInstancesForJob excludes stopping instances") + void getInstancesForJob_excludesStoppingInstances() throws Exception { + // Given + container.getStatuses().add(createStatus("i-running", JobStatus.Running, VMStatus.running)); + container.getStatuses().add(createStatus("i-stopping", JobStatus.Stopped, VMStatus.stopping)); + + when(vmTracker.getVmStatusForJob("123")).thenReturn(container); + + // When + List instances = invokeGetInstancesForJob("123"); + + // Then + assertEquals(1, instances.size()); + assertTrue(instances.contains("i-running")); + assertFalse(instances.contains("i-stopping")); + } + + @Test + @DisplayName("getInstancesForJob excludes shutting_down instances") + void getInstancesForJob_excludesShuttingDownInstances() throws Exception { + // Given + container.getStatuses().add(createStatus("i-running", JobStatus.Running, VMStatus.running)); + container.getStatuses().add(createStatus("i-shuttingdown", JobStatus.Stopped, VMStatus.shutting_down)); + + when(vmTracker.getVmStatusForJob("123")).thenReturn(container); + + // When + List instances = invokeGetInstancesForJob("123"); + + // Then + assertEquals(1, instances.size()); + assertTrue(instances.contains("i-running")); + assertFalse(instances.contains("i-shuttingdown")); + } + + @Test + @DisplayName("getInstancesForJob includes pending instances (agents starting)") + void getInstancesForJob_includesPendingInstances() throws Exception { + // Given: Pending instances are still reachable + container.getStatuses().add(createStatus("i-pending", JobStatus.Starting, VMStatus.pending)); + + when(vmTracker.getVmStatusForJob("123")).thenReturn(container); + + // When + List instances = invokeGetInstancesForJob("123"); + + // Then: Pending instances should be included + assertEquals(1, instances.size()); + assertTrue(instances.contains("i-pending")); + } + + @Test + @DisplayName("getInstancesForJob includes starting instances") + void getInstancesForJob_includesStartingInstances() throws Exception { + // Given + container.getStatuses().add(createStatus("i-starting", JobStatus.Starting, VMStatus.starting)); + + when(vmTracker.getVmStatusForJob("123")).thenReturn(container); + + // When + List instances = invokeGetInstancesForJob("123"); + + // Then + assertEquals(1, instances.size()); + assertTrue(instances.contains("i-starting")); + } + + @Test + @DisplayName("getInstancesForJob returns empty list for non-existent job") + void getInstancesForJob_returnsEmptyForNonExistent() throws Exception { + // Given + when(vmTracker.getVmStatusForJob("999")).thenReturn(null); + + // When + List instances = invokeGetInstancesForJob("999"); + + // Then + assertNotNull(instances); + assertTrue(instances.isEmpty()); + } + + @Test + @DisplayName("getInstancesForJob excludes replaced instances (AgentWatchdog replacements)") + void getInstancesForJob_excludesReplacedInstances() throws Exception { + // Given + container.getStatuses().add(createStatus("i-running", JobStatus.Running, VMStatus.running)); + container.getStatuses().add(createStatus("i-replaced", JobStatus.Stopped, VMStatus.terminated)); + + when(vmTracker.getVmStatusForJob("123")).thenReturn(container); + + // When + List instances = invokeGetInstancesForJob("123"); + + // Then + assertEquals(1, instances.size()); + assertTrue(instances.contains("i-running")); + assertFalse(instances.contains("i-replaced")); + } + + @Test + @DisplayName("getInstancesForJob filters correctly with mixed statuses from AgentWatchdog scenario") + void getInstancesForJob_agentWatchdogScenario() throws Exception { + // Given: Real-world scenario where AgentWatchdog replaced some agents + // - 3 running agents (healthy) + // - 2 replaced agents (replaced by watchdog, now terminated) + container.getStatuses().add(createStatus("i-healthy1", JobStatus.Running, VMStatus.running)); + container.getStatuses().add(createStatus("i-healthy2", JobStatus.Running, VMStatus.running)); + container.getStatuses().add(createStatus("i-healthy3", JobStatus.Running, VMStatus.running)); + container.getStatuses().add(createStatus("i-replaced1", JobStatus.Stopped, VMStatus.terminated)); + container.getStatuses().add(createStatus("i-replaced2", JobStatus.Stopped, VMStatus.terminated)); + + when(vmTracker.getVmStatusForJob("123")).thenReturn(container); + + // When: User tries to kill the job + List instances = invokeGetInstancesForJob("123"); + + // Then: Only healthy instances should receive the kill command + assertEquals(3, instances.size()); + assertTrue(instances.contains("i-healthy1")); + assertTrue(instances.contains("i-healthy2")); + assertTrue(instances.contains("i-healthy3")); + assertFalse(instances.contains("i-replaced1")); + assertFalse(instances.contains("i-replaced2")); + } + + @Test + @DisplayName("getVmStatus delegates to vmTracker") + void getVmStatus_delegatesToVmTracker() { + String instanceId = "i-test123"; + CloudVmStatus expectedStatus = createStatus(instanceId, JobStatus.Running, VMStatus.running); + when(vmTracker.getStatus(instanceId)).thenReturn(expectedStatus); + + CloudVmStatus result = jobEventSender.getVmStatus(instanceId); + + assertEquals(expectedStatus, result); + verify(vmTracker).getStatus(instanceId); + } + + @Test + @DisplayName("getVmStatusForJob delegates to vmTracker") + void getVmStatusForJob_delegatesToVmTracker() { + String jobId = "123"; + when(vmTracker.getVmStatusForJob(jobId)).thenReturn(container); + + CloudVmStatusContainer result = jobEventSender.getVmStatusForJob(jobId); + + assertEquals(container, result); + verify(vmTracker).getVmStatusForJob(jobId); + } +} + diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java index 01b3ee569..75fd934b2 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java @@ -19,9 +19,11 @@ import static com.intuit.tank.vm.common.TankConstants.NOTIFICATIONS_EVENT_EVENT_TIME_KEY; import java.text.SimpleDateFormat; +import java.util.ArrayList; import java.util.Collections; import java.util.Date; import java.util.HashSet; +import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.ArrayBlockingQueue; @@ -235,17 +237,26 @@ private JobQueueStatus getQueueStatus(JobQueueStatus oldStatus, JobStatus jobSta /** * If the vm is shutting down, terminated, or replaced, don't update the status to something else. + * This prevents race conditions where a user kills an instance while watchdog is replacing it, + * or stale status updates arrive after an instance has already transitioned to a terminal state. * @param currentStatus * @return */ private boolean shouldUpdateStatus(CloudVmStatus currentStatus) { if (currentStatus != null) { VMStatus status = currentStatus.getVmStatus(); - return (status != VMStatus.shutting_down - && status != VMStatus.stopped - && status != VMStatus.stopping - && status != VMStatus.terminated - && status != VMStatus.replaced); + boolean isTerminalState = (status == VMStatus.shutting_down + || status == VMStatus.stopped + || status == VMStatus.stopping + || status == VMStatus.terminated + || status == VMStatus.replaced); + if (isTerminalState) { + LOG.info(new ObjectMessage(Map.of("Message", + "Ignoring status update for instance " + currentStatus.getInstanceId() + + " - already in terminal state: " + status + + " (possible race between user kill and watchdog replace)"))); + return false; + } } return true; } @@ -282,12 +293,19 @@ public void removeStatusForInstance(String instanceId) { */ @Override public void removeStatusForJob(String jobId) { - CloudVmStatusContainer cloudVmStatusContainer = jobMap.get(jobId); - if (cloudVmStatusContainer != null) { - for (CloudVmStatus s : cloudVmStatusContainer.getStatuses()) { - removeStatusForInstance(s.getInstanceId()); + // Synchronize on the same lock used by setStatusThread to prevent + // ConcurrentModificationException when iterating over statuses + synchronized (getCacheSyncObject(jobId)) { + CloudVmStatusContainer cloudVmStatusContainer = jobMap.get(jobId); + if (cloudVmStatusContainer != null) { + // Copy to avoid ConcurrentModificationException - the underlying + // HashSet could be modified by setStatusThread while we iterate + List statusesCopy = new ArrayList<>(cloudVmStatusContainer.getStatuses()); + for (CloudVmStatus s : statusesCopy) { + removeStatusForInstance(s.getInstanceId()); + } + jobMap.remove(jobId); } - jobMap.remove(jobId); } } @@ -344,11 +362,15 @@ private void addStatusToJobContainer(CloudVmStatus status, CloudVmStatusContaine JobInstance job = jobInstanceDao.get().findById(Integer.parseInt(status.getJobId())); // Log all statuses in container for debugging + // Take a snapshot to avoid ConcurrentModificationException if another thread modifies + // the set while we iterate (defense in depth, even though we're synchronized) + Set statusesSnapshot = new HashSet<>(cloudVmStatusContainer.getStatuses()); LOG.info(new ObjectMessage(Map.of("Message", "Status calculation for job " + status.getJobId() + " - Container has " + - cloudVmStatusContainer.getStatuses().size() + " statuses"))); + statusesSnapshot.size() + " statuses"))); - for (CloudVmStatus s : cloudVmStatusContainer.getStatuses()) { + int activeInstanceCount = 0; + for (CloudVmStatus s : statusesSnapshot) { VMStatus vmStatus = s.getVmStatus(); // skip replaced instances - these were replaced by AgentWatchdog due to failure // but do NOT skip terminated/stopping/stopped/shutting_down - these are active agents in transition @@ -358,6 +380,7 @@ private void addStatusToJobContainer(CloudVmStatus status, CloudVmStatusContaine " in job status calculation for job " + status.getJobId()))); continue; } + activeInstanceCount++; LOG.info(new ObjectMessage(Map.of("Message", "Checking instance " + s.getInstanceId() + @@ -381,6 +404,15 @@ private void addStatusToJobContainer(CloudVmStatus status, CloudVmStatusContaine } } + // if all instances are replaced and replacements haven't reported yet, + // don't change job status - wait for active agents to report + if (activeInstanceCount == 0) { + LOG.info(new ObjectMessage(Map.of("Message", + "No active instances for job " + status.getJobId() + + " (all replaced or empty) - skipping status calculation until replacements report"))); + return; + } + LOG.debug(new ObjectMessage(Map.of("Message", "Status calc complete for job " + status.getJobId() + " - isFinished=" + isFinished + ", paused=" + paused + diff --git a/tank_vmManager/src/test/java/com/intuit/tank/vmManager/VMTrackerImplTest.java b/tank_vmManager/src/test/java/com/intuit/tank/vmManager/VMTrackerImplTest.java new file mode 100644 index 000000000..5c5690b79 --- /dev/null +++ b/tank_vmManager/src/test/java/com/intuit/tank/vmManager/VMTrackerImplTest.java @@ -0,0 +1,105 @@ +package com.intuit.tank.vmManager; + +import com.intuit.tank.vm.vmManager.models.CloudVmStatusContainer; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.DisplayName; + +import java.util.Set; + +import static org.junit.jupiter.api.Assertions.*; + +/** + * Unit tests for VMTrackerImpl, focusing on the bug fix for terminated agents + * blocking job status transitions. + */ +public class VMTrackerImplTest { + + private VMTrackerImpl vmTracker; + + @BeforeEach + void setUp() { + vmTracker = new VMTrackerImpl(); + } + + @Test + @DisplayName("getStatus returns null for non-existent instance") + void getStatus_returnsNullForNonExistent() { + assertNull(vmTracker.getStatus("non-existent-instance")); + } + + @Test + @DisplayName("removeStatusForInstance handles non-existent instance gracefully") + void removeStatusForInstance_handlesNonExistent() { + String instanceId = "i-test123"; + + // First verify getStatus returns null initially + assertNull(vmTracker.getStatus(instanceId)); + + // Call removeStatusForInstance - should handle non-existent gracefully (no exception) + vmTracker.removeStatusForInstance(instanceId); + + // Should still be null (no exception thrown) + assertNull(vmTracker.getStatus(instanceId)); + } + + @Test + @DisplayName("getVmStatusForJob returns null for non-existent job") + void getVmStatusForJob_returnsNullForNonExistent() { + assertNull(vmTracker.getVmStatusForJob("non-existent-job")); + } + + @Test + @DisplayName("getAllJobs returns empty set initially") + void getAllJobs_returnsEmptySetInitially() { + Set jobs = vmTracker.getAllJobs(); + assertNotNull(jobs); + assertTrue(jobs.isEmpty()); + } + + @Test + @DisplayName("stopJob marks job as stopped") + void stopJob_marksJobAsStopped() { + String jobId = "123"; + + // Initially running + assertTrue(vmTracker.isRunning(jobId)); + + // Stop the job + vmTracker.stopJob(jobId); + + // Now stopped + assertFalse(vmTracker.isRunning(jobId)); + } + + @Test + @DisplayName("isRunning returns true for jobs not explicitly stopped") + void isRunning_returnsTrueByDefault() { + assertTrue(vmTracker.isRunning("any-job-id")); + } + + @Test + @DisplayName("removeStatusForJob handles non-existent job gracefully") + void removeStatusForJob_handlesNonExistent() { + // Should not throw + vmTracker.removeStatusForJob("non-existent-job"); + + // Verify state is still consistent + assertNull(vmTracker.getVmStatusForJob("non-existent-job")); + } + + @Test + @DisplayName("isDevMode returns false by default") + void isDevMode_returnsFalseByDefault() { + // The devMode is set based on TankConfig which defaults to false + // unless explicitly configured as standalone + assertFalse(vmTracker.isDevMode()); + } + + @Test + @DisplayName("getProjectStatusContainer returns null for non-existent project") + void getProjectStatusContainer_returnsNullForNonExistent() { + assertNull(vmTracker.getProjectStatusContainer("non-existent-project")); + } +} + From c5cfdfba83a4a7486a71f526e25dae1313893f88 Mon Sep 17 00:00:00 2001 From: zkofiro Date: Tue, 6 Jan 2026 14:16:04 -0800 Subject: [PATCH 08/14] up that test coverage for new code --- .../tank/agent/models/VMStatusTest.java | 75 +++++++++++ .../vm/vmManager/models/VMStatusTest.java | 101 ++++++++++----- .../mvc/rest/cloud/JobEventSenderTest.java | 33 ++++- .../tank/vmManager/VMTrackerImplTest.java | 97 ++++++++++++++- .../intuit/tank/job/ActJobNodeBeanTest.java | 116 ++++++++++++++++++ 5 files changed, 382 insertions(+), 40 deletions(-) create mode 100644 api/src/test/java/com/intuit/tank/agent/models/VMStatusTest.java diff --git a/api/src/test/java/com/intuit/tank/agent/models/VMStatusTest.java b/api/src/test/java/com/intuit/tank/agent/models/VMStatusTest.java new file mode 100644 index 000000000..5360a0ee5 --- /dev/null +++ b/api/src/test/java/com/intuit/tank/agent/models/VMStatusTest.java @@ -0,0 +1,75 @@ +package com.intuit.tank.agent.models; + +import org.junit.jupiter.api.*; + +import static org.junit.jupiter.api.Assertions.*; + +/** + * Tests for the agent-side VMStatus enum. + * This enum is used by agents and doesn't include the 'replaced' status + * (which is only meaningful on the controller side). + */ +public class VMStatusTest { + + @Test + @DisplayName("fromString handles shutting-down special case") + public void testFromString_shuttingDown() { + VMStatus result = VMStatus.fromString("shutting-down"); + + assertNotNull(result); + assertEquals(VMStatus.shutting_down, result); + } + + @Test + @DisplayName("fromString returns correct enum for valid values") + public void testFromString_validValues() { + assertEquals(VMStatus.running, VMStatus.fromString("running")); + assertEquals(VMStatus.pending, VMStatus.fromString("pending")); + assertEquals(VMStatus.starting, VMStatus.fromString("starting")); + assertEquals(VMStatus.rebooting, VMStatus.fromString("rebooting")); + assertEquals(VMStatus.terminated, VMStatus.fromString("terminated")); + assertEquals(VMStatus.stopped, VMStatus.fromString("stopped")); + assertEquals(VMStatus.stopping, VMStatus.fromString("stopping")); + assertEquals(VMStatus.rampPaused, VMStatus.fromString("rampPaused")); + } + + @Test + @DisplayName("fromString returns unknown for null input") + public void testFromString_nullReturnsUnknown() { + VMStatus result = VMStatus.fromString(null); + + assertNotNull(result); + assertEquals(VMStatus.unknown, result); + } + + @Test + @DisplayName("fromString returns unknown for empty string") + public void testFromString_emptyReturnsUnknown() { + VMStatus result = VMStatus.fromString(""); + + assertNotNull(result); + assertEquals(VMStatus.unknown, result); + } + + @Test + @DisplayName("fromString returns unknown for unrecognized values") + public void testFromString_unknownValueReturnsUnknown() { + // Should not throw IllegalArgumentException - gracefully returns unknown + VMStatus result = VMStatus.fromString("garbage-value"); + + assertNotNull(result); + assertEquals(VMStatus.unknown, result); + } + + @Test + @DisplayName("fromString handles 'replaced' from controller gracefully (returns unknown)") + public void testFromString_replacedFromControllerReturnsUnknown() { + // The agent-side VMStatus doesn't have a 'replaced' enum value + // When the controller sends 'replaced', the agent should handle it gracefully + VMStatus result = VMStatus.fromString("replaced"); + + assertNotNull(result); + assertEquals(VMStatus.unknown, result); + } +} + diff --git a/api/src/test/java/com/intuit/tank/vm/vmManager/models/VMStatusTest.java b/api/src/test/java/com/intuit/tank/vm/vmManager/models/VMStatusTest.java index 117793759..66f250585 100644 --- a/api/src/test/java/com/intuit/tank/vm/vmManager/models/VMStatusTest.java +++ b/api/src/test/java/com/intuit/tank/vm/vmManager/models/VMStatusTest.java @@ -19,50 +19,85 @@ /** * The class VMStatusTest contains tests for the class {@link VMStatus}. - * - * @generatedBy CodePro at 12/15/14 2:57 PM */ public class VMStatusTest { - /** - * Run the VMStatus fromString(String) method test. - * - * @throws Exception - * - * @generatedBy CodePro at 12/15/14 2:57 PM - */ + @Test - @Disabled - public void testFromString_1() - throws Exception { - String value = "shutting-down"; + @DisplayName("fromString handles shutting-down special case") + public void testFromString_shuttingDown() { + VMStatus result = VMStatus.fromString("shutting-down"); + + assertNotNull(result); + assertEquals(VMStatus.shutting_down, result); + } - VMStatus result = VMStatus.fromString(value); + @Test + @DisplayName("fromString returns correct enum for valid values") + public void testFromString_validValues() { + assertEquals(VMStatus.running, VMStatus.fromString("running")); + assertEquals(VMStatus.pending, VMStatus.fromString("pending")); + assertEquals(VMStatus.starting, VMStatus.fromString("starting")); + assertEquals(VMStatus.ready, VMStatus.fromString("ready")); + assertEquals(VMStatus.rebooting, VMStatus.fromString("rebooting")); + assertEquals(VMStatus.terminated, VMStatus.fromString("terminated")); + assertEquals(VMStatus.stopped, VMStatus.fromString("stopped")); + assertEquals(VMStatus.stopping, VMStatus.fromString("stopping")); + assertEquals(VMStatus.rampPaused, VMStatus.fromString("rampPaused")); + } + @Test + @DisplayName("fromString returns replaced for 'replaced' value") + public void testFromString_replaced() { + VMStatus result = VMStatus.fromString("replaced"); + assertNotNull(result); - assertEquals("shutting_down", result.name()); - assertEquals("shutting_down", result.toString()); - assertEquals(6, result.ordinal()); + assertEquals(VMStatus.replaced, result); } - /** - * Run the VMStatus fromString(String) method test. - * - * @throws Exception - * - * @generatedBy CodePro at 12/15/14 2:57 PM - */ @Test - public void testFromString_2() - throws Exception { - String value = VMStatus.rebooting.name(); + @DisplayName("fromString returns unknown for null input") + public void testFromString_nullReturnsUnknown() { + VMStatus result = VMStatus.fromString(null); + + assertNotNull(result); + assertEquals(VMStatus.unknown, result); + } - VMStatus result = VMStatus.fromString(value); + @Test + @DisplayName("fromString returns unknown for empty string") + public void testFromString_emptyReturnsUnknown() { + VMStatus result = VMStatus.fromString(""); + + assertNotNull(result); + assertEquals(VMStatus.unknown, result); + } - // An unexpected exception was thrown in user code while executing this test: - // java.lang.IllegalArgumentException: No enum constant com.intuit.tank.vm.vmManager.models.VMStatus. - // at java.lang.Enum.valueOf(Enum.java:238) - // at com.intuit.tank.vm.vmManager.models.VMStatus.valueOf(VMStatus.java:5) - // at com.intuit.tank.vm.vmManager.models.VMStatus.fromString(VMStatus.java:20) + @Test + @DisplayName("fromString returns unknown for unrecognized values (graceful handling)") + public void testFromString_unknownValueReturnsUnknown() { + // Should not throw IllegalArgumentException - gracefully returns unknown + VMStatus result = VMStatus.fromString("garbage-value"); + assertNotNull(result); + assertEquals(VMStatus.unknown, result); + } + + @Test + @DisplayName("fromString handles future/unknown enum values gracefully") + public void testFromString_futureCompatibility() { + // Simulates receiving a value from a newer version of the API + VMStatus result = VMStatus.fromString("some_new_status_from_future"); + + assertNotNull(result); + assertEquals(VMStatus.unknown, result); + } + + @Test + @DisplayName("replaced is a terminal state (for documentation)") + public void testReplaced_isTerminalState() { + // This test documents that 'replaced' is intended as a terminal state + // like 'terminated', used by AgentWatchdog when replacing failed agents + assertNotNull(VMStatus.replaced); + assertEquals("replaced", VMStatus.replaced.name()); } } \ No newline at end of file diff --git a/rest-mvc/impl/src/test/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSenderTest.java b/rest-mvc/impl/src/test/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSenderTest.java index fb1693fbd..a6576e4aa 100644 --- a/rest-mvc/impl/src/test/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSenderTest.java +++ b/rest-mvc/impl/src/test/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSenderTest.java @@ -195,16 +195,16 @@ void getInstancesForJob_returnsEmptyForNonExistent() throws Exception { @Test @DisplayName("getInstancesForJob excludes replaced instances (AgentWatchdog replacements)") void getInstancesForJob_excludesReplacedInstances() throws Exception { - // Given + // Given: VMStatus.replaced is set by AgentWatchdog when an agent fails and is replaced container.getStatuses().add(createStatus("i-running", JobStatus.Running, VMStatus.running)); - container.getStatuses().add(createStatus("i-replaced", JobStatus.Stopped, VMStatus.terminated)); + container.getStatuses().add(createStatus("i-replaced", JobStatus.Starting, VMStatus.replaced)); when(vmTracker.getVmStatusForJob("123")).thenReturn(container); // When List instances = invokeGetInstancesForJob("123"); - // Then + // Then: Replaced instances should be filtered out (can't receive commands) assertEquals(1, instances.size()); assertTrue(instances.contains("i-running")); assertFalse(instances.contains("i-replaced")); @@ -215,12 +215,12 @@ void getInstancesForJob_excludesReplacedInstances() throws Exception { void getInstancesForJob_agentWatchdogScenario() throws Exception { // Given: Real-world scenario where AgentWatchdog replaced some agents // - 3 running agents (healthy) - // - 2 replaced agents (replaced by watchdog, now terminated) + // - 2 replaced agents (marked by watchdog with VMStatus.replaced) container.getStatuses().add(createStatus("i-healthy1", JobStatus.Running, VMStatus.running)); container.getStatuses().add(createStatus("i-healthy2", JobStatus.Running, VMStatus.running)); container.getStatuses().add(createStatus("i-healthy3", JobStatus.Running, VMStatus.running)); - container.getStatuses().add(createStatus("i-replaced1", JobStatus.Stopped, VMStatus.terminated)); - container.getStatuses().add(createStatus("i-replaced2", JobStatus.Stopped, VMStatus.terminated)); + container.getStatuses().add(createStatus("i-replaced1", JobStatus.Starting, VMStatus.replaced)); + container.getStatuses().add(createStatus("i-replaced2", JobStatus.Starting, VMStatus.replaced)); when(vmTracker.getVmStatusForJob("123")).thenReturn(container); @@ -236,6 +236,27 @@ void getInstancesForJob_agentWatchdogScenario() throws Exception { assertFalse(instances.contains("i-replaced2")); } + @Test + @DisplayName("getInstancesForJob excludes all terminal states correctly") + void getInstancesForJob_excludesAllTerminalStates() throws Exception { + // Given: One instance of each terminal state + one running + container.getStatuses().add(createStatus("i-running", JobStatus.Running, VMStatus.running)); + container.getStatuses().add(createStatus("i-terminated", JobStatus.Completed, VMStatus.terminated)); + container.getStatuses().add(createStatus("i-replaced", JobStatus.Starting, VMStatus.replaced)); + container.getStatuses().add(createStatus("i-stopped", JobStatus.Stopped, VMStatus.stopped)); + container.getStatuses().add(createStatus("i-stopping", JobStatus.Stopped, VMStatus.stopping)); + container.getStatuses().add(createStatus("i-shutting-down", JobStatus.Stopped, VMStatus.shutting_down)); + + when(vmTracker.getVmStatusForJob("123")).thenReturn(container); + + // When + List instances = invokeGetInstancesForJob("123"); + + // Then: Only the running instance should be included + assertEquals(1, instances.size()); + assertTrue(instances.contains("i-running")); + } + @Test @DisplayName("getVmStatus delegates to vmTracker") void getVmStatus_delegatesToVmTracker() { diff --git a/tank_vmManager/src/test/java/com/intuit/tank/vmManager/VMTrackerImplTest.java b/tank_vmManager/src/test/java/com/intuit/tank/vmManager/VMTrackerImplTest.java index 5c5690b79..c329962b0 100644 --- a/tank_vmManager/src/test/java/com/intuit/tank/vmManager/VMTrackerImplTest.java +++ b/tank_vmManager/src/test/java/com/intuit/tank/vmManager/VMTrackerImplTest.java @@ -1,17 +1,27 @@ package com.intuit.tank.vmManager; +import com.intuit.tank.vm.vmManager.models.CloudVmStatus; import com.intuit.tank.vm.vmManager.models.CloudVmStatusContainer; +import com.intuit.tank.vm.vmManager.models.VMStatus; +import com.intuit.tank.vm.vmManager.models.ValidationStatus; +import com.intuit.tank.vm.api.enumerated.JobStatus; +import com.intuit.tank.vm.api.enumerated.VMImageType; +import com.intuit.tank.vm.api.enumerated.VMRegion; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; +import java.lang.reflect.Method; +import java.util.Date; import java.util.Set; import static org.junit.jupiter.api.Assertions.*; /** * Unit tests for VMTrackerImpl, focusing on the bug fix for terminated agents - * blocking job status transitions. + * blocking job status transitions and the VMStatus.replaced handling. */ public class VMTrackerImplTest { @@ -22,6 +32,8 @@ void setUp() { vmTracker = new VMTrackerImpl(); } + // ============ Basic functionality tests ============ + @Test @DisplayName("getStatus returns null for non-existent instance") void getStatus_returnsNullForNonExistent() { @@ -101,5 +113,88 @@ void isDevMode_returnsFalseByDefault() { void getProjectStatusContainer_returnsNullForNonExistent() { assertNull(vmTracker.getProjectStatusContainer("non-existent-project")); } + + // ============ shouldUpdateStatus tests (terminal state handling) ============ + + /** + * Use reflection to test the private shouldUpdateStatus method. + */ + private boolean invokeShouldUpdateStatus(CloudVmStatus status) throws Exception { + Method method = VMTrackerImpl.class.getDeclaredMethod("shouldUpdateStatus", CloudVmStatus.class); + method.setAccessible(true); + return (boolean) method.invoke(vmTracker, status); + } + + private CloudVmStatus createStatus(String instanceId, VMStatus vmStatus) { + return new CloudVmStatus( + instanceId, + "123", + "sg-test", + JobStatus.Starting, + VMImageType.AGENT, + VMRegion.US_WEST_2, + vmStatus, + new ValidationStatus(), + 100, + 50, + new Date(), + null + ); + } + + @Test + @DisplayName("shouldUpdateStatus returns true for null current status") + void shouldUpdateStatus_nullCurrentStatus_returnsTrue() throws Exception { + assertTrue(invokeShouldUpdateStatus(null)); + } + + @ParameterizedTest + @EnumSource(value = VMStatus.class, names = {"terminated", "replaced", "stopped", "stopping", "shutting_down"}) + @DisplayName("shouldUpdateStatus returns false for terminal states") + void shouldUpdateStatus_terminalStates_returnsFalse(VMStatus terminalStatus) throws Exception { + CloudVmStatus status = createStatus("i-123", terminalStatus); + assertFalse(invokeShouldUpdateStatus(status), + "Should reject updates for terminal state: " + terminalStatus); + } + + @ParameterizedTest + @EnumSource(value = VMStatus.class, names = {"unknown", "starting", "pending", "ready", "running", "rampPaused", "rebooting"}) + @DisplayName("shouldUpdateStatus returns true for active states") + void shouldUpdateStatus_activeStates_returnsTrue(VMStatus activeStatus) throws Exception { + CloudVmStatus status = createStatus("i-123", activeStatus); + assertTrue(invokeShouldUpdateStatus(status), + "Should allow updates for active state: " + activeStatus); + } + + @Test + @DisplayName("shouldUpdateStatus blocks update for replaced instance (AgentWatchdog scenario)") + void shouldUpdateStatus_replacedInstance_blocksUpdate() throws Exception { + // Given: An instance that was replaced by AgentWatchdog + CloudVmStatus replacedStatus = createStatus("i-replaced", VMStatus.replaced); + + // When/Then: Updates should be blocked + assertFalse(invokeShouldUpdateStatus(replacedStatus), + "Replaced instances should not accept status updates (race condition guard)"); + } + + // ============ VMStatus.replaced integration tests ============ + + @Test + @DisplayName("replaced status is recognized as a valid VMStatus") + void replacedStatus_isValidEnum() { + assertNotNull(VMStatus.replaced); + assertEquals("replaced", VMStatus.replaced.name()); + } + + @Test + @DisplayName("setStatus respects terminal state guard for replaced instances") + void setStatus_respectsTerminalGuard_forReplaced() throws Exception { + // Note: This is a partial test - full integration requires mocked dependencies + // The key behavior is tested via shouldUpdateStatus tests above + + // Verify the enum value exists and can be used + CloudVmStatus status = createStatus("i-test", VMStatus.replaced); + assertEquals(VMStatus.replaced, status.getVmStatus()); + } } diff --git a/web/web_support/src/test/java/com/intuit/tank/job/ActJobNodeBeanTest.java b/web/web_support/src/test/java/com/intuit/tank/job/ActJobNodeBeanTest.java index 54431df82..378ab5f94 100644 --- a/web/web_support/src/test/java/com/intuit/tank/job/ActJobNodeBeanTest.java +++ b/web/web_support/src/test/java/com/intuit/tank/job/ActJobNodeBeanTest.java @@ -745,4 +745,120 @@ public void testSetVmBeans_1() // at com.intuit.tank.project.JobConfiguration.(JobConfiguration.java:63) // at com.intuit.tank.project.Workload.(Workload.java:57) } + + // ============ getCurrentSubNodes tests (VMStatus.replaced filtering) ============ + + /** + * Helper to create a VMNodeBean with a specific VMStatus. + */ + private VMNodeBean createVMNodeBean(String instanceId, VMStatus vmStatus) { + CloudVmStatus cvs = new CloudVmStatus( + instanceId, + "123", + "sg-1", + JobStatus.Running, + VMImageType.AGENT, + VMRegion.US_EAST_2, + vmStatus, + new ValidationStatus(), + 10, + 100, + new Date(), + null + ); + return new VMNodeBean(cvs, true, FastDateFormat.getDateTimeInstance(FastDateFormat.MEDIUM, FastDateFormat.MEDIUM)); + } + + @Test + public void testGetCurrentSubNodes_filtersReplacedInstances() throws Exception { + // Setup + Workload workload = new Workload(); + workload.setJobConfiguration(new JobConfiguration()); + JobInstance jobInstance = new JobInstance(workload, "test"); + jobInstance.setStatus(JobQueueStatus.Running); + ActJobNodeBean fixture = new ActJobNodeBean(jobInstance, true, FastDateFormat.getDateTimeInstance(FastDateFormat.MEDIUM, FastDateFormat.MEDIUM)); + + List vmBeans = new LinkedList<>(); + vmBeans.add(createVMNodeBean("i-001", VMStatus.running)); + vmBeans.add(createVMNodeBean("i-002", VMStatus.running)); + vmBeans.add(createVMNodeBean("i-003", VMStatus.replaced)); // Should be filtered + vmBeans.add(createVMNodeBean("i-004", VMStatus.running)); + fixture.setVmBeans(vmBeans); + + // Execute + List result = fixture.getCurrentSubNodes(); + + // Verify: replaced instance should be filtered out + assertEquals(3, result.size(), "Should exclude replaced instance"); + assertEquals(4, fixture.getSubNodes().size(), "getSubNodes should include ALL"); + } + + @Test + public void testGetCurrentSubNodes_filtersTerminatedInstances() throws Exception { + // Setup + Workload workload = new Workload(); + workload.setJobConfiguration(new JobConfiguration()); + JobInstance jobInstance = new JobInstance(workload, "test"); + jobInstance.setStatus(JobQueueStatus.Running); + ActJobNodeBean fixture = new ActJobNodeBean(jobInstance, true, FastDateFormat.getDateTimeInstance(FastDateFormat.MEDIUM, FastDateFormat.MEDIUM)); + + List vmBeans = new LinkedList<>(); + vmBeans.add(createVMNodeBean("i-001", VMStatus.running)); + vmBeans.add(createVMNodeBean("i-002", VMStatus.terminated)); // Should be filtered + vmBeans.add(createVMNodeBean("i-003", VMStatus.running)); + fixture.setVmBeans(vmBeans); + + // Execute + List result = fixture.getCurrentSubNodes(); + + // Verify + assertEquals(2, result.size(), "Should exclude terminated instance"); + } + + @Test + public void testGetCurrentSubNodes_filtersBothReplacedAndTerminated() throws Exception { + // Setup: Mix of running, replaced, and terminated + Workload workload = new Workload(); + workload.setJobConfiguration(new JobConfiguration()); + JobInstance jobInstance = new JobInstance(workload, "test"); + jobInstance.setStatus(JobQueueStatus.Running); + ActJobNodeBean fixture = new ActJobNodeBean(jobInstance, true, FastDateFormat.getDateTimeInstance(FastDateFormat.MEDIUM, FastDateFormat.MEDIUM)); + + List vmBeans = new LinkedList<>(); + vmBeans.add(createVMNodeBean("i-001", VMStatus.running)); + vmBeans.add(createVMNodeBean("i-002", VMStatus.running)); + vmBeans.add(createVMNodeBean("i-003", VMStatus.replaced)); // Should be filtered + vmBeans.add(createVMNodeBean("i-004", VMStatus.terminated)); // Should be filtered + vmBeans.add(createVMNodeBean("i-005", VMStatus.running)); + fixture.setVmBeans(vmBeans); + + // Execute + List result = fixture.getCurrentSubNodes(); + + // Verify: Both replaced and terminated should be filtered + assertEquals(3, result.size(), "Should only include running instances"); + assertEquals(5, fixture.getSubNodes().size(), "getSubNodes should include ALL for visibility"); + } + + @Test + public void testGetCurrentSubNodes_allReplaced_returnsEmpty() throws Exception { + // Setup: Edge case where all instances are replaced + Workload workload = new Workload(); + workload.setJobConfiguration(new JobConfiguration()); + JobInstance jobInstance = new JobInstance(workload, "test"); + jobInstance.setStatus(JobQueueStatus.Starting); + ActJobNodeBean fixture = new ActJobNodeBean(jobInstance, true, FastDateFormat.getDateTimeInstance(FastDateFormat.MEDIUM, FastDateFormat.MEDIUM)); + + List vmBeans = new LinkedList<>(); + vmBeans.add(createVMNodeBean("i-001", VMStatus.replaced)); + vmBeans.add(createVMNodeBean("i-002", VMStatus.replaced)); + fixture.setVmBeans(vmBeans); + + // Execute + List result = fixture.getCurrentSubNodes(); + + // Verify + assertEquals(0, result.size(), "Should return empty when all replaced"); + assertEquals(2, fixture.getSubNodes().size(), "All should still be visible in getSubNodes"); + } } \ No newline at end of file From 574b01e992fae7e878a7ac3f17f2010de8bb86c7 Mon Sep 17 00:00:00 2001 From: zkofiro Date: Tue, 6 Jan 2026 15:46:05 -0800 Subject: [PATCH 09/14] we can undo the bandaid 'pending' agent status implemented for replacing agents now --- .../intuit/tank/vmManager/AgentWatchdog.java | 25 ++-- .../tank/vmManager/AgentWatchdogTest.java | 114 ++++++++++++++++++ 2 files changed, 132 insertions(+), 7 deletions(-) diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java index 793d60e5e..6546e9f45 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java @@ -246,12 +246,18 @@ private void relaunch(ArrayList instances) { instances.clear(); // Create and send instance start request List newVms = amazonInstance.create(instanceRequest); - // Add new instances + // Add new instances - set to 'starting' status so watchdog waits for actual /v2/agent/ready call for (VMInformation newInfo : newVms) { vmInfo.add(newInfo); - // Add directly to started instances since these are restarted from scratch + // Add to startedInstances - watchdog will wait for this agent to actually report startedInstances.add(newInfo); - vmTracker.setStatus(createCloudStatus(instanceRequest, newInfo)); + CloudVmStatus newStatus = createCloudStatus(instanceRequest, newInfo); + vmTracker.setStatus(newStatus); + LOG.info(new ObjectMessage(Map.of( + "Message", "Created replacement agent with status " + newStatus.getVmStatus() + + " - watchdog will wait for /v2/agent/ready call", + "instanceId", newInfo.getInstanceId(), + "jobId", jobId))); // TEMP_IP_LOGGING - START LOG.info(new ObjectMessage(Map.of( "Message", "Added relaunched image to VMImage table for job " + jobId, @@ -273,15 +279,20 @@ private void relaunch(ArrayList instances) { } /** - * @param req - * @param info - * @return + * Creates initial cloud status for a newly launched replacement agent. + * CRITICAL: Must use VMStatus.starting (not pending) so watchdog waits for actual agent registration. + * + * Status flow: starting → (agent calls /v2/agent/ready) → pending → (receives START) → ready → running + * + * @param req the instance request + * @param info the VM information for the new instance + * @return CloudVmStatus with starting state */ private CloudVmStatus createCloudStatus(VMInstanceRequest req, VMInformation info) { return new CloudVmStatus(info.getInstanceId(), req.getJobId(), req.getInstanceDescription() != null ? req.getInstanceDescription().getSecurityGroup() : "unknown", JobStatus.Starting, - VMImageType.AGENT, req.getRegion(), VMStatus.pending, new ValidationStatus(), 0, 0, null, null); + VMImageType.AGENT, req.getRegion(), VMStatus.starting, new ValidationStatus(), 0, 0, null, null); } private boolean shouldRelaunchInstances() { diff --git a/tank_vmManager/src/test/java/com/intuit/tank/vmManager/AgentWatchdogTest.java b/tank_vmManager/src/test/java/com/intuit/tank/vmManager/AgentWatchdogTest.java index e43775931..82f714472 100644 --- a/tank_vmManager/src/test/java/com/intuit/tank/vmManager/AgentWatchdogTest.java +++ b/tank_vmManager/src/test/java/com/intuit/tank/vmManager/AgentWatchdogTest.java @@ -17,6 +17,7 @@ import org.mockito.Mockito; import org.mockito.junit.jupiter.MockitoExtension; +import java.lang.reflect.Method; import java.util.ArrayList; import java.util.Collections; import java.util.Date; @@ -98,4 +99,117 @@ public void progressToPendingRunTest(@Mock VMTracker vmTrackerMock, @Mock CloudV verify(cloudVmStatusContainerMock, times(2)).getEndTime(); verify(cloudVmStatusContainerMock, times(2)).getStatuses(); } + + /** + * CRITICAL TEST: Verifies that replacement agents are created with VMStatus.starting, + * NOT VMStatus.pending. This is essential because: + * + * - checkForReportingInstances() considers 'pending' as "agent has reported" + * - If replacements start with 'pending', they're immediately moved to reportedInstances + * - This causes watchdog to exit before agents actually call /v2/agent/ready + * - JobManager then waits forever for registrations that never come + * + * See: https://github.intuit.com/user/Tank/issues/XXX (if applicable) + */ + @Test + public void createCloudStatus_usesStartingNotPending() throws Exception { + VMTracker vmTracker = new VMTrackerImpl(); + VMInstanceRequest instanceRequest = new VMInstanceRequest(); + instanceRequest.setJobId("test-job-123"); + instanceRequest.setRegion(VMRegion.STANDALONE); + List vmInfo = new ArrayList<>(); + + AgentWatchdog agentWatchdog = new AgentWatchdog(instanceRequest, vmInfo, vmTracker); + + // Use reflection to access private createCloudStatus method + Method createCloudStatus = AgentWatchdog.class.getDeclaredMethod( + "createCloudStatus", VMInstanceRequest.class, VMInformation.class); + createCloudStatus.setAccessible(true); + + VMInformation vmInformation = new VMInformation(); + vmInformation.setInstanceId("i-replacement-test"); + + CloudVmStatus result = (CloudVmStatus) createCloudStatus.invoke( + agentWatchdog, instanceRequest, vmInformation); + + // THE CRITICAL ASSERTION: Must be 'starting', not 'pending' + assertEquals(VMStatus.starting, result.getVmStatus(), + "Replacement agents MUST start with VMStatus.starting so watchdog waits for actual /v2/agent/ready call. " + + "Using 'pending' causes watchdog to immediately consider them as 'reported' and exit prematurely."); + + // Verify other fields are correct + assertEquals("test-job-123", result.getJobId()); + assertEquals("i-replacement-test", result.getInstanceId()); + assertEquals(JobStatus.Starting, result.getJobStatus()); + } + + /** + * Verifies the checkForReportingInstances() logic only considers 'pending' agents as reported. + * This test ensures that 'starting' agents are NOT moved to reportedInstances. + */ + @Test + public void checkForReportingInstances_onlyMovesPendingAgents( + @Mock VMTracker vmTrackerMock, + @Mock CloudVmStatusContainer cloudVmStatusContainerMock) { + + // Setup: Two agents - one starting (should wait), one pending (should be reported) + when(cloudVmStatusContainerMock.getEndTime()).thenReturn(null); + + CloudVmStatus vmstatusStarting = new CloudVmStatus( + "i-still-starting", "123", "sg-123456", + JobStatus.Starting, VMImageType.AGENT, VMRegion.STANDALONE, + VMStatus.starting, new ValidationStatus(), 1, 1, new Date(), new Date()); + + CloudVmStatus vmstatusPending = new CloudVmStatus( + "i-reported", "123", "sg-123456", + JobStatus.Starting, VMImageType.AGENT, VMRegion.STANDALONE, + VMStatus.pending, new ValidationStatus(), 1, 1, new Date(), new Date()); + + Set statuses = new HashSet<>(); + statuses.add(vmstatusStarting); + statuses.add(vmstatusPending); + when(cloudVmStatusContainerMock.getStatuses()).thenReturn(statuses); + when(vmTrackerMock.getVmStatusForJob(null)).thenReturn(cloudVmStatusContainerMock); + + // Create VMInformation for both agents + VMInformation vmInfoStarting = new VMInformation(); + vmInfoStarting.setState("pending"); + vmInfoStarting.setInstanceId("i-still-starting"); + + VMInformation vmInfoPending = new VMInformation(); + vmInfoPending.setState("pending"); + vmInfoPending.setInstanceId("i-reported"); + + List vmInfo = new ArrayList<>(); + vmInfo.add(vmInfoStarting); + vmInfo.add(vmInfoPending); + + VMInstanceRequest instanceRequest = new VMInstanceRequest(); + instanceRequest.setRegion(VMRegion.STANDALONE); + + // Short timeout to let it check once then timeout waiting for i-still-starting + // maxWaitForResponse=1 means it will immediately try to relaunch + // But since we're not mocking amazonInstance.create, it should fail + // and we just verify the initial check behavior + AgentWatchdog agentWatchdog = new AgentWatchdog( + instanceRequest, vmInfo, vmTrackerMock, amazonInstanceMock, 10, 50); + + // Run in a way that only checks once - the job will "appear stopped" after first check + when(cloudVmStatusContainerMock.getEndTime()) + .thenReturn(null) // First check + .thenReturn(new Date()); // Second check - job appears stopped, exit + + try { + agentWatchdog.run(); + } catch (RuntimeException e) { + // Expected - job appears stopped + } + + // After ONE check: + // - i-reported (pending) should be in reportedInstances + // - i-still-starting (starting) should still be in startedInstances + // The key assertion is that the watchdog did NOT immediately exit + // because i-still-starting is still 'starting' not 'pending' + verify(cloudVmStatusContainerMock, atLeast(1)).getStatuses(); + } } From b92c4c6e001f1ada97fee4e36fdffe17ba93a970 Mon Sep 17 00:00:00 2001 From: zkofiro Date: Wed, 7 Jan 2026 11:24:10 -0800 Subject: [PATCH 10/14] fix: race condition in removeStatusForInstance + add unit tests + remove verbose logging --- .../intuit/tank/vmManager/AgentWatchdog.java | 4 +- .../intuit/tank/vmManager/VMTrackerImpl.java | 45 +-- .../tank/vmManager/VMTrackerImplTest.java | 328 ++++++++++++++++++ 3 files changed, 349 insertions(+), 28 deletions(-) diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java index 6546e9f45..2778d775d 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java @@ -147,8 +147,8 @@ public void run() { relaunch(startedInstances); startTime = System.currentTimeMillis(); } - LOG.info(new ObjectMessage(Map.of("Message","Job " + jobId + ": " + "Waiting for " + startedInstances.size() + " agents to report: " - + getInstanceIdList(startedInstances)))); + LOG.info(new ObjectMessage(Map.of("Message","Job " + jobId + ": Waiting for " + startedInstances.size() + " agents to report: " + + startedInstances.stream().map(VMInformation::getInstanceId).collect(Collectors.toList())))); Thread.sleep(sleepTime); } else { LOG.info(new ObjectMessage(Map.of("Message","All Agents Reported back for job " + jobId + "."))); diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java index 75fd934b2..25efb07bb 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java @@ -152,11 +152,6 @@ private void setStatusThread(@Nonnull final CloudVmStatus status) { status.setReportTime(new Date()); CloudVmStatus currentStatus = getStatus(status.getInstanceId()); - LOG.info(new ObjectMessage(Map.of("Message", - "Status update for instance " + status.getInstanceId() + - " - VMStatus: " + status.getVmStatus() + ", JobStatus: " + status.getJobStatus() + - ", Job: " + status.getJobId()))); - if (shouldUpdateStatus(currentStatus)) { statusMap.put(status.getInstanceId(), status); if (status.getVmStatus() == VMStatus.running @@ -267,23 +262,32 @@ private boolean shouldUpdateStatus(CloudVmStatus currentStatus) { */ @Override public void removeStatusForInstance(String instanceId) { - CloudVmStatus status = statusMap.remove(instanceId); + // First, get the status to find the jobId (without removing yet) + CloudVmStatus status = statusMap.get(instanceId); - // also remove from the job's container to keep counts accurate if (status != null) { String jobId = status.getJobId(); - // Synchronize on the same lock used by setStatusThread to prevent - // ConcurrentModificationException when iterating over statuses + // Synchronize on the job lock to ensure atomic removal from both + // statusMap and container - prevents race with setStatusThread synchronized (getCacheSyncObject(jobId)) { - CloudVmStatusContainer container = jobMap.get(jobId); - if (container != null) { - boolean removed = container.getStatuses().remove(status); - if (removed) { - LOG.info(new ObjectMessage(Map.of("Message", - "Removed instance " + instanceId + " from container for job " + jobId))); + // Now remove from statusMap inside the sync block + CloudVmStatus removedStatus = statusMap.remove(instanceId); + + // Also remove from the job's container to keep counts accurate + if (removedStatus != null) { + CloudVmStatusContainer container = jobMap.get(jobId); + if (container != null) { + boolean removed = container.getStatuses().remove(removedStatus); + if (removed) { + LOG.info(new ObjectMessage(Map.of("Message", + "Removed instance " + instanceId + " from container for job " + jobId))); + } } } } + } else { + // Instance not in statusMap - just try to remove (no-op if not present) + statusMap.remove(instanceId); } } @@ -361,13 +365,9 @@ private void addStatusToJobContainer(CloudVmStatus status, CloudVmStatusContaine // look up the job JobInstance job = jobInstanceDao.get().findById(Integer.parseInt(status.getJobId())); - // Log all statuses in container for debugging // Take a snapshot to avoid ConcurrentModificationException if another thread modifies // the set while we iterate (defense in depth, even though we're synchronized) Set statusesSnapshot = new HashSet<>(cloudVmStatusContainer.getStatuses()); - LOG.info(new ObjectMessage(Map.of("Message", - "Status calculation for job " + status.getJobId() + " - Container has " + - statusesSnapshot.size() + " statuses"))); int activeInstanceCount = 0; for (CloudVmStatus s : statusesSnapshot) { @@ -375,17 +375,10 @@ private void addStatusToJobContainer(CloudVmStatus status, CloudVmStatusContaine // skip replaced instances - these were replaced by AgentWatchdog due to failure // but do NOT skip terminated/stopping/stopped/shutting_down - these are active agents in transition if (vmStatus == VMStatus.replaced) { - LOG.info(new ObjectMessage(Map.of("Message", - "Skipping replaced instance " + s.getInstanceId() + - " in job status calculation for job " + status.getJobId()))); continue; } activeInstanceCount++; - LOG.info(new ObjectMessage(Map.of("Message", - "Checking instance " + s.getInstanceId() + - " - VMStatus: " + vmStatus + ", JobStatus: " + s.getJobStatus()))); - JobStatus jobStatus = s.getJobStatus(); if (jobStatus != JobStatus.Completed) { // If no VMs are Completed isFinished = false; diff --git a/tank_vmManager/src/test/java/com/intuit/tank/vmManager/VMTrackerImplTest.java b/tank_vmManager/src/test/java/com/intuit/tank/vmManager/VMTrackerImplTest.java index c329962b0..6849cfeb0 100644 --- a/tank_vmManager/src/test/java/com/intuit/tank/vmManager/VMTrackerImplTest.java +++ b/tank_vmManager/src/test/java/com/intuit/tank/vmManager/VMTrackerImplTest.java @@ -10,12 +10,20 @@ import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.RepeatedTest; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.EnumSource; import java.lang.reflect.Method; +import java.util.ArrayList; import java.util.Date; +import java.util.List; import java.util.Set; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; import static org.junit.jupiter.api.Assertions.*; @@ -196,5 +204,325 @@ void setStatus_respectsTerminalGuard_forReplaced() throws Exception { CloudVmStatus status = createStatus("i-test", VMStatus.replaced); assertEquals(VMStatus.replaced, status.getVmStatus()); } + + // ============ Status calculation tests (using reflection to bypass CDI dependencies) ============ + + private CloudVmStatus createStatusWithJobStatus(String instanceId, String jobId, VMStatus vmStatus, JobStatus jobStatus) { + return new CloudVmStatus( + instanceId, + jobId, + "sg-test", + jobStatus, + VMImageType.AGENT, + VMRegion.US_WEST_2, + vmStatus, + new ValidationStatus(), + 100, + 50, + new Date(), + null + ); + } + + /** + * Directly add status to internal maps using reflection (bypasses async setStatus and CDI dependencies). + * This allows testing the status calculation logic in isolation. + */ + @SuppressWarnings("unchecked") + private void addStatusDirectly(CloudVmStatus status) throws Exception { + // Access the private statusMap field + java.lang.reflect.Field statusMapField = VMTrackerImpl.class.getDeclaredField("statusMap"); + statusMapField.setAccessible(true); + java.util.concurrent.ConcurrentHashMap statusMap = + (java.util.concurrent.ConcurrentHashMap) statusMapField.get(vmTracker); + + // Access the private jobMap field + java.lang.reflect.Field jobMapField = VMTrackerImpl.class.getDeclaredField("jobMap"); + jobMapField.setAccessible(true); + java.util.concurrent.ConcurrentHashMap jobMap = + (java.util.concurrent.ConcurrentHashMap) jobMapField.get(vmTracker); + + // Add to statusMap + statusMap.put(status.getInstanceId(), status); + + // Add to job container + String jobId = status.getJobId(); + CloudVmStatusContainer container = jobMap.computeIfAbsent(jobId, k -> { + CloudVmStatusContainer c = new CloudVmStatusContainer(); + c.setJobId(jobId); + return c; + }); + container.getStatuses().add(status); + } + + @Test + @DisplayName("Status calculation with all agents replaced returns early (activeInstanceCount = 0)") + void statusCalculation_allAgentsReplaced_returnsEarly() throws Exception { + // Given: A job where ALL agents have been replaced by watchdog + String jobId = "all-replaced-job"; + + // Set up 3 agents, all marked as replaced (using direct manipulation) + addStatusDirectly(createStatusWithJobStatus("i-001", jobId, VMStatus.replaced, JobStatus.Starting)); + addStatusDirectly(createStatusWithJobStatus("i-002", jobId, VMStatus.replaced, JobStatus.Starting)); + addStatusDirectly(createStatusWithJobStatus("i-003", jobId, VMStatus.replaced, JobStatus.Starting)); + + // When: Get job status + CloudVmStatusContainer container = vmTracker.getVmStatusForJob(jobId); + + // Then: Container should exist with 3 statuses + assertNotNull(container, "Container should exist"); + assertEquals(3, container.getStatuses().size()); + + // Verify all have replaced status + assertTrue(container.getStatuses().stream().allMatch(s -> s.getVmStatus() == VMStatus.replaced), + "All agents should be in replaced state"); + } + + @Test + @DisplayName("Status calculation with mixed statuses correctly tracks replaced agents") + void statusCalculation_mixedStatuses_correctlyDeterminesJobStatus() throws Exception { + String jobId = "mixed-status-job"; + + // Given: Mix of running, pending, and replaced agents (using direct manipulation) + addStatusDirectly(createStatusWithJobStatus("i-running1", jobId, VMStatus.running, JobStatus.Running)); + addStatusDirectly(createStatusWithJobStatus("i-running2", jobId, VMStatus.running, JobStatus.Running)); + addStatusDirectly(createStatusWithJobStatus("i-replaced", jobId, VMStatus.replaced, JobStatus.Starting)); + addStatusDirectly(createStatusWithJobStatus("i-pending", jobId, VMStatus.pending, JobStatus.Starting)); + + // When + CloudVmStatusContainer container = vmTracker.getVmStatusForJob(jobId); + + // Then: Should have 4 total statuses + assertNotNull(container, "Container should exist"); + assertEquals(4, container.getStatuses().size()); + + // Replaced should be in container + assertTrue(container.getStatuses().stream() + .anyMatch(s -> s.getInstanceId().equals("i-replaced") && s.getVmStatus() == VMStatus.replaced), + "Replaced instance should be in container"); + + // Running instances should also be present + assertEquals(2, container.getStatuses().stream() + .filter(s -> s.getVmStatus() == VMStatus.running).count(), + "Should have 2 running instances"); + } + + @Test + @DisplayName("Status calculation container includes all terminal states") + void statusCalculation_onlySkipsReplaced_notOtherTerminalStates() throws Exception { + String jobId = "terminal-states-job"; + + // Given: Various terminal and non-terminal states (using direct manipulation) + addStatusDirectly(createStatusWithJobStatus("i-running", jobId, VMStatus.running, JobStatus.Running)); + addStatusDirectly(createStatusWithJobStatus("i-replaced", jobId, VMStatus.replaced, JobStatus.Starting)); + addStatusDirectly(createStatusWithJobStatus("i-terminated", jobId, VMStatus.terminated, JobStatus.Completed)); + addStatusDirectly(createStatusWithJobStatus("i-stopped", jobId, VMStatus.stopped, JobStatus.Stopped)); + + // When + CloudVmStatusContainer container = vmTracker.getVmStatusForJob(jobId); + + // Then: Container should have all 4 statuses + assertNotNull(container, "Container should exist"); + assertEquals(4, container.getStatuses().size()); + + // Verify each status type is present + assertTrue(container.getStatuses().stream().anyMatch(s -> s.getVmStatus() == VMStatus.running)); + assertTrue(container.getStatuses().stream().anyMatch(s -> s.getVmStatus() == VMStatus.replaced)); + assertTrue(container.getStatuses().stream().anyMatch(s -> s.getVmStatus() == VMStatus.terminated)); + assertTrue(container.getStatuses().stream().anyMatch(s -> s.getVmStatus() == VMStatus.stopped)); + } + + // ============ Thread safety tests (using direct manipulation to avoid CDI issues) ============ + + @RepeatedTest(3) + @DisplayName("Concurrent direct status additions do not cause ConcurrentModificationException") + void concurrentDirectStatusAdd_noException() throws Exception { + String jobId = "concurrent-test-job-" + System.nanoTime(); + int numThreads = 10; + int operationsPerThread = 50; + ExecutorService executor = Executors.newFixedThreadPool(numThreads); + CountDownLatch startLatch = new CountDownLatch(1); + CountDownLatch doneLatch = new CountDownLatch(numThreads); + AtomicInteger errorCount = new AtomicInteger(0); + + for (int t = 0; t < numThreads; t++) { + final int threadNum = t; + executor.submit(() -> { + try { + startLatch.await(); + for (int i = 0; i < operationsPerThread; i++) { + String instanceId = "i-thread" + threadNum + "-op" + i; + addStatusDirectly(createStatusWithJobStatus( + instanceId, jobId, VMStatus.running, JobStatus.Running)); + } + } catch (Exception e) { + errorCount.incrementAndGet(); + e.printStackTrace(); + } finally { + doneLatch.countDown(); + } + }); + } + + startLatch.countDown(); + assertTrue(doneLatch.await(30, TimeUnit.SECONDS), "Test timed out"); + executor.shutdown(); + + assertEquals(0, errorCount.get(), "Concurrent operations should not throw exceptions"); + + CloudVmStatusContainer container = vmTracker.getVmStatusForJob(jobId); + assertNotNull(container, "Container should exist"); + assertTrue(container.getStatuses().size() > 0, "Should have some statuses"); + } + + @RepeatedTest(3) + @DisplayName("Concurrent add and removeStatusForInstance do not cause race conditions") + void concurrentAddAndRemove_noRaceCondition() throws Exception { + String jobId = "concurrent-remove-job-" + System.nanoTime(); + int numAddThreads = 5; + int numRemoveThreads = 5; + int operationsPerThread = 30; + ExecutorService executor = Executors.newFixedThreadPool(numAddThreads + numRemoveThreads); + CountDownLatch startLatch = new CountDownLatch(1); + CountDownLatch doneLatch = new CountDownLatch(numAddThreads + numRemoveThreads); + AtomicInteger errorCount = new AtomicInteger(0); + List instanceIds = new ArrayList<>(); + + // Pre-populate some instances + for (int i = 0; i < operationsPerThread; i++) { + String instanceId = "i-prepop-" + i; + instanceIds.add(instanceId); + addStatusDirectly(createStatusWithJobStatus(instanceId, jobId, VMStatus.running, JobStatus.Running)); + } + + // Threads that add new statuses + for (int t = 0; t < numAddThreads; t++) { + final int threadNum = t; + executor.submit(() -> { + try { + startLatch.await(); + for (int i = 0; i < operationsPerThread; i++) { + String instanceId = "i-add-thread" + threadNum + "-op" + i; + addStatusDirectly(createStatusWithJobStatus( + instanceId, jobId, VMStatus.running, JobStatus.Running)); + } + } catch (Exception e) { + errorCount.incrementAndGet(); + e.printStackTrace(); + } finally { + doneLatch.countDown(); + } + }); + } + + // Threads that remove statuses + for (int t = 0; t < numRemoveThreads; t++) { + final int threadNum = t; + executor.submit(() -> { + try { + startLatch.await(); + for (int i = 0; i < operationsPerThread; i++) { + if (i < instanceIds.size()) { + vmTracker.removeStatusForInstance(instanceIds.get(i)); + } + vmTracker.removeStatusForInstance("i-nonexistent-" + threadNum + "-" + i); + } + } catch (Exception e) { + errorCount.incrementAndGet(); + e.printStackTrace(); + } finally { + doneLatch.countDown(); + } + }); + } + + startLatch.countDown(); + assertTrue(doneLatch.await(30, TimeUnit.SECONDS), "Test timed out"); + executor.shutdown(); + + assertEquals(0, errorCount.get(), "Concurrent add/remove should not throw exceptions"); + } + + @RepeatedTest(3) + @DisplayName("Concurrent removeStatusForJob does not cause ConcurrentModificationException") + void concurrentRemoveStatusForJob_noException() throws Exception { + int numJobs = 5; + int instancesPerJob = 10; + String prefix = "job-" + System.nanoTime() + "-"; + ExecutorService executor = Executors.newFixedThreadPool(numJobs); + CountDownLatch doneLatch = new CountDownLatch(numJobs); + AtomicInteger errorCount = new AtomicInteger(0); + + // Set up multiple jobs with instances + for (int j = 0; j < numJobs; j++) { + String jobId = prefix + j; + for (int i = 0; i < instancesPerJob; i++) { + addStatusDirectly(createStatusWithJobStatus( + "i-" + jobId + "-instance" + i, jobId, VMStatus.running, JobStatus.Running)); + } + } + + // Concurrently remove all jobs + for (int j = 0; j < numJobs; j++) { + final String jobId = prefix + j; + executor.submit(() -> { + try { + vmTracker.removeStatusForJob(jobId); + } catch (Exception e) { + errorCount.incrementAndGet(); + e.printStackTrace(); + } finally { + doneLatch.countDown(); + } + }); + } + + assertTrue(doneLatch.await(30, TimeUnit.SECONDS), "Test timed out"); + executor.shutdown(); + + assertEquals(0, errorCount.get(), "Concurrent removeStatusForJob should not throw exceptions"); + + // Verify all jobs are removed + for (int j = 0; j < numJobs; j++) { + assertNull(vmTracker.getVmStatusForJob(prefix + j), "Job " + prefix + j + " should be removed"); + } + } + + // ============ removeStatusForInstance race condition fix tests ============ + + @Test + @DisplayName("removeStatusForInstance removes from both statusMap and container atomically") + void removeStatusForInstance_atomicRemoval() throws Exception { + String jobId = "atomic-remove-job"; + String instanceId = "i-atomic-test"; + + // Given: An instance with status (using direct manipulation) + CloudVmStatus status = createStatusWithJobStatus(instanceId, jobId, VMStatus.running, JobStatus.Running); + addStatusDirectly(status); + + // Verify it exists in both places + assertNotNull(vmTracker.getStatus(instanceId), "Status should exist in statusMap"); + CloudVmStatusContainer container = vmTracker.getVmStatusForJob(jobId); + assertNotNull(container, "Container should exist"); + assertTrue(container.getStatuses().stream().anyMatch(s -> s.getInstanceId().equals(instanceId)), + "Instance should be in container"); + + // When: Remove the status + vmTracker.removeStatusForInstance(instanceId); + + // Then: Should be removed from both statusMap and container + assertNull(vmTracker.getStatus(instanceId), "Status should be removed from statusMap"); + container = vmTracker.getVmStatusForJob(jobId); + assertNotNull(container); // Container still exists (job not removed) + assertFalse(container.getStatuses().stream().anyMatch(s -> s.getInstanceId().equals(instanceId)), + "Instance should be removed from container"); + } + + @Test + @DisplayName("removeStatusForInstance handles non-existent instance gracefully with sync") + void removeStatusForInstance_nonExistent_noException() { + // Should not throw even with the new synchronized logic + assertDoesNotThrow(() -> vmTracker.removeStatusForInstance("i-does-not-exist")); + } } From 213a7f2ce93ce1694df21c3ddb30d4b015ee27d0 Mon Sep 17 00:00:00 2001 From: zkofiro Date: Wed, 7 Jan 2026 13:22:38 -0800 Subject: [PATCH 11/14] remove unnecessary logging --- .../src/main/java/com/intuit/tank/harness/APIMonitor.java | 3 --- .../java/com/intuit/tank/vm/vmManager/VMInformation.java | 2 -- .../java/com/intuit/tank/vmManager/AgentWatchdog.java | 8 -------- .../intuit/tank/vmManager/environment/CreateInstance.java | 6 ------ .../com/intuit/tank/vmManager/environment/JobRequest.java | 8 -------- .../vmManager/environment/amazon/AmazonDataConverter.java | 4 ++-- 6 files changed, 2 insertions(+), 29 deletions(-) diff --git a/agent/apiharness/src/main/java/com/intuit/tank/harness/APIMonitor.java b/agent/apiharness/src/main/java/com/intuit/tank/harness/APIMonitor.java index d4f202b59..7211a8467 100644 --- a/agent/apiharness/src/main/java/com/intuit/tank/harness/APIMonitor.java +++ b/agent/apiharness/src/main/java/com/intuit/tank/harness/APIMonitor.java @@ -85,9 +85,6 @@ private void updateInstanceStatus() { } if (!isLocal) { - LOG.info(LogUtil.getLogMessage("Sending status update - VMStatus: " + newStatus.getVmStatus() + - ", JobStatus: " + newStatus.getJobStatus() + ", Users: " + newStatus.getCurrentUsers() + - "/" + newStatus.getTotalUsers())); setInstanceStatus(newStatus.getInstanceId(), newStatus); } APITestHarness.getInstance().checkAgentThreads(); diff --git a/api/src/main/java/com/intuit/tank/vm/vmManager/VMInformation.java b/api/src/main/java/com/intuit/tank/vm/vmManager/VMInformation.java index 996609ccb..ea5fd7e04 100644 --- a/api/src/main/java/com/intuit/tank/vm/vmManager/VMInformation.java +++ b/api/src/main/java/com/intuit/tank/vm/vmManager/VMInformation.java @@ -96,7 +96,6 @@ public String getPrivateDNS() { return (String) this.items.get("privateDns"); } - // TEMP_IP_LOGGING - START /** * Set the virtual machine private IP address * @@ -134,7 +133,6 @@ public void setPublicIp(String data) { public String getPublicIp() { return (String) this.items.get("publicIp"); } - // TEMP_IP_LOGGING - END public void setLaunchTime(Calendar data) { this.items.put("launchTime", data); diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java index 2778d775d..eb145a148 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java @@ -258,14 +258,6 @@ private void relaunch(ArrayList instances) { " - watchdog will wait for /v2/agent/ready call", "instanceId", newInfo.getInstanceId(), "jobId", jobId))); - // TEMP_IP_LOGGING - START - LOG.info(new ObjectMessage(Map.of( - "Message", "Added relaunched image to VMImage table for job " + jobId, - "instanceId", newInfo.getInstanceId(), - "publicIp", newInfo.getPublicIp() != null ? newInfo.getPublicIp() : "N/A", - "privateIp", newInfo.getPrivateIp() != null ? newInfo.getPrivateIp() : "N/A" - ))); - // TEMP_IP_LOGGING - END try { dao.addImageFromInfo(instanceRequest.getJobId(), newInfo, instanceRequest.getRegion()); diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/CreateInstance.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/CreateInstance.java index 7fcf6f0aa..9da50ee5c 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/CreateInstance.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/CreateInstance.java @@ -53,12 +53,6 @@ public void run() { for (VMInformation info : vmInfo) { vmTracker.setStatus(createCloudStatus(request, info)); dao.addImageFromInfo(request.getJobId(), info, request.getRegion()); - // TEMP_IP_LOGGING - START - logger.info("Added image (" + info.getInstanceId() + - ") with publicIp=" + (info.getPublicIp() != null ? info.getPublicIp() : "N/A") + - ", privateIp=" + (info.getPrivateIp() != null ? info.getPrivateIp() : "N/A") + - " to VMImage table"); - // TEMP_IP_LOGGING - END } } diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/JobRequest.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/JobRequest.java index 3c381637a..774df5535 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/JobRequest.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/JobRequest.java @@ -89,14 +89,6 @@ private void persistInstances(VMInstanceRequest instanceRequest, List Date: Thu, 8 Jan 2026 14:27:59 -0800 Subject: [PATCH 12/14] no more log flooding --- .../tank/rest/mvc/rest/services/agent/AgentServiceV2Impl.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rest-mvc/impl/src/main/java/com/intuit/tank/rest/mvc/rest/services/agent/AgentServiceV2Impl.java b/rest-mvc/impl/src/main/java/com/intuit/tank/rest/mvc/rest/services/agent/AgentServiceV2Impl.java index 9abba85f0..32dbde6de 100644 --- a/rest-mvc/impl/src/main/java/com/intuit/tank/rest/mvc/rest/services/agent/AgentServiceV2Impl.java +++ b/rest-mvc/impl/src/main/java/com/intuit/tank/rest/mvc/rest/services/agent/AgentServiceV2Impl.java @@ -221,7 +221,7 @@ public void setInstanceStatus(String instanceId, CloudVmStatus status) { segment.putAnnotation("TotalUsers", status.getTotalUsers()); segment.putAnnotation("totalTps", status.getTotalTps()); - LOGGER.info("Agent " + instanceId + " reporting status - VMStatus: " + status.getVmStatus() + + LOGGER.debug("Agent " + instanceId + " reporting status - VMStatus: " + status.getVmStatus() + ", JobStatus: " + status.getJobStatus() + ", Users: " + status.getCurrentUsers() + "/" + status.getTotalUsers() + ", Job: " + status.getJobId()); From 8dd908492ccd7bfa81c2b1f233c286d7fbeabc92 Mon Sep 17 00:00:00 2001 From: zkofiro Date: Thu, 8 Jan 2026 16:08:30 -0800 Subject: [PATCH 13/14] this was actually helpful --- .../main/java/com/intuit/tank/vmManager/AgentWatchdog.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java index eb145a148..753cec4f6 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java @@ -147,8 +147,8 @@ public void run() { relaunch(startedInstances); startTime = System.currentTimeMillis(); } - LOG.info(new ObjectMessage(Map.of("Message","Job " + jobId + ": Waiting for " + startedInstances.size() + " agents to report: " - + startedInstances.stream().map(VMInformation::getInstanceId).collect(Collectors.toList())))); + LOG.info(new ObjectMessage(Map.of("Message","Job " + jobId + ": " + "Waiting for " + startedInstances.size() + " agents to report: " + + getInstanceIdList(startedInstances)))); Thread.sleep(sleepTime); } else { LOG.info(new ObjectMessage(Map.of("Message","All Agents Reported back for job " + jobId + "."))); From c09111f4443e62ccec5260f592e7b41eb6304cba Mon Sep 17 00:00:00 2001 From: zkofiro Date: Fri, 9 Jan 2026 13:16:24 -0800 Subject: [PATCH 14/14] very helpful ip logging --- .../main/java/com/intuit/tank/vmManager/AgentWatchdog.java | 4 +++- .../com/intuit/tank/vmManager/environment/JobRequest.java | 6 ++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java index 753cec4f6..d0290e658 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java @@ -257,7 +257,9 @@ private void relaunch(ArrayList instances) { "Message", "Created replacement agent with status " + newStatus.getVmStatus() + " - watchdog will wait for /v2/agent/ready call", "instanceId", newInfo.getInstanceId(), - "jobId", jobId))); + "jobId", jobId, + "publicIp", newInfo.getPublicIp() != null ? newInfo.getPublicIp() : "N/A", + "privateIp", newInfo.getPrivateIp() != null ? newInfo.getPrivateIp() : "N/A"))); try { dao.addImageFromInfo(instanceRequest.getJobId(), newInfo, instanceRequest.getRegion()); diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/JobRequest.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/JobRequest.java index 774df5535..e8a189d6f 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/JobRequest.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/JobRequest.java @@ -89,6 +89,12 @@ private void persistInstances(VMInstanceRequest instanceRequest, List