diff --git a/cmd/claw-api/scheduler.go b/cmd/claw-api/scheduler.go
index a61b181..2e58d15 100644
--- a/cmd/claw-api/scheduler.go
+++ b/cmd/claw-api/scheduler.go
@@ -58,6 +58,14 @@ type dispatchOptions struct {
 	ignoreDegraded bool
 }
 
+// Exec budgets for scheduler wakes, selected per adapter by wakeExecTimeout.
+const (
+	// defaultWakeExecTimeout bounds the wake exec for every adapter without its own budget.
+	defaultWakeExecTimeout = 30 * time.Second
+	// openclawWakeExecTimeout is the longer budget given to "openclaw-exec" wakes.
+	openclawWakeExecTimeout = 2 * time.Minute
+)
+
 func newScheduler(manifest *schedulepkg.Manifest, docker *client.Client, state *scheduleStateStore, log io.Writer) (*scheduler, error) {
 	if manifest == nil || len(manifest.Invocations) == 0 {
 		return nil, nil
@@ -229,7 +237,15 @@ func (s *scheduler) dispatchWithOptions(ctx context.Context, entry *scheduledInv
 		return result
 	}
 
-	execCtx, cancel := context.WithTimeout(ctx, 30*time.Second)
+	if detail, skip := s.deferWakeForHealth(ctx, target.ID, entry.manifest.Wake.Adapter); skip {
+		result.status = "skipped"
+		result.detail = detail
+		result.skipped = true
+		s.logf("schedule %s: skipped (%s)", entry.manifest.ID, result.detail)
+		return result
+	}
+
+	execCtx, cancel := context.WithTimeout(ctx, wakeExecTimeout(entry.manifest.Wake.Adapter))
 	defer cancel()
 	stdout, stderr, exitCode, err := shared.ExecInContainer(execCtx, s.docker, target.ID, entry.manifest.Wake.Command)
 	if err != nil {
@@ -275,6 +291,52 @@ func (s *scheduler) dispatchWithOptions(ctx context.Context, entry *scheduledInv
 	return result
 }
 
+// wakeExecTimeout returns the exec budget for a wake sent through adapter.
+// Surrounding whitespace in adapter is ignored; "openclaw-exec" gets
+// openclawWakeExecTimeout and every other value gets defaultWakeExecTimeout.
+func wakeExecTimeout(adapter string) time.Duration {
+	switch strings.TrimSpace(adapter) {
+	case "openclaw-exec":
+		return openclawWakeExecTimeout
+	default:
+		return defaultWakeExecTimeout
+	}
+}
+
+// deferWakeForHealthStatus decides, from an inspected container state, whether a
+// wake should be skipped because the target's Docker health check is not passing.
+// It returns the skip detail ("target-health-<status>") and true to defer, or
+// ("", false) to let the wake proceed.
+//
+// Only the "openclaw-exec" adapter is gated. A nil state, a nil Health, an empty
+// status, "healthy" and "none" (Docker's status for "no health check") all let
+// the wake proceed; the status is compared after trimming and lower-casing.
+func deferWakeForHealthStatus(adapter string, state *types.ContainerState) (string, bool) {
+	if strings.TrimSpace(adapter) != "openclaw-exec" || state == nil || state.Health == nil {
+		return "", false
+	}
+	status := strings.ToLower(strings.TrimSpace(state.Health.Status))
+	switch status {
+	case "", "healthy", "none":
+		// Nothing to wait for: no status, no health check, or already healthy.
+		return "", false
+	default:
+		// "starting", "unhealthy" and any unrecognized status defer the wake.
+		return "target-health-" + status, true
+	}
+}
+
+func (s *scheduler) deferWakeForHealth(ctx context.Context, containerID, adapter string) (string, bool) {
+	if s == nil || s.docker == nil {
+		return "", false // no Docker client to inspect with, so the wake is not deferred
+	}
+	info, err := s.docker.ContainerInspect(ctx, containerID)
+	if err != nil {
+		return "", false // the inspect error is dropped and the wake is not deferred
+	}
+	return deferWakeForHealthStatus(adapter, info.State) // skip detail and skip flag come from the status helper
+}
+
 func (s *scheduler) lookupTargetContainer(ctx context.Context, target string) (types.Container, error) {
 	if s == nil || s.docker == nil {
 		return types.Container{}, fmt.Errorf("docker client unavailable")
diff --git a/cmd/claw-api/scheduler_test.go b/cmd/claw-api/scheduler_test.go
index b9682ff..a95d7c2 100644
--- a/cmd/claw-api/scheduler_test.go
+++ b/cmd/claw-api/scheduler_test.go
@@ -3,6 +3,8 @@ package main
 import (
 	"testing"
 	"time"
+
+	"github.com/docker/docker/api/types"
 )
 
 func TestNextSchedulerDelayAlignsToMinuteBoundary(t *testing.T) {
@@ -32,3 +34,48 @@ func TestShouldAttemptDegradedThrottlesToRoughlyTenPercent(t *testing.T) {
 		t.Fatalf("expected roughly 10%% allowed, got %d/%d", allowed, total)
 	}
 }
+
+func TestWakeExecTimeoutUsesOpenClawBudget(t *testing.T) {
+	if got := wakeExecTimeout("openclaw-exec"); got != openclawWakeExecTimeout { // the OpenClaw adapter gets its dedicated budget
+		t.Fatalf("expected openclaw wake timeout %v, got %v", openclawWakeExecTimeout, got)
+	}
+	if got := wakeExecTimeout("hermes-exec"); got != defaultWakeExecTimeout { // any other adapter falls back to the default
+		t.Fatalf("expected default wake timeout %v, got %v", defaultWakeExecTimeout, got)
+	}
+}
+
+func TestDeferWakeForHealthStatusRequiresHealthyOpenClawTarget(t *testing.T) {
+	t.Run("healthy openclaw proceeds", func(t *testing.T) {
+		if detail, skip := deferWakeForHealthStatus("openclaw-exec", &types.ContainerState{
+			Health: &types.Health{Status: "healthy"},
+		}); skip || detail != "" {
+			t.Fatalf("expected healthy openclaw target to proceed, got detail=%q skip=%v", detail, skip)
+		}
+	})
+
+	t.Run("starting openclaw defers", func(t *testing.T) {
+		detail, skip := deferWakeForHealthStatus("openclaw-exec", &types.ContainerState{
+			Health: &types.Health{Status: "starting"},
+		})
+		if !skip || detail != "target-health-starting" { // the detail string carries the health status
+			t.Fatalf("expected starting openclaw target to defer, got detail=%q skip=%v", detail, skip)
+		}
+	})
+
+	t.Run("unhealthy openclaw defers", func(t *testing.T) {
+		detail, skip := deferWakeForHealthStatus("openclaw-exec", &types.ContainerState{
+			Health: &types.Health{Status: "unhealthy"},
+		})
+		if !skip || detail != "target-health-unhealthy" {
+			t.Fatalf("expected unhealthy openclaw target to defer, got detail=%q skip=%v", detail, skip)
+		}
+	})
+
+	t.Run("non-openclaw adapter ignores health", func(t *testing.T) {
+		if detail, skip := deferWakeForHealthStatus("hermes-exec", &types.ContainerState{
+			Health: &types.Health{Status: "starting"}, // "starting" defers an openclaw-exec target; other adapters are not gated
+		}); skip || detail != "" {
+			t.Fatalf("expected non-openclaw adapter to ignore health deferral, got detail=%q skip=%v", detail, skip)
+		}
+	})
+}
diff --git a/site/changelog.md b/site/changelog.md
index 2510643..fa949e7 100644
--- a/site/changelog.md
+++ b/site/changelog.md
@@ -30,6 +30,7 @@ outline: deep
 ## Unreleased
 
 - **Fix: OpenClaw scheduled jobs are materialized under the canonical cron store again** ([#159](https://github.com/mostlydev/clawdapus/issues/159)) — the OpenClaw driver now mounts a writable `~/.openclaw/cron/` directory and writes `jobs.json` there instead of under the config directory. Current OpenClaw builds resolve cron definitions from `~/.openclaw/cron/jobs.json`, so the previous layout left `openclaw cron list` empty and `openclaw cron run ` failed against jobs Clawdapus thought it had compiled. `claw up` now emits the native store where OpenClaw actually reads it, preserves the dedicated cron directory mount, and keeps pod-origin wakes targeting the runner-native `openclaw cron run ` contract.
+- **Fix: OpenClaw scheduler wakes no longer burn failures during startup lag** ([#160](https://github.com/mostlydev/clawdapus/issues/160)) — `claw-api` now treats OpenClaw wakes as adapter-aware operations instead of generic 30-second execs. The scheduler defers `openclaw-exec` wakes while Docker health is still `starting` or `unhealthy`, so boot lag stops being recorded as a failed fire, and OpenClaw wakes now get a longer exec budget before being marked as timed out. This reduces false degradation on desks where the runner is still coming up even though the schedule is otherwise valid. ## v0.8.11 {#v0-8-11}