diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 6d7490bad..4f8479106 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -27,6 +27,8 @@ jobs: cloud: name: e2e k8s-${{ matrix.k3s }} runs-on: ubuntu-latest + # Fork pull requests cannot access the required secrets, so only repository branches may run this job. + if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository permissions: id-token: write @@ -156,6 +158,8 @@ jobs: robot: name: e2e robot runs-on: ubuntu-latest + # Fork pull requests cannot access the required secrets, so only repository branches may run this job. + if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository permissions: id-token: write diff --git a/hcloud/cloud.go b/hcloud/cloud.go index 8840fd567..f17b5e725 100644 --- a/hcloud/cloud.go +++ b/hcloud/cloud.go @@ -107,7 +107,7 @@ func NewCloud(cidr string) (cloudprovider.Interface, error) { robotClient = robot.NewRateLimitedClient( cfg.Robot.RateLimitWaitTime, - robot.NewCachedClient(cfg.Robot.CacheTimeout, c), + robot.NewCachedClient(cfg.Robot.CacheTimeout, robot.NewClient(c)), ) } diff --git a/hcloud/cloud_test.go b/hcloud/cloud_test.go index 71634ee5f..3bc13da59 100644 --- a/hcloud/cloud_test.go +++ b/hcloud/cloud_test.go @@ -31,6 +31,7 @@ import ( "k8s.io/client-go/tools/record" "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/config" + "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/robot" "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/testsupport" "github.com/hetznercloud/hcloud-go/v2/hcloud" "github.com/hetznercloud/hcloud-go/v2/hcloud/schema" @@ -40,7 +41,7 @@ type testEnv struct { Server *httptest.Server Mux *http.ServeMux Client *hcloud.Client - RobotClient hrobot.RobotClient + RobotClient robot.Client Recorder record.EventRecorder Cfg config.HCCMConfiguration } @@ -75,7 +76,7 @@ func newTestEnv() testEnv { Server: server, Mux: mux, Client: client, - RobotClient: robotClient, + RobotClient: robot.NewClient(robotClient), Recorder: recorder, Cfg: cfg, } diff --git a/hcloud/instances_test.go b/hcloud/instances_test.go index 5f4aef91b..959b40c0f 100644 --- a/hcloud/instances_test.go +++ b/hcloud/instances_test.go @@ -24,6 +24,7 @@ import ( "os" "reflect" "testing" + "time" "github.com/stretchr/testify/assert" hrobotmodels "github.com/syself/hrobot-go/models" @@ -32,6 +33,7 @@ import ( cloudprovider "k8s.io/cloud-provider" "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/config" + "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/robot" "github.com/hetznercloud/hcloud-go/v2/hcloud" "github.com/hetznercloud/hcloud-go/v2/hcloud/schema" ) @@ -189,6 +191,70 @@ func TestInstances_InstanceExists(t *testing.T) { } } +// TestInstances_InstanceExistsRobotServerCreatedAfterCacheFill verifies +// that InstanceExists refreshes the Robot cache after a name lookup miss, +// so a bare-metal server created after the cache was warmed is found. +func TestInstances_InstanceExistsRobotServerCreatedAfterCacheFill(t *testing.T) { + env := newTestEnv() + defer env.Teardown() + + servers := make([]hrobotmodels.Server, 0, 2) + servers = append(servers, + hrobotmodels.Server{ + ServerIP: "233.252.0.123", + ServerIPv6Net: "2a01:f48:111:4221::", + ServerNumber: 321, + Name: "robot-server1", + }, + ) + + env.Mux.HandleFunc("/robot/server", func(w http.ResponseWriter, _ *http.Request) { + responses := make([]hrobotmodels.ServerResponse, 0, len(servers)) + for _, server := range servers { + responses = append(responses, hrobotmodels.ServerResponse{Server: server}) + } + json.NewEncoder(w).Encode(responses) + }) + env.Mux.HandleFunc("/servers", func(w http.ResponseWriter, _ *http.Request) { + json.NewEncoder(w).Encode(schema.ServerListResponse{Servers: []schema.Server{}}) + }) + + instances := newInstances( + env.Client, + robot.NewCachedClient(time.Hour, env.RobotClient), + env.Recorder, + 0, + env.Cfg, + ) + + exists, err := instances.InstanceExists(context.TODO(), &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "robot-server1"}, + }) + if err != nil { + t.Fatalf("Unexpected error warming cache: %v", err) + } + if !exists { + t.Fatal("Expected robot-server1 to exist") + } + + servers = append(servers, hrobotmodels.Server{ + ServerIP: "233.252.0.124", + ServerIPv6Net: "2a01:f48:111:4222::", + ServerNumber: 322, + Name: "robot-server2", + }) + + exists, err = instances.InstanceExists(context.TODO(), &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "robot-server2"}, + }) + if err != nil { + t.Fatalf("Unexpected error after creating robot-server2: %v", err) + } + if !exists { + t.Fatal("Expected robot-server2 to exist after it was created") + } +} + func TestInstances_InstanceShutdown(t *testing.T) { env := newTestEnv() defer env.Teardown() diff --git a/hcloud/instances_util.go b/hcloud/instances_util.go index 0503cc8c0..a2a09938f 100644 --- a/hcloud/instances_util.go +++ b/hcloud/instances_util.go @@ -88,11 +88,23 @@ func getRobotServerByName(c robot.Client, node *corev1.Node) (server *hrobotmode for i, s := range serverList { if s.Name == node.Name { - server = &serverList[i] + return &serverList[i], nil } } - return server, nil + serverList, err = c.ServerGetListForceRefresh(node.Name) + if err != nil { + return nil, fmt.Errorf("%s: force refresh after cache miss: %w", op, err) + } + + for i, s := range serverList { + if s.Name == node.Name { + return &serverList[i], nil + } + } + + // No matching Robot server exists for this node, even after refreshing the cached list. + return nil, nil } func getRobotServerByID(i *instances, id int, node *corev1.Node) (*hrobotmodels.Server, error) { diff --git a/hcloud/instances_util_test.go b/hcloud/instances_util_test.go index 792289098..198196ec1 100644 --- a/hcloud/instances_util_test.go +++ b/hcloud/instances_util_test.go @@ -63,3 +63,38 @@ func TestGetRobotServerByID(t *testing.T) { }) } } + +func TestGetRobotServerByNameForceRefreshFindsNewServer(t *testing.T) { + robotClientMock := &mocks.RobotClient{} + robotClientMock.Test(t) + robotClientMock.On("ServerGetList").Return([]hrobotmodels.Server{ + {ServerNumber: 321, Name: "robot-server1"}, + }, nil).Once() + robotClientMock.On("ServerGetListForceRefresh", "robot-server2").Return([]hrobotmodels.Server{ + {ServerNumber: 322, Name: "robot-server2"}, + }, nil).Once() + + server, err := getRobotServerByName(robotClientMock, &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "robot-server2"}, + }) + require.NoError(t, err) + require.NotNil(t, server) + assert.Equal(t, "robot-server2", server.Name) +} + +func TestGetRobotServerByNameReturnsNilAfterForceRefreshMiss(t *testing.T) { + robotClientMock := &mocks.RobotClient{} + robotClientMock.Test(t) + robotClientMock.On("ServerGetList").Return([]hrobotmodels.Server{ + {ServerNumber: 321, Name: "robot-server1"}, + }, nil).Once() + robotClientMock.On("ServerGetListForceRefresh", "robot-server2").Return([]hrobotmodels.Server{ + {ServerNumber: 321, Name: "robot-server1"}, + }, nil).Once() + + server, err := getRobotServerByName(robotClientMock, &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "robot-server2"}, + }) + require.NoError(t, err) + assert.Nil(t, server) +} diff --git a/internal/mocks/robot.go b/internal/mocks/robot.go index 04b486c5d..22c9e0366 100644 --- a/internal/mocks/robot.go +++ b/internal/mocks/robot.go @@ -19,6 +19,11 @@ func (m *RobotClient) ServerGetList() ([]hrobotmodels.Server, error) { return getRobotServers(args, 0), args.Error(1) } +func (m *RobotClient) ServerGetListForceRefresh(nodeName string) ([]hrobotmodels.Server, error) { + args := m.Called(nodeName) + return getRobotServers(args, 0), args.Error(1) +} + func (m *RobotClient) BootLinuxDelete(id int) (*hrobotmodels.Linux, error) { panic("this method should not be called") } diff --git a/internal/robot/adapter.go b/internal/robot/adapter.go new file mode 100644 index 000000000..9aba0e916 --- /dev/null +++ b/internal/robot/adapter.go @@ -0,0 +1,23 @@ +package robot + +import ( + hrobot "github.com/syself/hrobot-go" + hrobotmodels "github.com/syself/hrobot-go/models" +) + +// adapter wraps hrobot.RobotClient so it satisfies this package's Client interface. +type adapter struct { + hrobot.RobotClient +} + +func NewClient(robotClient hrobot.RobotClient) Client { + if robotClient == nil { + return nil + } + + return &adapter{RobotClient: robotClient} +} + +func (a *adapter) ServerGetListForceRefresh(_ string) ([]hrobotmodels.Server, error) { + return a.ServerGetList() +} diff --git a/internal/robot/adapter_test.go b/internal/robot/adapter_test.go new file mode 100644 index 000000000..d021ef6c4 --- /dev/null +++ b/internal/robot/adapter_test.go @@ -0,0 +1,44 @@ +package robot + +import ( + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + hrobot "github.com/syself/hrobot-go" + hrobotmodels "github.com/syself/hrobot-go/models" +) + +func TestNewClientNil(t *testing.T) { + assert.Nil(t, NewClient(nil)) +} + +func TestAdapterServerGetListForceRefresh(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + assert.Equal(t, "/robot/server", r.URL.Path) + err := json.NewEncoder(w).Encode([]hrobotmodels.ServerResponse{ + { + Server: hrobotmodels.Server{ + ServerNumber: 321, + Name: "robot-server1", + }, + }, + }) + assert.NoError(t, err) + })) + defer server.Close() + + robotClient := hrobot.NewBasicAuthClient("", "") + robotClient.SetBaseURL(server.URL + "/robot") + + client := NewClient(robotClient) + require.NotNil(t, client) + + servers, err := client.ServerGetListForceRefresh("robot-server1") + require.NoError(t, err) + require.Len(t, servers, 1) + assert.Equal(t, "robot-server1", servers[0].Name) +} diff --git a/internal/robot/cache.go b/internal/robot/cache.go index 2bb96c71c..969dd9c6d 100644 --- a/internal/robot/cache.go +++ b/internal/robot/cache.go @@ -12,20 +12,26 @@ type cacheRobotClient struct { timeout time.Duration lastUpdate time.Time + now func() time.Time // mutex is necessary to synchronize parallel access to the cache mutex sync.Mutex // cache servers []hrobotmodels.Server serversByID map[int]*hrobotmodels.Server + + // forcedRefreshServerNames tracks which node names already triggered a forced refresh within the current cache timeout window. + forcedRefreshServerNames map[string]time.Time } func NewCachedClient(cacheTimeout time.Duration, robotClient Client) Client { return &cacheRobotClient{ timeout: cacheTimeout, robotClient: robotClient, + now: time.Now, - serversByID: make(map[int]*hrobotmodels.Server), + serversByID: make(map[int]*hrobotmodels.Server), + forcedRefreshServerNames: make(map[string]time.Time), } } @@ -57,13 +63,40 @@ func (c *cacheRobotClient) ServerGetList() ([]hrobotmodels.Server, error) { return c.servers, nil } +// ServerGetListForceRefresh refreshes the server list immediately, unless the same node already forced a refresh within the current cache timeout window. +func (c *cacheRobotClient) ServerGetListForceRefresh(nodeName string) ([]hrobotmodels.Server, error) { + c.mutex.Lock() + defer c.mutex.Unlock() + + if nodeName != "" && c.nodeHasAlreadyForcedRefresh(nodeName) { + if err := c.updateCacheIfNecessary(); err != nil { + return nil, err + } + return c.servers, nil + } + + if err := c.refreshCache(); err != nil { + return nil, err + } + + if nodeName != "" { + c.forcedRefreshServerNames[nodeName] = c.currentTime() + } + + return c.servers, nil +} + // Make sure to lock the mutext before calling updateCacheIfNecessary. func (c *cacheRobotClient) updateCacheIfNecessary() error { nextUpdate := c.lastUpdate.Add(c.timeout) - if time.Now().Before(nextUpdate) { + if c.currentTime().Before(nextUpdate) { return nil } + return c.refreshCache() +} + +func (c *cacheRobotClient) refreshCache() error { servers, err := c.robotClient.ServerGetList() if err != nil { return err @@ -79,10 +112,34 @@ func (c *cacheRobotClient) updateCacheIfNecessary() error { } // set time of last update - c.lastUpdate = time.Now() + c.lastUpdate = c.currentTime() return nil } +// nodeHasAlreadyForcedRefresh reports whether this node already triggered a forced refresh within the current cache timeout window and drops expired entries. +func (c *cacheRobotClient) nodeHasAlreadyForcedRefresh(nodeName string) bool { + forcedAt, found := c.forcedRefreshServerNames[nodeName] + if !found { + return false + } + + if c.currentTime().After(forcedAt.Add(c.timeout)) { + delete(c.forcedRefreshServerNames, nodeName) + return false + } + + return true +} + +// currentTime centralizes access to the clock so tests can inject a deterministic time source via c.now. +func (c *cacheRobotClient) currentTime() time.Time { + if c.now == nil { + return time.Now() + } + + return c.now() +} + // ResetGet does not use the cache, as we need up to date information for its function. func (c *cacheRobotClient) ResetGet(id int) (*hrobotmodels.Reset, error) { return c.robotClient.ResetGet(id) diff --git a/internal/robot/cache_test.go b/internal/robot/cache_test.go new file mode 100644 index 000000000..fd0723f9a --- /dev/null +++ b/internal/robot/cache_test.go @@ -0,0 +1,134 @@ +package robot + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + hrobotmodels "github.com/syself/hrobot-go/models" + + "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/mocks" +) + +// TestCacheClientServerGetListForceRefreshSkipsRepeatedNameWithinTimeout verifies that repeated forced refreshes for the same node name reuse the cached result until the timeout expires. +func TestCacheClientServerGetListForceRefreshSkipsRepeatedNameWithinTimeout(t *testing.T) { + now := time.Date(2026, time.April, 7, 13, 0, 0, 0, time.UTC) + mockClient := &mocks.RobotClient{} + mockClient.On("ServerGetList").Return([]hrobotmodels.Server{ + {ServerNumber: 321, Name: "robot-server1"}, + }, nil).Once() + + client := &cacheRobotClient{ + robotClient: mockClient, + timeout: time.Hour, + now: func() time.Time { return now }, + serversByID: make(map[int]*hrobotmodels.Server), + forcedRefreshServerNames: make(map[string]time.Time), + } + + servers, err := client.ServerGetListForceRefresh("robot-server2") + assert.NoError(t, err) + assert.Len(t, servers, 1) + + servers, err = client.ServerGetListForceRefresh("robot-server2") + assert.NoError(t, err) + assert.Len(t, servers, 1) + + mockClient.AssertNumberOfCalls(t, "ServerGetList", 1) +} + +// TestCacheClientServerGetListForceRefreshExpiresPerName verifies that a node name can trigger another forced refresh after its timeout window has expired. +func TestCacheClientServerGetListForceRefreshExpiresPerName(t *testing.T) { + now := time.Date(2026, time.April, 7, 13, 0, 0, 0, time.UTC) + mockClient := &mocks.RobotClient{} + mockClient.On("ServerGetList").Return([]hrobotmodels.Server{ + {ServerNumber: 321, Name: "robot-server1"}, + }, nil).Twice() + + client := &cacheRobotClient{ + robotClient: mockClient, + timeout: time.Hour, + now: func() time.Time { return now }, + serversByID: make(map[int]*hrobotmodels.Server), + forcedRefreshServerNames: make(map[string]time.Time), + } + + _, err := client.ServerGetListForceRefresh("robot-server2") + assert.NoError(t, err) + + now = now.Add(time.Hour + time.Second) + + _, err = client.ServerGetListForceRefresh("robot-server2") + assert.NoError(t, err) + + mockClient.AssertNumberOfCalls(t, "ServerGetList", 2) +} + +type cacheTestRobotClient struct { + servers []hrobotmodels.Server + listCalls int + reset *hrobotmodels.Reset + resetCalls int +} + +func (c *cacheTestRobotClient) ServerGet(int) (*hrobotmodels.Server, error) { + panic("this method should not be called") +} + +func (c *cacheTestRobotClient) ServerGetList() ([]hrobotmodels.Server, error) { + c.listCalls++ + return c.servers, nil +} + +func (c *cacheTestRobotClient) ServerGetListForceRefresh(string) ([]hrobotmodels.Server, error) { + panic("this method should not be called") +} + +func (c *cacheTestRobotClient) ResetGet(int) (*hrobotmodels.Reset, error) { + c.resetCalls++ + return c.reset, nil +} + +func TestNewCachedClientCachesServerListAndLookupByID(t *testing.T) { + backend := &cacheTestRobotClient{ + servers: []hrobotmodels.Server{ + {ServerNumber: 321, Name: "robot-server1"}, + }, + } + + client := NewCachedClient(time.Hour, backend) + + servers, err := client.ServerGetList() + require.NoError(t, err) + require.Len(t, servers, 1) + + server, err := client.ServerGet(321) + require.NoError(t, err) + require.NotNil(t, server) + assert.Equal(t, "robot-server1", server.Name) + + missingServer, err := client.ServerGet(999) + require.Error(t, err) + assert.Nil(t, missingServer) + assert.True(t, hrobotmodels.IsError(err, hrobotmodels.ErrorCodeServerNotFound)) + assert.Equal(t, 1, backend.listCalls) +} + +func TestCacheClientCurrentTimeFallsBackToTimeNow(t *testing.T) { + client := &cacheRobotClient{} + + assert.WithinDuration(t, time.Now(), client.currentTime(), time.Second) +} + +func TestNewCachedClientResetGetBypassesCache(t *testing.T) { + expectedReset := &hrobotmodels.Reset{OperatingStatus: "running"} + backend := &cacheTestRobotClient{reset: expectedReset} + + client := NewCachedClient(time.Hour, backend) + + reset, err := client.ResetGet(321) + require.NoError(t, err) + assert.Same(t, expectedReset, reset) + assert.Equal(t, 1, backend.resetCalls) +} diff --git a/internal/robot/interface.go b/internal/robot/interface.go index 64e3d934c..b2231069f 100644 --- a/internal/robot/interface.go +++ b/internal/robot/interface.go @@ -7,5 +7,6 @@ import ( type Client interface { ServerGet(id int) (*hrobotmodels.Server, error) ServerGetList() ([]hrobotmodels.Server, error) + ServerGetListForceRefresh(nodeName string) ([]hrobotmodels.Server, error) ResetGet(id int) (*hrobotmodels.Reset, error) } diff --git a/internal/robot/ratelimit.go b/internal/robot/ratelimit.go index 81dfd8695..7afcf54dc 100644 --- a/internal/robot/ratelimit.go +++ b/internal/robot/ratelimit.go @@ -44,6 +44,17 @@ func (c *rateLimitClient) ServerGetList() ([]hrobotmodels.Server, error) { return servers, err } +// ServerGetListForceRefresh applies the same rate-limit guard to forced refreshes before delegating to the wrapped client. +func (c *rateLimitClient) ServerGetListForceRefresh(nodeName string) ([]hrobotmodels.Server, error) { + if c.isExceeded() { + return nil, c.getRateLimitError() + } + + servers, err := c.robotClient.ServerGetListForceRefresh(nodeName) + c.handleError(err) + return servers, err +} + func (c *rateLimitClient) ResetGet(id int) (*hrobotmodels.Reset, error) { if c.isExceeded() { return nil, c.getRateLimitError() diff --git a/internal/robot/ratelimit_test.go b/internal/robot/ratelimit_test.go index c041a26ba..ffce37c50 100644 --- a/internal/robot/ratelimit_test.go +++ b/internal/robot/ratelimit_test.go @@ -6,6 +6,7 @@ import ( "time" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" hrobotmodels "github.com/syself/hrobot-go/models" "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/mocks" @@ -66,3 +67,36 @@ func TestRateLimitGetRateLimitError(t *testing.T) { assert.Error(t, err) assert.True(t, strings.HasPrefix(err.Error(), "rate limit exceeded, next try at ")) } + +func TestRateLimitServerGetListForceRefresh(t *testing.T) { + mock := mocks.RobotClient{} + mock.On("ServerGetListForceRefresh", "robot-server1").Return([]hrobotmodels.Server{ + {ServerNumber: 321, Name: "robot-server1"}, + }, nil).Once() + + client := NewRateLimitedClient(5*time.Minute, &mock) + + servers, err := client.ServerGetListForceRefresh("robot-server1") + require.NoError(t, err) + require.Len(t, servers, 1) + assert.Equal(t, "robot-server1", servers[0].Name) + mock.AssertNumberOfCalls(t, "ServerGetListForceRefresh", 1) +} + +func TestRateLimitServerGetListForceRefreshStopsAfterRateLimit(t *testing.T) { + mock := mocks.RobotClient{} + mock.On("ServerGetListForceRefresh", "robot-server1").Return(nil, hrobotmodels.Error{ + Code: hrobotmodels.ErrorCodeRateLimitExceeded, + Message: "Rate limit exceeded", + }).Once() + + client := NewRateLimitedClient(5*time.Minute, &mock) + + _, err := client.ServerGetListForceRefresh("robot-server1") + require.Error(t, err) + + _, err = client.ServerGetListForceRefresh("robot-server1") + require.Error(t, err) + + mock.AssertNumberOfCalls(t, "ServerGetListForceRefresh", 1) +}