From 31e89bfbdf06db2593c3e9eeedb551c80b2b267d Mon Sep 17 00:00:00 2001 From: Lucas Vieira Date: Mon, 27 Apr 2026 22:28:53 -0300 Subject: [PATCH] refactor(rds): make CreateDBInstance async, return creating immediately MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CreateDBInstance now returns a `creating` placeholder ~immediately and runs the container start (and any underlying postgres image pull/build) in a background tokio task. DescribeDBInstances flips the row to `available` once the container is up, or removes it and emits RDS-EVENT-0058 ("DB instance failed to create") on failure. Matches real AWS RDS behavior — CreateDBInstance never blocks on the container. - The `Endpoint` element is suppressed from the Create/Describe XML while the instance is in `creating` so SDK callers don't try to dial an empty host:0. - Persistence load drops any `creating` rows from a captured snapshot; the background task didn't survive the restart and the placeholder would otherwise be stuck forever. - New `helpers::wait_for_db_available` for tests that need the endpoint after Create. Existing rds e2e tests + sdk e2e bridge into it; the heavy-engine and aws_lambda/aws_s3 tests pass through too. - TypeScript SDK e2e drops the 180_000 ms blanket timeout in favor of an explicit poll loop on DescribeDBInstances, matching what users will write themselves. 
--- crates/fakecloud-e2e/tests/helpers/mod.rs | 31 +++ crates/fakecloud-e2e/tests/rds.rs | 25 +- crates/fakecloud-e2e/tests/rds_aws_lambda.rs | 9 +- crates/fakecloud-e2e/tests/rds_aws_s3.rs | 9 +- .../fakecloud-e2e/tests/rds_heavy_engines.rs | 5 + crates/fakecloud-e2e/tests/sdk.rs | 1 + crates/fakecloud-rds/src/service.rs | 237 ++++++++++++------ crates/fakecloud-server/src/main.rs | 28 +++ sdks/typescript/tests/e2e.test.ts | 25 +- website/content/docs/services/rds.md | 28 +++ 10 files changed, 289 insertions(+), 109 deletions(-) diff --git a/crates/fakecloud-e2e/tests/helpers/mod.rs b/crates/fakecloud-e2e/tests/helpers/mod.rs index 66444316..22a8701c 100644 --- a/crates/fakecloud-e2e/tests/helpers/mod.rs +++ b/crates/fakecloud-e2e/tests/helpers/mod.rs @@ -26,3 +26,34 @@ pub fn gunzip(data: &[u8]) -> Vec { pub fn _path_buf_shim(p: PathBuf) -> PathBuf { p } + +/// Poll DescribeDBInstances until the instance reports +/// `db_instance_status = "available"`, then return the populated +/// `DbInstance`. CreateDBInstance returns a `creating` placeholder +/// immediately; this helper bridges tests that need the endpoint. 
+pub async fn wait_for_db_available( + rds: &aws_sdk_rds::Client, + db_instance_identifier: &str, + max_secs: u64, +) -> aws_sdk_rds::types::DbInstance { + let deadline = std::time::Instant::now() + std::time::Duration::from_secs(max_secs); + while std::time::Instant::now() < deadline { + if let Ok(resp) = rds + .describe_db_instances() + .db_instance_identifier(db_instance_identifier) + .send() + .await + { + for inst in resp.db_instances() { + if inst.db_instance_status() == Some("available") { + return inst.clone(); + } + } + } + tokio::time::sleep(std::time::Duration::from_millis(500)).await; + } + panic!( + "DB instance {} did not reach 'available' within {}s", + db_instance_identifier, max_secs + ); +} diff --git a/crates/fakecloud-e2e/tests/rds.rs b/crates/fakecloud-e2e/tests/rds.rs index 77cce82e..1fd2daf3 100644 --- a/crates/fakecloud-e2e/tests/rds.rs +++ b/crates/fakecloud-e2e/tests/rds.rs @@ -65,17 +65,7 @@ async fn rds_create_and_describe_db_instance() { let created = create_response.db_instance().expect("created instance"); assert_eq!(created.db_instance_status(), Some("creating")); - let describe_response = client - .describe_db_instances() - .db_instance_identifier("orders-db") - .send() - .await - .unwrap(); - - let instances = describe_response.db_instances(); - assert_eq!(instances.len(), 1); - let instance = &instances[0]; - assert_eq!(instance.db_instance_status(), Some("available")); + let instance = helpers::wait_for_db_available(&client, "orders-db", 180).await; assert_eq!(instance.engine(), Some("postgres")); let endpoint = instance.endpoint().expect("endpoint"); @@ -612,7 +602,7 @@ async fn create_instance_with_deletion_protection( db_instance_identifier: &str, deletion_protection: bool, ) -> aws_sdk_rds::operation::create_db_instance::CreateDbInstanceOutput { - client + let resp = client .create_db_instance() .db_instance_identifier(db_instance_identifier) .allocated_storage(20) @@ -625,7 +615,11 @@ async fn 
create_instance_with_deletion_protection( .db_name("appdb") .send() .await - .unwrap() + .unwrap(); + // CreateDBInstance returns a `creating` placeholder; most callers + // need the DB to be ready before exercising downstream ops. + helpers::wait_for_db_available(client, db_instance_identifier, 180).await; + resp } async fn connect_with_retry( @@ -766,8 +760,9 @@ async fn final_snapshot_on_delete() { .await .unwrap(); - let instance = response.db_instance().expect("db instance"); - let port = instance.endpoint().unwrap().port().unwrap(); + let _instance = response.db_instance().expect("db instance"); + let ready = helpers::wait_for_db_available(&client, "e2e-rds-final", 180).await; + let port = ready.endpoint().unwrap().port().unwrap(); // Wait for instance and insert test data let (postgres, connection) = diff --git a/crates/fakecloud-e2e/tests/rds_aws_lambda.rs b/crates/fakecloud-e2e/tests/rds_aws_lambda.rs index c735b37f..af3b901c 100644 --- a/crates/fakecloud-e2e/tests/rds_aws_lambda.rs +++ b/crates/fakecloud-e2e/tests/rds_aws_lambda.rs @@ -84,8 +84,7 @@ async fn aws_lambda_extension_invoke_round_trip() { // 2. Create the Postgres DB instance — triggers lazy fakecloud-postgres // image build on first run, so this can take a while. 
- let create = rds - .create_db_instance() + rds.create_db_instance() .db_instance_identifier("aws-lambda-ext-db") .allocated_storage(20) .db_instance_class("db.t3.micro") @@ -98,10 +97,8 @@ async fn aws_lambda_extension_invoke_round_trip() { .await .expect("create postgres instance"); - let endpoint = create - .db_instance() - .and_then(|i| i.endpoint()) - .expect("endpoint"); + let instance = helpers::wait_for_db_available(&rds, "aws-lambda-ext-db", 240).await; + let endpoint = instance.endpoint().expect("endpoint"); let host = endpoint.address().expect("address").to_string(); let port = endpoint.port().expect("port"); diff --git a/crates/fakecloud-e2e/tests/rds_aws_s3.rs b/crates/fakecloud-e2e/tests/rds_aws_s3.rs index 8597177e..c09b8087 100644 --- a/crates/fakecloud-e2e/tests/rds_aws_s3.rs +++ b/crates/fakecloud-e2e/tests/rds_aws_s3.rs @@ -69,8 +69,7 @@ async fn aws_s3_extension_import_export_round_trip() { // 2. Postgres instance — reuses the lazy fakecloud-postgres image // that the aws_lambda e2e already exercises, so this run is fast // when ordered after the lambda test on the same runner. 
- let create = rds - .create_db_instance() + rds.create_db_instance() .db_instance_identifier("aws-s3-ext-db") .allocated_storage(20) .db_instance_class("db.t3.micro") @@ -82,10 +81,8 @@ async fn aws_s3_extension_import_export_round_trip() { .send() .await .expect("create postgres instance"); - let endpoint = create - .db_instance() - .and_then(|i| i.endpoint()) - .expect("endpoint"); + let instance = helpers::wait_for_db_available(&rds, "aws-s3-ext-db", 240).await; + let endpoint = instance.endpoint().expect("endpoint"); let host = endpoint.address().expect("address").to_string(); let port = endpoint.port().expect("port"); diff --git a/crates/fakecloud-e2e/tests/rds_heavy_engines.rs b/crates/fakecloud-e2e/tests/rds_heavy_engines.rs index 5501fa7d..385d8c75 100644 --- a/crates/fakecloud-e2e/tests/rds_heavy_engines.rs +++ b/crates/fakecloud-e2e/tests/rds_heavy_engines.rs @@ -76,6 +76,8 @@ async fn rds_oracle_create_describe_delete() { .await .expect("create oracle instance"); + // Oracle Free can take up to ~4 minutes to bootstrap. + helpers::wait_for_db_available(&client, "oracle-smoke", 360).await; let describe = client .describe_db_instances() .db_instance_identifier("oracle-smoke") @@ -126,6 +128,7 @@ async fn rds_sqlserver_create_describe_delete() { .await .expect("create sqlserver instance"); + helpers::wait_for_db_available(&client, "mssql-smoke", 300).await; let describe = client .describe_db_instances() .db_instance_identifier("mssql-smoke") @@ -167,6 +170,8 @@ async fn rds_db2_create_describe_delete() { .await .expect("create db2 instance"); + // Db2 Community Edition's first boot can take ~6 minutes. 
+ helpers::wait_for_db_available(&client, "db2-smoke", 480).await; let describe = client .describe_db_instances() .db_instance_identifier("db2-smoke") diff --git a/crates/fakecloud-e2e/tests/sdk.rs b/crates/fakecloud-e2e/tests/sdk.rs index f2fc9059..474796c8 100644 --- a/crates/fakecloud-e2e/tests/sdk.rs +++ b/crates/fakecloud-e2e/tests/sdk.rs @@ -136,6 +136,7 @@ async fn sdk_rds_get_instances() { .await .unwrap(); + helpers::wait_for_db_available(&rds, "sdk-rds-db", 240).await; let instances = fc.rds().get_instances().await.expect("get rds instances"); let instance = instances .instances diff --git a/crates/fakecloud-rds/src/service.rs b/crates/fakecloud-rds/src/service.rs index 65454105..0c417779 100644 --- a/crates/fakecloud-rds/src/service.rs +++ b/crates/fakecloud-rds/src/service.rs @@ -316,23 +316,14 @@ impl RdsService { event_categories: &[&str], message: &str, ) { - let Some(ref bus) = self.delivery_bus else { - return; - }; - let detail = serde_json::json!({ - "EventCategories": event_categories, - "SourceType": source_type.as_str(), - "SourceArn": source_arn, - "Date": Utc::now().to_rfc3339(), - "Message": message, - "SourceIdentifier": source_identifier, - "EventID": event_id, - }); - bus.put_event_to_eventbridge( - "aws.rds", - source_type.detail_type(), - &detail.to_string(), - "default", + emit_event_static( + self.delivery_bus.as_ref(), + source_type, + source_identifier, + source_arn, + event_id, + event_categories, + message, ); } @@ -491,72 +482,60 @@ impl RdsService { } } - let runtime = self.require_runtime()?; + let runtime = self.require_runtime()?.clone(); let logical_db_name = db_name .clone() .unwrap_or_else(|| default_db_name(&engine).to_string()); - let running = runtime - .ensure_postgres( - &db_instance_identifier, - &engine, - &engine_version, - &master_username, - &master_user_password, - &logical_db_name, - &request.account_id, - &request.region, - ) - .await - .map_err(|error| { - self.state - .write() - 
.get_or_create(&request.account_id) - .cancel_instance_creation(&db_instance_identifier); - runtime_error_to_service_error(error) - })?; - let mut accounts = self.state.write(); - let state = accounts.get_or_create(&request.account_id); + // Insert a "creating" placeholder synchronously and spawn the + // container start in a background task. CreateDBInstance returns + // ~immediately; DescribeDBInstances flips to "available" (or + // "failed") when the container is up. Matches AWS RDS behavior: + // CreateDBInstance never blocks on the container coming up. let created_at = Utc::now(); - let instance = DbInstance { - db_instance_identifier: db_instance_identifier.clone(), - db_instance_arn: state.db_instance_arn(&db_instance_identifier), - db_instance_class: db_instance_class.clone(), - engine: engine.clone(), - engine_version: engine_version.clone(), - db_instance_status: "available".to_string(), - master_username: master_username.clone(), - db_name: db_name.clone(), - endpoint_address: "127.0.0.1".to_string(), - port: i32::from(running.host_port), - allocated_storage, - publicly_accessible, - deletion_protection, - created_at, - dbi_resource_id: state.next_dbi_resource_id(), - master_user_password, - container_id: running.container_id, - host_port: running.host_port, - tags: Vec::new(), - read_replica_source_db_instance_identifier: None, - read_replica_db_instance_identifiers: Vec::new(), - vpc_security_group_ids, - db_parameter_group_name, - backup_retention_period, - preferred_backup_window, - latest_restorable_time: if backup_retention_period > 0 { - Some(created_at) - } else { - None - }, - option_group_name, - multi_az, - pending_modified_values: None, + let instance = { + let mut accounts = self.state.write(); + let state = accounts.get_or_create(&request.account_id); + let placeholder = DbInstance { + db_instance_identifier: db_instance_identifier.clone(), + db_instance_arn: state.db_instance_arn(&db_instance_identifier), + db_instance_class: 
db_instance_class.clone(), + engine: engine.clone(), + engine_version: engine_version.clone(), + db_instance_status: "creating".to_string(), + master_username: master_username.clone(), + db_name: db_name.clone(), + endpoint_address: String::new(), + port: 0, + allocated_storage, + publicly_accessible, + deletion_protection, + created_at, + dbi_resource_id: state.next_dbi_resource_id(), + master_user_password: master_user_password.clone(), + container_id: String::new(), + host_port: 0, + tags: Vec::new(), + read_replica_source_db_instance_identifier: None, + read_replica_db_instance_identifiers: Vec::new(), + vpc_security_group_ids, + db_parameter_group_name, + backup_retention_period, + preferred_backup_window, + latest_restorable_time: if backup_retention_period > 0 { + Some(created_at) + } else { + None + }, + option_group_name, + multi_az, + pending_modified_values: None, + }; + state.finish_instance_creation(placeholder.clone()); + placeholder }; - state.finish_instance_creation(instance.clone()); let instance_arn = instance.db_instance_arn.clone(); - drop(accounts); self.emit_event( RdsSourceType::DbInstance, @@ -567,13 +546,71 @@ impl RdsService { "DB instance created", ); + { + let state_handle = self.state.clone(); + let delivery_bus = self.delivery_bus.clone(); + let runtime = runtime.clone(); + let id = db_instance_identifier.clone(); + let engine = engine.clone(); + let engine_version = engine_version.clone(); + let master_username = master_username.clone(); + let master_user_password = master_user_password.clone(); + let account_id = request.account_id.clone(); + let region = request.region.clone(); + let arn = instance_arn.clone(); + tokio::spawn(async move { + match runtime + .ensure_postgres( + &id, + &engine, + &engine_version, + &master_username, + &master_user_password, + &logical_db_name, + &account_id, + ®ion, + ) + .await + { + Ok(running) => { + let mut accounts = state_handle.write(); + let state = accounts.get_or_create(&account_id); + if 
let Some(inst) = state.instances.get_mut(&id) { + inst.db_instance_status = "available".to_string(); + inst.endpoint_address = "127.0.0.1".to_string(); + inst.port = i32::from(running.host_port); + inst.host_port = running.host_port; + inst.container_id = running.container_id; + } + } + Err(error) => { + tracing::error!(%error, db_instance_identifier=%id, "create_db_instance background task failed"); + { + let mut accounts = state_handle.write(); + let state = accounts.get_or_create(&account_id); + state.instances.remove(&id); + } + emit_event_static( + delivery_bus.as_ref(), + RdsSourceType::DbInstance, + &id, + &arn, + "RDS-EVENT-0058", + &["failure"], + &format!("DB instance failed to create: {}", error), + ); + } + } + }); + } + Ok(AwsResponse::xml( StatusCode::OK, xml_wrap( "CreateDBInstance", &format!( "{}", - db_instance_xml(&instance, Some("creating")) + db_instance_xml(&instance, None) ), &request.request_id, ), @@ -2692,6 +2729,37 @@ fn tag_xml(tag: &RdsTag) -> String { ) } +/// Free-standing version of `emit_event` so background tasks (which +/// don't have a `&self`) can publish RDS events through the same path. 
+pub(crate) fn emit_event_static( + delivery_bus: Option<&Arc>, + source_type: RdsSourceType, + source_identifier: &str, + source_arn: &str, + event_id: &str, + event_categories: &[&str], + message: &str, +) { + let Some(bus) = delivery_bus else { + return; + }; + let detail = serde_json::json!({ + "EventCategories": event_categories, + "SourceType": source_type.as_str(), + "SourceArn": source_arn, + "Date": Utc::now().to_rfc3339(), + "Message": message, + "SourceIdentifier": source_identifier, + "EventID": event_id, + }); + bus.put_event_to_eventbridge( + "aws.rds", + source_type.detail_type(), + &detail.to_string(), + "default", + ); +} + fn db_instance_xml(instance: &DbInstance, status_override: Option<&str>) -> String { let status = status_override.unwrap_or(&instance.db_instance_status); let db_name_xml = instance @@ -2829,6 +2897,20 @@ fn db_instance_xml(instance: &DbInstance, status_override: Option<&str>) -> Stri }) .unwrap_or_default(); + // Endpoint is suppressed while the container is still coming up so + // SDK callers don't try to dial an empty host:0. Once the background + // task fills in `endpoint_address` and `port`, DescribeDBInstances + // returns the real endpoint. + let endpoint_xml = if instance.endpoint_address.is_empty() || instance.port == 0 { + String::new() + } else { + format!( + "
{}
{}
", + xml_escape(&instance.endpoint_address), + instance.port + ) + }; + format!( "{identifier}\ {class}\ @@ -2836,7 +2918,7 @@ fn db_instance_xml(instance: &DbInstance, status_override: Option<&str>) -> Stri {status}\ {master_username}\ {db_name_xml}\ -
{endpoint_address}
{port}
\ + {endpoint_xml}\ {allocated_storage}\ {create_time}\ {preferred_backup_window}\ @@ -2867,7 +2949,6 @@ fn db_instance_xml(instance: &DbInstance, status_override: Option<&str>) -> Stri engine = xml_escape(&instance.engine), status = xml_escape(status), master_username = xml_escape(&instance.master_username), - endpoint_address = xml_escape(&instance.endpoint_address), port = instance.port, allocated_storage = instance.allocated_storage, create_time = instance.created_at.to_rfc3339(), diff --git a/crates/fakecloud-server/src/main.rs b/crates/fakecloud-server/src/main.rs index 92663dcb..c68f62ee 100644 --- a/crates/fakecloud-server/src/main.rs +++ b/crates/fakecloud-server/src/main.rs @@ -1761,6 +1761,34 @@ async fn main() { "loaded rds persistence snapshot (migrated from v1)", ); } + // Drop any `creating` placeholder rows the snapshot + // captured mid-CreateDBInstance. The background + // container-start task didn't survive the restart, + // so the placeholder would otherwise be stuck in + // `creating` forever. Dropping them is safe — the + // user can retry CreateDBInstance. 
+ { + let mut mas = rds_state.write(); + for (_, state) in mas.iter_mut() { + let stuck: Vec = state + .instances + .iter() + .filter(|(_, inst)| { + inst.db_instance_status == "creating" + }) + .map(|(id, _)| id.clone()) + .collect(); + for id in &stuck { + state.instances.remove(id); + } + if !stuck.is_empty() { + tracing::warn!( + count = stuck.len(), + "dropped stuck `creating` rds instances after persistence load", + ); + } + } + } } Err(err) => fatal_exit(format_args!( "failed to parse rds persistence snapshot: {err}" diff --git a/sdks/typescript/tests/e2e.test.ts b/sdks/typescript/tests/e2e.test.ts index 587dc0b4..7a5f16bf 100644 --- a/sdks/typescript/tests/e2e.test.ts +++ b/sdks/typescript/tests/e2e.test.ts @@ -46,7 +46,11 @@ import { CreateScheduleCommand, FlexibleTimeWindowMode, } from "@aws-sdk/client-scheduler"; -import { RDSClient, CreateDBInstanceCommand } from "@aws-sdk/client-rds"; +import { + RDSClient, + CreateDBInstanceCommand, + DescribeDBInstancesCommand, +} from "@aws-sdk/client-rds"; import { ElastiCacheClient, CreateCacheClusterCommand, @@ -101,8 +105,9 @@ describe("health", () => { }); describe("rds", () => { - // First run on a fresh runner builds the fakecloud-postgres image - // (plpython3u + aws_lambda extension files); allow up to 3 minutes. + // CreateDBInstance returns ~immediately with status `creating`. Poll + // DescribeDBInstances until the container is up; the underlying image + // pull/build still takes up to ~3 minutes on a cold runner. 
it("getInstances() returns fakecloud-managed DB instances", async () => { const rds = new RDSClient(awsConfig()); @@ -119,6 +124,18 @@ describe("rds", () => { }), ); + const deadline = Date.now() + 240_000; + while (Date.now() < deadline) { + const desc = await rds.send( + new DescribeDBInstancesCommand({ + DBInstanceIdentifier: "ts-rds-db", + }), + ); + const status = desc.DBInstances?.[0]?.DBInstanceStatus; + if (status === "available") break; + await new Promise((r) => setTimeout(r, 1000)); + } + const result = await fc.rds.getInstances(); const instance = result.instances.find( (candidate) => candidate.dbInstanceIdentifier === "ts-rds-db", @@ -128,7 +145,7 @@ describe("rds", () => { expect(instance!.dbName).toBe("appdb"); expect(instance!.containerId.length).toBeGreaterThan(0); expect(instance!.hostPort).toBeGreaterThan(0); - }, 180_000); + }, 300_000); }); // ── ElastiCache ───────────────────────────────────────────────────── diff --git a/website/content/docs/services/rds.md b/website/content/docs/services/rds.md index 2f40bb5c..f38669f7 100644 --- a/website/content/docs/services/rds.md +++ b/website/content/docs/services/rds.md @@ -120,6 +120,34 @@ The `options` argument is forwarded verbatim into the underlying postgres `COPY` The bridges (`/_fakecloud/rds/s3-import`, `/_fakecloud/rds/s3-export`) read and write the in-memory S3 state of the same fakecloud server, so any object that's visible to a `GetObject`/`PutObject` call against fakecloud is reachable from `aws_s3`. +## Asynchronous instance creation + +`CreateDBInstance` returns ~immediately with `DBInstanceStatus = "creating"`. The container start (and the underlying image pull/build for postgres) runs in the background; `DescribeDBInstances` returns the live status. 
Callers should poll until the status flips to `available` before connecting: + +```python +import boto3 +import time + +rds = boto3.client("rds", endpoint_url="http://localhost:4566") +rds.create_db_instance( + DBInstanceIdentifier="my-db", + Engine="postgres", + EngineVersion="16.3", + MasterUsername="admin", + MasterUserPassword="secret123", + AllocatedStorage=20, + DBInstanceClass="db.t3.micro", +) + +while True: + desc = rds.describe_db_instances(DBInstanceIdentifier="my-db") + if desc["DBInstances"][0]["DBInstanceStatus"] == "available": + break + time.sleep(1) +``` + +This matches AWS RDS behavior — real `CreateDBInstance` also never blocks on the container coming up. The `Endpoint` element is omitted from create/describe responses while the instance is still in `creating`. + ## How the Docker integration works When you call `CreateDBInstance` for PostgreSQL/MySQL/MariaDB/Oracle/SQL Server/Db2, fakecloud starts a real Docker container running the upstream image for that engine and version, waits for it to be ready, and reports the mapped host port. Your application connects to that port like it would connect to any database.