diff --git a/known_issues/18_tf_env_azure.md b/known_issues/18_tf_env_azure.md index 84d908cf..84723138 100644 --- a/known_issues/18_tf_env_azure.md +++ b/known_issues/18_tf_env_azure.md @@ -1,6 +1,6 @@ # Known Issues: Terraform Azure Environment -> **Audit status (2026-04-20):** `0 still valid · 7 resolved · 1 partially fixed · 0 moved · 0 needs triage` +> **Audit status (2026-04-25):** `0 still valid · 8 resolved · 0 partially fixed · 0 moved · 0 needs triage` ## ~~CRITICAL: `nonsensitive()` strips sensitivity from `additional_secrets` before merge~~ — RESOLVED @@ -95,18 +95,17 @@ **Effort:** `small` (contingent on triage) -## ~~HIGH: `SCHEDULED_TASK_SECRET` injected as plain-text environment variable~~ — PARTIALLY RESOLVED +## ~~HIGH: `SCHEDULED_TASK_SECRET` injected as plain-text environment variable~~ — RESOLVED **File**: `terraform/environments/azure/compute.tf:42` **Description**: The scheduled-task shared secret was injected into the Container App as a plaintext env var, visible via `az containerapp show`, the Azure Portal, and exported ARM templates. -**Status:** ✔️ Partially resolved — runtime env-var leak closed; Logic App path deferred. +**Status:** ✔️ Fully resolved — runtime env-var leak closed AND Logic App path migrated to Key Vault references (issue #50). **Resolved by:** - Container App env var switched from `SCHEDULED_TASK_SECRET = ` to `SCHEDULED_TASK_SECRET_NAME = `. `az containerapp show` now reveals only the secret name. - `ApplicationConfig.ScheduledTaskSecretName` added. `NewApplicationFromDeps` resolves it via the existing `SecretResolver` (Azure Key Vault / AWS Secrets Manager) at startup and populates `ScheduledTaskSecret` in memory. Falls back to the plaintext `SCHEDULED_TASK_SECRET` env var if the lookup fails or the resolver isn't configured, so dev/local runs still work. 
- -**Deferred:** The Logic App workflow (`scheduled-tasks.tf:48,106`) still interpolates `var.scheduled_task_secret` into its outgoing `Authorization: Bearer …` header. This value is stored inside the Logic App resource + Terraform state; removing it requires migrating to Azure Logic Apps Key Vault connections (`@parameters('kv-secret')`), a larger refactor. The container-app side — which is what was actually flagged in the audit — no longer leaks. +- Logic App workflows now have a system-assigned managed identity granted "Key Vault Secrets User" on the vault; each workflow's first action GETs the secret from KV at runtime via that identity, and the call-endpoint action references `@body('get-secret')['value']` in its outgoing Authorization header. The plaintext value no longer lives in the workflow definition or Terraform state. See `terraform/modules/compute/azure/container-apps/scheduled-tasks.tf` and PR resolving #50. ### Original implementation plan diff --git a/terraform/environments/azure/.terraform.lock.hcl b/terraform/environments/azure/.terraform.lock.hcl index 7261db99..8f9ec233 100644 --- a/terraform/environments/azure/.terraform.lock.hcl +++ b/terraform/environments/azure/.terraform.lock.hcl @@ -159,3 +159,23 @@ provider "registry.terraform.io/hashicorp/random" { "zh:f49fd62aa8c5525a5c17abd51e27ca5e213881d58882fd42fec4a545b53c9699", ] } + +provider "registry.terraform.io/hashicorp/time" { + version = "0.13.1" + constraints = "~> 0.11" + hashes = [ + "h1:ZT5ppCNIModqk3iOkVt5my8b8yBHmDpl663JtXAIRqM=", + "zh:02cb9aab1002f0f2a94a4f85acec8893297dc75915f7404c165983f720a54b74", + "zh:04429b2b31a492d19e5ecf999b116d396dac0b24bba0d0fb19ecaefe193fdb8f", + "zh:26f8e51bb7c275c404ba6028c1b530312066009194db721a8427a7bc5cdbc83a", + "zh:772ff8dbdbef968651ab3ae76d04afd355c32f8a868d03244db3f8496e462690", + "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", + "zh:898db5d2b6bd6ca5457dccb52eedbc7c5b1a71e4a4658381bcbb38cedbbda328", + 
"zh:8de913bf09a3fa7bedc29fec18c47c571d0c7a3d0644322c46f3aa648cf30cd8", + "zh:9402102c86a87bdfe7e501ffbb9c685c32bbcefcfcf897fd7d53df414c36877b", + "zh:b18b9bb1726bb8cfbefc0a29cf3657c82578001f514bcf4c079839b6776c47f0", + "zh:b9d31fdc4faecb909d7c5ce41d2479dd0536862a963df434be4b16e8e4edc94d", + "zh:c951e9f39cca3446c060bd63933ebb89cedde9523904813973fbc3d11863ba75", + "zh:e5b773c0d07e962291be0e9b413c7a22c044b8c7b58c76e8aa91d1659990dfb5", + ] +} diff --git a/terraform/environments/azure/compute.tf b/terraform/environments/azure/compute.tf index 51f1bc9d..25d20b17 100644 --- a/terraform/environments/azure/compute.tf +++ b/terraform/environments/azure/compute.tf @@ -57,17 +57,14 @@ module "compute_container_apps" { # Scheduled tasks (Logic Apps) # - # scheduled_task_secret still passes the plaintext value here because the - # Logic App workflow definition embeds it in its outgoing "Authorization: - # Bearer ..." header (see scheduled-tasks.tf). Azure Logic Apps DO support - # Key Vault references via @parameters() + a Key Vault connection, but - # that's a larger refactor. This value is stored in Terraform state and - # in the Logic App resource, but is no longer exposed in the Container - # App's env vars — `az containerapp show` now reveals only the secret - # name, not the value. - enable_scheduled_tasks = var.enable_scheduled_tasks - scheduled_task_secret = module.secrets.scheduled_task_secret_value - recommendation_schedule = var.recommendation_schedule + # Each Logic App workflow has a system-assigned managed identity that holds + # "Key Vault Secrets User" on the same Key Vault as the Container App. The + # workflow's first action GETs scheduled-task-secret from KV via that + # identity at runtime; the value never lands in the workflow definition or + # Terraform state. We only pass the *name* of the secret, not the value. 
+ enable_scheduled_tasks = var.enable_scheduled_tasks + scheduled_task_secret_name = module.secrets.scheduled_task_secret_name + recommendation_schedule = var.recommendation_schedule # RI exchange automation enable_ri_exchange_schedule = var.enable_ri_exchange_schedule diff --git a/terraform/modules/compute/azure/container-apps/scheduled-tasks.tf b/terraform/modules/compute/azure/container-apps/scheduled-tasks.tf index 0108942c..cd295e59 100644 --- a/terraform/modules/compute/azure/container-apps/scheduled-tasks.tf +++ b/terraform/modules/compute/azure/container-apps/scheduled-tasks.tf @@ -1,5 +1,17 @@ # Azure Logic Apps for scheduled tasks on Container Apps # This is the Azure equivalent of AWS EventBridge + Lambda or GCP Cloud Scheduler + Cloud Run +# +# SECURITY: The shared scheduled-task secret is NEVER interpolated into the +# workflow definition or Terraform state. Each Logic App workflow has a +# system-assigned managed identity that holds "Key Vault Secrets User" on the +# vault that stores `scheduled-task-secret`. At workflow runtime the first +# action (`get-secret`) calls the Key Vault data-plane REST API authenticated +# by the workflow's managed identity, and the call-endpoint action references +# `@body('get-secret')['value']` in the outgoing Authorization header. +# +# Effect: `terraform show` / `az logic workflow show` only ever reveal the Key Vault +# URL the workflow is going to call — the actual secret value is fetched +# in-process by the Logic Apps engine and never lands in any persisted artifact. # Parse cron schedule into Logic Apps recurrence format # Azure Logic Apps uses a different format than cron @@ -8,9 +20,47 @@ locals { # For simplicity, support daily schedules at a specific hour # Full cron parsing would require more complex logic schedule_hour = var.enable_scheduled_tasks ? split(" ", var.recommendation_schedule)[1] : "2" + + # Data-plane URL of the scheduled-task secret in Key Vault.
The Logic App + # workflows fetch this at runtime via managed identity. `key_vault_uri` + # already includes the trailing slash (e.g. https://<vault-name>.vault.azure.net/). + scheduled_task_secret_url = ( + var.enable_scheduled_tasks || var.enable_ri_exchange_schedule + ) ? "${var.key_vault_uri}secrets/${var.scheduled_task_secret_name}?api-version=7.4" : "" } +# Plan-time guard: if any scheduled-task workflow is enabled, the secret name +# and key vault URI must be set correctly. Without these checks, an empty +# scheduled_task_secret_name silently produces `/secrets/?api-version=...` +# (the list-secrets endpoint), and a key_vault_uri without a trailing slash +# breaks the URL. Both surface late as runtime 401/403; the precondition +# fails them at plan/apply instead. +# +# Why a precondition rather than `validation` blocks on the variables: variable +# validation can only reference its own `var.<name>` (before Terraform 1.9) and runs unconditionally, +# so it can't say "non-empty WHEN the schedule is enabled" without breaking +# legitimate disabled-by-default callers. The character-set half of the rule +# (no `/?# ` in the secret name) does live on the variable itself — see the +# `validation` block on `scheduled_task_secret_name` in variables.tf. +resource "terraform_data" "scheduled_task_secret_preconditions" { + count = (var.enable_scheduled_tasks || var.enable_ri_exchange_schedule) ? 1 : 0 + + lifecycle { + precondition { + condition = length(var.scheduled_task_secret_name) > 0 + error_message = "scheduled_task_secret_name must be set when enable_scheduled_tasks or enable_ri_exchange_schedule is true." + } + precondition { + condition = endswith(var.key_vault_uri, "/") + error_message = "key_vault_uri must end with '/' (e.g. https://<vault-name>.vault.azure.net/)."
+ } + } +} + +# ============================================== # Logic App workflow for recommendations refresh +# ============================================== + resource "azurerm_logic_app_workflow" "recommendations" { count = var.enable_scheduled_tasks ? 1 : 0 @@ -18,6 +68,12 @@ resource "azurerm_logic_app_workflow" "recommendations" { location = var.location resource_group_name = var.resource_group_name + # System-assigned managed identity used to read the shared secret from + # Key Vault at workflow runtime. See header comment. + identity { + type = "SystemAssigned" + } + tags = var.tags } @@ -33,27 +89,77 @@ resource "azurerm_logic_app_trigger_recurrence" "daily" { time_zone = "UTC" } -# HTTP action to call Container App endpoint -resource "azurerm_logic_app_action_http" "call_recommendations" { +# Step 1: Fetch the shared secret from Key Vault using the workflow's +# system-assigned managed identity. The secret value lives in the workflow +# run's transient state only — never in the workflow definition or TF state. +resource "azurerm_logic_app_action_custom" "recommendations_get_secret" { count = var.enable_scheduled_tasks ? 
1 : 0 - name = "call-recommendations-endpoint" + name = "get-secret" logic_app_id = azurerm_logic_app_workflow.recommendations[0].id - method = "POST" - uri = "https://${azurerm_container_app.main.ingress[0].fqdn}/api/scheduled/recommendations" + body = jsonencode({ + type = "Http" + inputs = { + method = "GET" + uri = local.scheduled_task_secret_url + authentication = { + type = "ManagedServiceIdentity" + audience = "https://vault.azure.net" + } + } + runAfter = {} + runtimeConfiguration = { + secureData = { + properties = ["outputs"] + } + } + }) - headers = { - "Content-Type" = "application/json" - "Authorization" = "Bearer ${var.scheduled_task_secret}" - "X-Trigger" = "scheduled" - "X-Source" = "azure-logic-apps" - } + # Create the role assignment before this action so the grant exists by the + # time the workflow can first run. NOTE(review): depends_on only orders + # resource creation; it does not wait for Azure RBAC propagation, so an + # immediate manual run after apply may still 403 — confirm this is acceptable. + depends_on = [azurerm_role_assignment.recommendations_kv_secrets_user] +} + +# Step 2: Call the Container App scheduled-recommendations endpoint, using +# the secret pulled by the previous action. `@body('get-secret')['value']` is +# evaluated by the Logic Apps engine at runtime; it is never expanded into +# the persisted workflow definition. +resource "azurerm_logic_app_action_custom" "call_recommendations" { + count = var.enable_scheduled_tasks ? 
1 : 0 + + name = "call-recommendations-endpoint" + logic_app_id = azurerm_logic_app_workflow.recommendations[0].id body = jsonencode({ - source = "azure-logic-apps" - timestamp = "@{utcNow()}" + type = "Http" + inputs = { + method = "POST" + uri = "https://${azurerm_container_app.main.ingress[0].fqdn}/api/scheduled/recommendations" + headers = { + "Content-Type" = "application/json" + "Authorization" = "Bearer @{body('get-secret')['value']}" + "X-Trigger" = "scheduled" + "X-Source" = "azure-logic-apps" + } + body = { + source = "azure-logic-apps" + timestamp = "@{utcNow()}" + } + } + runAfter = { + "get-secret" = ["Succeeded"] + } + runtimeConfiguration = { + secureData = { + properties = ["inputs"] + } + } }) + + depends_on = [azurerm_logic_app_action_custom.recommendations_get_secret] } # ============================================== @@ -78,6 +184,10 @@ resource "azurerm_logic_app_workflow" "ri_exchange" { location = var.location resource_group_name = var.resource_group_name + identity { + type = "SystemAssigned" + } + tags = var.tags } @@ -92,29 +202,73 @@ resource "azurerm_logic_app_trigger_recurrence" "ri_exchange" { time_zone = "UTC" } -resource "azurerm_logic_app_action_http" "call_ri_exchange" { +resource "azurerm_logic_app_action_custom" "ri_exchange_get_secret" { count = var.enable_ri_exchange_schedule ? 1 : 0 - name = "call-ri-exchange-endpoint" + name = "get-secret" logic_app_id = azurerm_logic_app_workflow.ri_exchange[0].id - method = "POST" - uri = "https://${azurerm_container_app.main.ingress[0].fqdn}/api/scheduled/ri-exchange" + body = jsonencode({ + type = "Http" + inputs = { + method = "GET" + uri = local.scheduled_task_secret_url + authentication = { + type = "ManagedServiceIdentity" + audience = "https://vault.azure.net" + } + } + runAfter = {} + runtimeConfiguration = { + secureData = { + properties = ["outputs"] + } + } + }) + + # See recommendations_get_secret.depends_on rationale. 
+ depends_on = [azurerm_role_assignment.ri_exchange_kv_secrets_user] +} - headers = { - "Content-Type" = "application/json" - "Authorization" = "Bearer ${var.scheduled_task_secret}" - "X-Trigger" = "scheduled" - "X-Source" = "azure-logic-apps" - } +resource "azurerm_logic_app_action_custom" "call_ri_exchange" { + count = var.enable_ri_exchange_schedule ? 1 : 0 + + name = "call-ri-exchange-endpoint" + logic_app_id = azurerm_logic_app_workflow.ri_exchange[0].id body = jsonencode({ - source = "azure-logic-apps" - timestamp = "@{utcNow()}" + type = "Http" + inputs = { + method = "POST" + uri = "https://${azurerm_container_app.main.ingress[0].fqdn}/api/scheduled/ri-exchange" + headers = { + "Content-Type" = "application/json" + "Authorization" = "Bearer @{body('get-secret')['value']}" + "X-Trigger" = "scheduled" + "X-Source" = "azure-logic-apps" + } + body = { + source = "azure-logic-apps" + timestamp = "@{utcNow()}" + } + } + runAfter = { + "get-secret" = ["Succeeded"] + } + runtimeConfiguration = { + secureData = { + properties = ["inputs"] + } + } }) + + depends_on = [azurerm_logic_app_action_custom.ri_exchange_get_secret] } +# ============================================== # Logic App workflow for cleanup (sessions and executions) +# ============================================== + resource "azurerm_logic_app_workflow" "cleanup" { count = var.enable_scheduled_tasks ? 1 : 0 @@ -122,6 +276,10 @@ resource "azurerm_logic_app_workflow" "cleanup" { location = var.location resource_group_name = var.resource_group_name + identity { + type = "SystemAssigned" + } + tags = var.tags } @@ -137,25 +295,102 @@ resource "azurerm_logic_app_trigger_recurrence" "cleanup_daily" { time_zone = "UTC" } -# HTTP action to call cleanup endpoint -resource "azurerm_logic_app_action_http" "call_cleanup" { +resource "azurerm_logic_app_action_custom" "cleanup_get_secret" { count = var.enable_scheduled_tasks ? 
1 : 0 - name = "call-cleanup-endpoint" + name = "get-secret" logic_app_id = azurerm_logic_app_workflow.cleanup[0].id - method = "POST" - uri = "https://${azurerm_container_app.main.ingress[0].fqdn}/api/scheduled/cleanup" + body = jsonencode({ + type = "Http" + inputs = { + method = "GET" + uri = local.scheduled_task_secret_url + authentication = { + type = "ManagedServiceIdentity" + audience = "https://vault.azure.net" + } + } + runAfter = {} + runtimeConfiguration = { + secureData = { + properties = ["outputs"] + } + } + }) + + # See recommendations_get_secret.depends_on rationale. + depends_on = [azurerm_role_assignment.cleanup_kv_secrets_user] +} - headers = { - "Content-Type" = "application/json" - "Authorization" = "Bearer ${var.scheduled_task_secret}" - "X-Trigger" = "scheduled" - "X-Source" = "azure-logic-apps" - } +resource "azurerm_logic_app_action_custom" "call_cleanup" { + count = var.enable_scheduled_tasks ? 1 : 0 + + name = "call-cleanup-endpoint" + logic_app_id = azurerm_logic_app_workflow.cleanup[0].id body = jsonencode({ - dryRun = false - source = "azure-logic-apps" + type = "Http" + inputs = { + method = "POST" + uri = "https://${azurerm_container_app.main.ingress[0].fqdn}/api/scheduled/cleanup" + headers = { + "Content-Type" = "application/json" + "Authorization" = "Bearer @{body('get-secret')['value']}" + "X-Trigger" = "scheduled" + "X-Source" = "azure-logic-apps" + } + body = { + dryRun = false + source = "azure-logic-apps" + } + } + runAfter = { + "get-secret" = ["Succeeded"] + } + runtimeConfiguration = { + secureData = { + properties = ["inputs"] + } + } }) + + depends_on = [azurerm_logic_app_action_custom.cleanup_get_secret] +} + +# ============================================== +# RBAC: grant each Logic App's managed identity read access to the +# scheduled-task secret in Key Vault. +# ============================================== + +# Use a single role assignment per workflow rather than per secret. 
The grant +# is "Key Vault Secrets User" (read-only) scoped to the whole vault, matching +# the same pattern used by the container app's runtime identity. Vault-scoped +# read access is acceptable here because each workflow only ever reads one +# specific secret URL embedded in its definition. NOTE(review): Azure RBAC can +# also scope this role to a single secret (<vault-id>/secrets/<name>) without +# splitting the vault; narrowing the grant is left as follow-up work. + +resource "azurerm_role_assignment" "recommendations_kv_secrets_user" { + count = var.enable_scheduled_tasks ? 1 : 0 + + scope = var.key_vault_id + role_definition_name = "Key Vault Secrets User" + principal_id = azurerm_logic_app_workflow.recommendations[0].identity[0].principal_id +} + +resource "azurerm_role_assignment" "ri_exchange_kv_secrets_user" { + count = var.enable_ri_exchange_schedule ? 1 : 0 + + scope = var.key_vault_id + role_definition_name = "Key Vault Secrets User" + principal_id = azurerm_logic_app_workflow.ri_exchange[0].identity[0].principal_id +} + +resource "azurerm_role_assignment" "cleanup_kv_secrets_user" { + count = var.enable_scheduled_tasks ? 1 : 0 + + scope = var.key_vault_id + role_definition_name = "Key Vault Secrets User" + principal_id = azurerm_logic_app_workflow.cleanup[0].identity[0].principal_id } diff --git a/terraform/modules/compute/azure/container-apps/variables.tf b/terraform/modules/compute/azure/container-apps/variables.tf index fed6f9ad..b493154a 100644 --- a/terraform/modules/compute/azure/container-apps/variables.tf +++ b/terraform/modules/compute/azure/container-apps/variables.tf @@ -204,11 +204,22 @@ variable "enable_scheduled_tasks" { default = true } -variable "scheduled_task_secret" { - description = "Shared secret for authenticating scheduled task HTTP calls" +variable "scheduled_task_secret_name" { + description = "Key Vault secret name (NOT the value) holding the shared secret for authenticating scheduled task HTTP calls. 
The Logic App workflows fetch this secret at runtime via their managed identity, so the plaintext never lands in the workflow definition or Terraform state. Must be non-empty when enable_scheduled_tasks or enable_ri_exchange_schedule is true (enforced by a precondition in scheduled-tasks.tf)." type = string default = "" - sensitive = true + + # Validation runs unconditionally and can't reference sibling variables, so + # the "non-empty when scheduled tasks are enabled" rule lives in the + # `terraform_data.scheduled_task_secret_preconditions` resource in + # scheduled-tasks.tf. This block handles only the character-set half: if a + # value is supplied, it must be a bare Key Vault secret name — no `/`, `?`, + # `#`, or whitespace (`\s`: space, tab, newline, …) — so it can't escape the + # `${key_vault_uri}secrets/${name}?api-version=…` URL template. + validation { + condition = var.scheduled_task_secret_name == "" || !can(regex("[/?#\\s]", var.scheduled_task_secret_name)) + error_message = "scheduled_task_secret_name must be a bare Key Vault secret name (no '/', '?', '#', or whitespace)." + } } variable "recommendation_schedule" {