diff --git a/gen/api/v1/metrics_collector.pb.go b/gen/api/v1/metrics_collector.pb.go index e163d025..f3fa9cea 100644 --- a/gen/api/v1/metrics_collector.pb.go +++ b/gen/api/v1/metrics_collector.pb.go @@ -190,6 +190,11 @@ const ( ResourceType_RESOURCE_TYPE_WORKLOAD_RULE_CPU_THROTTLE ResourceType = 58 // CloudNativePG Cluster (postgresql.cnpg.io/v1) ResourceType_RESOURCE_TYPE_CNPG_CLUSTER ResourceType = 59 + // Container health events (direct path, not CRD-sourced) + ResourceType_RESOURCE_TYPE_CONTAINER_OOM_EVENT ResourceType = 60 + ResourceType_RESOURCE_TYPE_CONTAINER_CRASHLOOP_EVENT ResourceType = 61 + ResourceType_RESOURCE_TYPE_CONTAINER_STARTUP_LIFECYCLE ResourceType = 62 + ResourceType_RESOURCE_TYPE_CONTAINER_CPU_THROTTLE_EVENT ResourceType = 63 // Cluster snapshot type ResourceType_RESOURCE_TYPE_CLUSTER_SNAPSHOT ResourceType = 77 ) @@ -257,70 +262,78 @@ var ( 57: "RESOURCE_TYPE_WORKLOAD_RULE_OOM", 58: "RESOURCE_TYPE_WORKLOAD_RULE_CPU_THROTTLE", 59: "RESOURCE_TYPE_CNPG_CLUSTER", + 60: "RESOURCE_TYPE_CONTAINER_OOM_EVENT", + 61: "RESOURCE_TYPE_CONTAINER_CRASHLOOP_EVENT", + 62: "RESOURCE_TYPE_CONTAINER_STARTUP_LIFECYCLE", + 63: "RESOURCE_TYPE_CONTAINER_CPU_THROTTLE_EVENT", 77: "RESOURCE_TYPE_CLUSTER_SNAPSHOT", } ResourceType_value = map[string]int32{ - "RESOURCE_TYPE_UNSPECIFIED": 0, - "RESOURCE_TYPE_NODE": 1, - "RESOURCE_TYPE_POD": 2, - "RESOURCE_TYPE_NAMESPACE": 3, - "RESOURCE_TYPE_EVENT": 4, - "RESOURCE_TYPE_ENDPOINTS": 5, - "RESOURCE_TYPE_SERVICE_ACCOUNT": 6, - "RESOURCE_TYPE_LIMIT_RANGE": 7, - "RESOURCE_TYPE_RESOURCE_QUOTA": 8, - "RESOURCE_TYPE_DEPLOYMENT": 9, - "RESOURCE_TYPE_STATEFUL_SET": 10, - "RESOURCE_TYPE_DAEMON_SET": 11, - "RESOURCE_TYPE_REPLICA_SET": 12, - "RESOURCE_TYPE_REPLICATION_CONTROLLER": 13, - "RESOURCE_TYPE_JOB": 14, - "RESOURCE_TYPE_CRON_JOB": 15, - "RESOURCE_TYPE_PERSISTENT_VOLUME_CLAIM": 16, - "RESOURCE_TYPE_PERSISTENT_VOLUME": 17, - "RESOURCE_TYPE_STORAGE_CLASS": 18, - "RESOURCE_TYPE_SERVICE": 19, - "RESOURCE_TYPE_INGRESS": 20, - "RESOURCE_TYPE_INGRESS_CLASS": 21, - "RESOURCE_TYPE_NETWORK_POLICY": 22, - "RESOURCE_TYPE_ROLE": 23, - "RESOURCE_TYPE_ROLE_BINDING": 24, - "RESOURCE_TYPE_CLUSTER_ROLE": 25, - "RESOURCE_TYPE_CLUSTER_ROLE_BINDING": 26, - "RESOURCE_TYPE_HORIZONTAL_POD_AUTOSCALER": 27, - "RESOURCE_TYPE_VERTICAL_POD_AUTOSCALER": 28, - "RESOURCE_TYPE_POD_DISRUPTION_BUDGET": 29, - "RESOURCE_TYPE_POD_SECURITY_POLICY": 30, - "RESOURCE_TYPE_CUSTOM_RESOURCE_DEFINITION": 31, - "RESOURCE_TYPE_CUSTOM_RESOURCE": 32, - "RESOURCE_TYPE_CONFIG_MAP": 33, - "RESOURCE_TYPE_SECRET": 34, - "RESOURCE_TYPE_CONTAINER": 35, - "RESOURCE_TYPE_NODE_RESOURCE": 36, - "RESOURCE_TYPE_CONTAINER_RESOURCE": 37, - "RESOURCE_TYPE_CLUSTER": 38, - "RESOURCE_TYPE_CSI_NODE": 39, - "RESOURCE_TYPE_KARPENTER": 40, - "RESOURCE_TYPE_DATADOG": 41, - "RESOURCE_TYPE_ARGO_ROLLOUTS": 42, - "RESOURCE_TYPE_KEDA": 43, - "RESOURCE_TYPE_KEDA_SCALED_OBJECT": 44, - "RESOURCE_TYPE_KEDA_SCALED_JOB": 45, - "RESOURCE_TYPE_CSI_DRIVER": 46, - "RESOURCE_TYPE_CSI_STORAGE_CAPACITY": 47, - "RESOURCE_TYPE_VOLUME_ATTACHMENT": 48, - "RESOURCE_TYPE_KUBEFLOW_NOTEBOOK": 49, - "RESOURCE_TYPE_VOLCANO_JOB": 50, - "RESOURCE_TYPE_SPARK_APPLICATION": 51, - "RESOURCE_TYPE_SCHEDULED_SPARK_APPLICATION": 52, - "RESOURCE_TYPE_CRON_VOLCANO_JOB": 53, - "RESOURCE_TYPE_PVC_METRICS": 54, - "RESOURCE_TYPE_WORKLOAD_RECOMMENDATION": 55, - "RESOURCE_TYPE_WORKLOAD_RULE": 56, - "RESOURCE_TYPE_WORKLOAD_RULE_OOM": 57, - "RESOURCE_TYPE_WORKLOAD_RULE_CPU_THROTTLE": 58, - "RESOURCE_TYPE_CNPG_CLUSTER": 59, - "RESOURCE_TYPE_CLUSTER_SNAPSHOT": 77, + "RESOURCE_TYPE_UNSPECIFIED": 0, + "RESOURCE_TYPE_NODE": 1, + "RESOURCE_TYPE_POD": 2, + "RESOURCE_TYPE_NAMESPACE": 3, + "RESOURCE_TYPE_EVENT": 4, + "RESOURCE_TYPE_ENDPOINTS": 5, + "RESOURCE_TYPE_SERVICE_ACCOUNT": 6, + "RESOURCE_TYPE_LIMIT_RANGE": 7, + "RESOURCE_TYPE_RESOURCE_QUOTA": 8, + "RESOURCE_TYPE_DEPLOYMENT": 9, + "RESOURCE_TYPE_STATEFUL_SET": 10, + "RESOURCE_TYPE_DAEMON_SET": 11, + "RESOURCE_TYPE_REPLICA_SET": 12, + "RESOURCE_TYPE_REPLICATION_CONTROLLER": 13, + "RESOURCE_TYPE_JOB": 14, + "RESOURCE_TYPE_CRON_JOB": 15, + "RESOURCE_TYPE_PERSISTENT_VOLUME_CLAIM": 16, + "RESOURCE_TYPE_PERSISTENT_VOLUME": 17, + "RESOURCE_TYPE_STORAGE_CLASS": 18, + "RESOURCE_TYPE_SERVICE": 19, + "RESOURCE_TYPE_INGRESS": 20, + "RESOURCE_TYPE_INGRESS_CLASS": 21, + "RESOURCE_TYPE_NETWORK_POLICY": 22, + "RESOURCE_TYPE_ROLE": 23, + "RESOURCE_TYPE_ROLE_BINDING": 24, + "RESOURCE_TYPE_CLUSTER_ROLE": 25, + "RESOURCE_TYPE_CLUSTER_ROLE_BINDING": 26, + "RESOURCE_TYPE_HORIZONTAL_POD_AUTOSCALER": 27, + "RESOURCE_TYPE_VERTICAL_POD_AUTOSCALER": 28, + "RESOURCE_TYPE_POD_DISRUPTION_BUDGET": 29, + "RESOURCE_TYPE_POD_SECURITY_POLICY": 30, + "RESOURCE_TYPE_CUSTOM_RESOURCE_DEFINITION": 31, + "RESOURCE_TYPE_CUSTOM_RESOURCE": 32, + "RESOURCE_TYPE_CONFIG_MAP": 33, + "RESOURCE_TYPE_SECRET": 34, + "RESOURCE_TYPE_CONTAINER": 35, + "RESOURCE_TYPE_NODE_RESOURCE": 36, + "RESOURCE_TYPE_CONTAINER_RESOURCE": 37, + "RESOURCE_TYPE_CLUSTER": 38, + "RESOURCE_TYPE_CSI_NODE": 39, + "RESOURCE_TYPE_KARPENTER": 40, + "RESOURCE_TYPE_DATADOG": 41, + "RESOURCE_TYPE_ARGO_ROLLOUTS": 42, + "RESOURCE_TYPE_KEDA": 43, + "RESOURCE_TYPE_KEDA_SCALED_OBJECT": 44, + "RESOURCE_TYPE_KEDA_SCALED_JOB": 45, + "RESOURCE_TYPE_CSI_DRIVER": 46, + "RESOURCE_TYPE_CSI_STORAGE_CAPACITY": 47, + "RESOURCE_TYPE_VOLUME_ATTACHMENT": 48, + "RESOURCE_TYPE_KUBEFLOW_NOTEBOOK": 49, + "RESOURCE_TYPE_VOLCANO_JOB": 50, + "RESOURCE_TYPE_SPARK_APPLICATION": 51, + "RESOURCE_TYPE_SCHEDULED_SPARK_APPLICATION": 52, + "RESOURCE_TYPE_CRON_VOLCANO_JOB": 53, + "RESOURCE_TYPE_PVC_METRICS": 54, + "RESOURCE_TYPE_WORKLOAD_RECOMMENDATION": 55, + "RESOURCE_TYPE_WORKLOAD_RULE": 56, + "RESOURCE_TYPE_WORKLOAD_RULE_OOM": 57, + "RESOURCE_TYPE_WORKLOAD_RULE_CPU_THROTTLE": 58, + "RESOURCE_TYPE_CNPG_CLUSTER": 59, + "RESOURCE_TYPE_CONTAINER_OOM_EVENT": 60, + "RESOURCE_TYPE_CONTAINER_CRASHLOOP_EVENT": 61, + "RESOURCE_TYPE_CONTAINER_STARTUP_LIFECYCLE": 62, + "RESOURCE_TYPE_CONTAINER_CPU_THROTTLE_EVENT": 63, + "RESOURCE_TYPE_CLUSTER_SNAPSHOT": 77, } ) @@ -3562,7 +3575,7 @@ var file_api_v1_metrics_collector_proto_rawDesc = []byte{ 0x45, 0x5f, 0x43, 0x4f, 0x4e, 0x54, 0x41, 0x49, 0x4e, 0x45, 0x52, 0x5f, 0x52, 0x45, 0x53, 0x54, 0x41, 0x52, 0x54, 0x45, 0x44, 0x10, 0x08, 0x12, 0x1f, 0x0a, 0x1b, 0x45, 0x56, 0x45, 0x4e, 0x54, 0x5f, 0x54, 0x59, 0x50, 0x45, 0x5f, 0x43, 0x4c, 0x55, 0x53, 0x54, 0x45, 0x52, 0x5f, 0x53, 0x4e, - 0x41, 0x50, 0x53, 0x48, 0x4f, 0x54, 0x10, 0x09, 0x2a, 0x84, 0x10, 0x0a, 0x0c, 0x52, 0x65, 0x73, + 0x41, 0x50, 0x53, 0x48, 0x4f, 0x54, 0x10, 0x09, 0x2a, 0xb7, 0x11, 0x0a, 0x0c, 0x52, 0x65, 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, 0x54, 0x79, 0x70, 0x65, 0x12, 0x1d, 0x0a, 0x19, 0x52, 0x45, 0x53, 0x4f, 0x55, 0x52, 0x43, 0x45, 0x5f, 0x54, 0x59, 0x50, 0x45, 0x5f, 0x55, 0x4e, 0x53, 0x50, 0x45, 0x43, 0x49, 0x46, 0x49, 0x45, 0x44, 0x10, 0x00, 0x12, 0x16, 0x0a, 0x12, 0x52, 0x45, 0x53, 0x4f, @@ -3688,70 +3701,81 @@ var file_api_v1_metrics_collector_proto_rawDesc = []byte{ 0x4f, 0x52, 0x4b, 0x4c, 0x4f, 0x41, 0x44, 0x5f, 0x52, 0x55, 0x4c, 0x45, 0x5f, 0x43, 0x50, 0x55, 0x5f, 0x54, 0x48, 0x52, 0x4f, 0x54, 0x54, 0x4c, 0x45, 0x10, 0x3a, 0x12, 0x1e, 0x0a, 0x1a, 0x52, 0x45, 0x53, 0x4f, 0x55, 0x52, 0x43, 0x45, 0x5f, 0x54, 0x59, 0x50, 0x45, 0x5f, 0x43, 0x4e, 0x50, - 0x47, 0x5f, 0x43, 0x4c, 0x55, 0x53, 0x54, 0x45, 0x52, 0x10, 0x3b, 0x12, 0x22, 0x0a, 0x1e, 0x52, - 0x45, 0x53, 0x4f, 0x55, 0x52, 0x43, 0x45, 0x5f, 0x54, 0x59, 0x50, 0x45, 0x5f, 0x43, 0x4c, 0x55, - 0x53, 0x54, 0x45, 0x52, 0x5f, 0x53, 0x4e, 0x41, 0x50, 0x53, 0x48, 0x4f, 0x54, 0x10, 0x4d, 0x2a, - 0x8c, 0x01, 0x0a, 0x08, 0x4c, 0x6f, 0x67, 0x4c, 0x65, 0x76, 0x65, 0x6c, 0x12, 0x19, 0x0a, 0x15, - 0x4c, 0x4f, 0x47, 0x5f, 0x4c, 0x45, 0x56, 0x45, 0x4c, 0x5f, 0x55, 0x4e, 0x53, 0x50, 0x45, 0x43, - 0x49, 0x46, 0x49, 0x45, 0x44, 0x10, 0x00, 0x12, 0x13, 0x0a, 0x0f, 0x4c, 0x4f, 0x47, 0x5f, 0x4c, - 0x45, 0x56, 0x45, 0x4c, 0x5f, 0x44, 0x45, 0x42, 0x55, 0x47, 0x10, 0x01, 0x12, 0x12, 0x0a, 0x0e, - 0x4c, 0x4f, 0x47, 0x5f, 0x4c, 0x45, 0x56, 0x45, 0x4c, 0x5f, 0x49, 0x4e, 0x46, 0x4f, 0x10, 0x02, - 0x12, 0x12, 0x0a, 0x0e, 0x4c, 0x4f, 0x47, 0x5f, 0x4c, 0x45, 0x56, 0x45, 0x4c, 0x5f, 0x57, 0x41, - 0x52, 0x4e, 0x10, 0x03, 0x12, 0x13, 0x0a, 0x0f, 0x4c, 0x4f, 0x47, 0x5f, 0x4c, 0x45, 0x56, 0x45, - 0x4c, 0x5f, 0x45, 0x52, 0x52, 0x4f, 0x52, 0x10, 0x04, 0x12, 0x13, 0x0a, 0x0f, 0x4c, 0x4f, 0x47, - 0x5f, 0x4c, 0x45, 0x56, 0x45, 0x4c, 0x5f, 0x46, 0x41, 0x54, 0x41, 0x4c, 0x10, 0x05, 0x32, 0xa0, - 0x05, 0x0a, 0x17, 0x4d, 0x65, 0x74, 0x72, 0x69, 0x63, 0x73, 0x43, 0x6f, 0x6c, 0x6c, 0x65, 0x63, - 0x74, 0x6f, 0x72, 0x53, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x12, 0x49, 0x0a, 0x0c, 0x53, 0x65, - 0x6e, 0x64, 0x52, 0x65, 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, 0x12, 0x1b, 0x2e, 0x61, 0x70, 0x69, + 0x47, 0x5f, 0x43, 0x4c, 0x55, 0x53, 0x54, 0x45, 0x52, 0x10, 0x3b, 0x12, 0x25, 0x0a, 0x21, 0x52, + 0x45, 0x53, 0x4f, 0x55, 0x52, 0x43, 0x45, 0x5f, 0x54, 0x59, 0x50, 0x45, 0x5f, 0x43, 0x4f, 0x4e, + 0x54, 0x41, 0x49, 0x4e, 0x45, 0x52, 0x5f, 0x4f, 0x4f, 0x4d, 0x5f, 0x45, 0x56, 0x45, 0x4e, 0x54, + 0x10, 0x3c, 0x12, 0x2b, 0x0a, 0x27, 0x52, 0x45, 0x53, 0x4f, 0x55, 0x52, 0x43, 0x45, 0x5f, 0x54, + 0x59, 0x50, 0x45, 0x5f, 0x43, 0x4f, 0x4e, 0x54, 0x41, 0x49, 0x4e, 0x45, 0x52, 0x5f, 0x43, 0x52, + 0x41, 0x53, 0x48, 0x4c, 0x4f, 0x4f, 0x50, 0x5f, 0x45, 0x56, 0x45, 0x4e, 0x54, 0x10, 0x3d, 0x12, + 0x2d, 0x0a, 0x29, 0x52, 0x45, 0x53, 0x4f, 0x55, 0x52, 0x43, 0x45, 0x5f, 0x54, 0x59, 0x50, 0x45, + 0x5f, 0x43, 0x4f, 0x4e, 0x54, 0x41, 0x49, 0x4e, 0x45, 0x52, 0x5f, 0x53, 0x54, 0x41, 0x52, 0x54, + 0x55, 0x50, 0x5f, 0x4c, 0x49, 0x46, 0x45, 0x43, 0x59, 0x43, 0x4c, 0x45, 0x10, 0x3e, 0x12, 0x2e, + 0x0a, 0x2a, 0x52, 0x45, 0x53, 0x4f, 0x55, 0x52, 0x43, 0x45, 0x5f, 0x54, 0x59, 0x50, 0x45, 0x5f, + 0x43, 0x4f, 0x4e, 0x54, 0x41, 0x49, 0x4e, 0x45, 0x52, 0x5f, 0x43, 0x50, 0x55, 0x5f, 0x54, 0x48, + 0x52, 0x4f, 0x54, 0x54, 0x4c, 0x45, 0x5f, 0x45, 0x56, 0x45, 0x4e, 0x54, 0x10, 0x3f, 0x12, 0x22, + 0x0a, 0x1e, 0x52, 0x45, 0x53, 0x4f, 0x55, 0x52, 0x43, 0x45, 0x5f, 0x54, 0x59, 0x50, 0x45, 0x5f, + 0x43, 0x4c, 0x55, 0x53, 0x54, 0x45, 0x52, 0x5f, 0x53, 0x4e, 0x41, 0x50, 0x53, 0x48, 0x4f, 0x54, + 0x10, 0x4d, 0x2a, 0x8c, 0x01, 0x0a, 0x08, 0x4c, 0x6f, 0x67, 0x4c, 0x65, 0x76, 0x65, 0x6c, 0x12, + 0x19, 0x0a, 0x15, 0x4c, 0x4f, 0x47, 0x5f, 0x4c, 0x45, 0x56, 0x45, 0x4c, 0x5f, 0x55, 0x4e, 0x53, + 0x50, 0x45, 0x43, 0x49, 0x46, 0x49, 0x45, 0x44, 0x10, 0x00, 0x12, 0x13, 0x0a, 0x0f, 0x4c, 0x4f, + 0x47, 0x5f, 0x4c, 0x45, 0x56, 0x45, 0x4c, 0x5f, 0x44, 0x45, 0x42, 0x55, 0x47, 0x10, 0x01, 0x12, + 0x12, 0x0a, 0x0e, 0x4c, 0x4f, 0x47, 0x5f, 0x4c, 0x45, 0x56, 0x45, 0x4c, 0x5f, 0x49, 0x4e, 0x46, + 0x4f, 0x10, 0x02, 0x12, 0x12, 0x0a, 0x0e, 0x4c, 0x4f, 0x47, 0x5f, 0x4c, 0x45, 0x56, 0x45, 0x4c, + 0x5f, 0x57, 0x41, 0x52, 0x4e, 0x10, 0x03, 0x12, 0x13, 0x0a, 0x0f, 0x4c, 0x4f, 0x47, 0x5f, 0x4c, + 0x45, 0x56, 0x45, 0x4c, 0x5f, 0x45, 0x52, 0x52, 0x4f, 0x52, 0x10, 0x04, 0x12, 0x13, 0x0a, 0x0f, + 0x4c, 0x4f, 0x47, 0x5f, 0x4c, 0x45, 0x56, 0x45, 0x4c, 0x5f, 0x46, 0x41, 0x54, 0x41, 0x4c, 0x10, + 0x05, 0x32, 0xa0, 0x05, 0x0a, 0x17, 0x4d, 0x65, 0x74, 0x72, 0x69, 0x63, 0x73, 0x43, 0x6f, 0x6c, + 0x6c, 0x65, 0x63, 0x74, 0x6f, 0x72, 0x53, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x12, 0x49, 0x0a, + 0x0c, 0x53, 0x65, 0x6e, 0x64, 0x52, 0x65, 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, 0x12, 0x1b, 0x2e, + 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x53, 0x65, 0x6e, 0x64, 0x52, 0x65, 0x73, 0x6f, 0x75, + 0x72, 0x63, 0x65, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x1c, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x53, 0x65, 0x6e, 0x64, 0x52, 0x65, 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, - 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x1c, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, - 0x2e, 0x53, 0x65, 0x6e, 0x64, 0x52, 0x65, 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, 0x52, 0x65, 0x73, - 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x58, 0x0a, 0x11, 0x53, 0x65, 0x6e, 0x64, 0x52, 0x65, 0x73, - 0x6f, 0x75, 0x72, 0x63, 0x65, 0x42, 0x61, 0x74, 0x63, 0x68, 0x12, 0x20, 0x2e, 0x61, 0x70, 0x69, - 0x2e, 0x76, 0x31, 0x2e, 0x53, 0x65, 0x6e, 0x64, 0x52, 0x65, 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, - 0x42, 0x61, 0x74, 0x63, 0x68, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x21, 0x2e, 0x61, - 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x53, 0x65, 0x6e, 0x64, 0x52, 0x65, 0x73, 0x6f, 0x75, 0x72, - 0x63, 0x65, 0x42, 0x61, 0x74, 0x63, 0x68, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, - 0x61, 0x0a, 0x14, 0x53, 0x65, 0x6e, 0x64, 0x54, 0x65, 0x6c, 0x65, 0x6d, 0x65, 0x74, 0x72, 0x79, - 0x4d, 0x65, 0x74, 0x72, 0x69, 0x63, 0x73, 0x12, 0x23, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, - 0x2e, 0x53, 0x65, 0x6e, 0x64, 0x54, 0x65, 0x6c, 0x65, 0x6d, 0x65, 0x74, 0x72, 0x79, 0x4d, 0x65, - 0x74, 0x72, 0x69, 0x63, 0x73, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x24, 0x2e, 0x61, - 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x53, 0x65, 0x6e, 0x64, 0x54, 0x65, 0x6c, 0x65, 0x6d, 0x65, - 0x74, 0x72, 0x79, 0x4d, 0x65, 0x74, 0x72, 0x69, 0x63, 0x73, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, - 0x73, 0x65, 0x12, 0x66, 0x0a, 0x19, 0x53, 0x65, 0x6e, 0x64, 0x43, 0x6c, 0x75, 0x73, 0x74, 0x65, - 0x72, 0x53, 0x6e, 0x61, 0x70, 0x73, 0x68, 0x6f, 0x74, 0x53, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x12, - 0x1c, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x43, 0x6c, 0x75, 0x73, 0x74, 0x65, 0x72, - 0x53, 0x6e, 0x61, 0x70, 0x73, 0x68, 0x6f, 0x74, 0x43, 0x68, 0x75, 0x6e, 0x6b, 0x1a, 0x29, 0x2e, - 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x53, 0x65, 0x6e, 0x64, 0x43, 0x6c, 0x75, 0x73, 0x74, - 0x65, 0x72, 0x53, 0x6e, 0x61, 0x70, 0x73, 0x68, 0x6f, 0x74, 0x53, 0x74, 0x72, 0x65, 0x61, 0x6d, - 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x28, 0x01, 0x12, 0x58, 0x0a, 0x11, 0x53, 0x65, - 0x6e, 0x64, 0x54, 0x65, 0x6c, 0x65, 0x6d, 0x65, 0x74, 0x72, 0x79, 0x4c, 0x6f, 0x67, 0x73, 0x12, - 0x20, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x53, 0x65, 0x6e, 0x64, 0x54, 0x65, 0x6c, - 0x65, 0x6d, 0x65, 0x74, 0x72, 0x79, 0x4c, 0x6f, 0x67, 0x73, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, - 0x74, 0x1a, 0x21, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x53, 0x65, 0x6e, 0x64, 0x54, - 0x65, 0x6c, 0x65, 0x6d, 0x65, 0x74, 0x72, 0x79, 0x4c, 0x6f, 0x67, 0x73, 0x52, 0x65, 0x73, 0x70, - 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x70, 0x0a, 0x19, 0x53, 0x65, 0x6e, 0x64, 0x4e, 0x65, 0x74, 0x77, - 0x6f, 0x72, 0x6b, 0x54, 0x72, 0x61, 0x66, 0x66, 0x69, 0x63, 0x4d, 0x65, 0x74, 0x72, 0x69, 0x63, - 0x73, 0x12, 0x28, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x53, 0x65, 0x6e, 0x64, 0x4e, + 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x58, 0x0a, 0x11, 0x53, 0x65, 0x6e, 0x64, + 0x52, 0x65, 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, 0x42, 0x61, 0x74, 0x63, 0x68, 0x12, 0x20, 0x2e, + 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x53, 0x65, 0x6e, 0x64, 0x52, 0x65, 0x73, 0x6f, 0x75, + 0x72, 0x63, 0x65, 0x42, 0x61, 0x74, 0x63, 0x68, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, + 0x21, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x53, 0x65, 0x6e, 0x64, 0x52, 0x65, 0x73, + 0x6f, 0x75, 0x72, 0x63, 0x65, 0x42, 0x61, 0x74, 0x63, 0x68, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, + 0x73, 0x65, 0x12, 0x61, 0x0a, 0x14, 0x53, 0x65, 0x6e, 0x64, 0x54, 0x65, 0x6c, 0x65, 0x6d, 0x65, + 0x74, 0x72, 0x79, 0x4d, 0x65, 0x74, 0x72, 0x69, 0x63, 0x73, 0x12, 0x23, 0x2e, 0x61, 0x70, 0x69, + 0x2e, 0x76, 0x31, 0x2e, 0x53, 0x65, 0x6e, 0x64, 0x54, 0x65, 0x6c, 0x65, 0x6d, 0x65, 0x74, 0x72, + 0x79, 0x4d, 0x65, 0x74, 0x72, 0x69, 0x63, 0x73, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, + 0x24, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x53, 0x65, 0x6e, 0x64, 0x54, 0x65, 0x6c, + 0x65, 0x6d, 0x65, 0x74, 0x72, 0x79, 0x4d, 0x65, 0x74, 0x72, 0x69, 0x63, 0x73, 0x52, 0x65, 0x73, + 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x66, 0x0a, 0x19, 0x53, 0x65, 0x6e, 0x64, 0x43, 0x6c, 0x75, + 0x73, 0x74, 0x65, 0x72, 0x53, 0x6e, 0x61, 0x70, 0x73, 0x68, 0x6f, 0x74, 0x53, 0x74, 0x72, 0x65, + 0x61, 0x6d, 0x12, 0x1c, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x43, 0x6c, 0x75, 0x73, + 0x74, 0x65, 0x72, 0x53, 0x6e, 0x61, 0x70, 0x73, 0x68, 0x6f, 0x74, 0x43, 0x68, 0x75, 0x6e, 0x6b, + 0x1a, 0x29, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x53, 0x65, 0x6e, 0x64, 0x43, 0x6c, + 0x75, 0x73, 0x74, 0x65, 0x72, 0x53, 0x6e, 0x61, 0x70, 0x73, 0x68, 0x6f, 0x74, 0x53, 0x74, 0x72, + 0x65, 0x61, 0x6d, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x28, 0x01, 0x12, 0x58, 0x0a, + 0x11, 0x53, 0x65, 0x6e, 0x64, 0x54, 0x65, 0x6c, 0x65, 0x6d, 0x65, 0x74, 0x72, 0x79, 0x4c, 0x6f, + 0x67, 0x73, 0x12, 0x20, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x53, 0x65, 0x6e, 0x64, + 0x54, 0x65, 0x6c, 0x65, 0x6d, 0x65, 0x74, 0x72, 0x79, 0x4c, 0x6f, 0x67, 0x73, 0x52, 0x65, 0x71, + 0x75, 0x65, 0x73, 0x74, 0x1a, 0x21, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x53, 0x65, + 0x6e, 0x64, 0x54, 0x65, 0x6c, 0x65, 0x6d, 0x65, 0x74, 0x72, 0x79, 0x4c, 0x6f, 0x67, 0x73, 0x52, + 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x70, 0x0a, 0x19, 0x53, 0x65, 0x6e, 0x64, 0x4e, 0x65, 0x74, 0x77, 0x6f, 0x72, 0x6b, 0x54, 0x72, 0x61, 0x66, 0x66, 0x69, 0x63, 0x4d, 0x65, 0x74, - 0x72, 0x69, 0x63, 0x73, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x29, 0x2e, 0x61, 0x70, - 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x53, 0x65, 0x6e, 0x64, 0x4e, 0x65, 0x74, 0x77, 0x6f, 0x72, 0x6b, - 0x54, 0x72, 0x61, 0x66, 0x66, 0x69, 0x63, 0x4d, 0x65, 0x74, 0x72, 0x69, 0x63, 0x73, 0x52, 0x65, - 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x49, 0x0a, 0x0c, 0x4e, 0x6f, 0x64, 0x65, 0x4d, 0x65, - 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x12, 0x1b, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, - 0x4e, 0x6f, 0x64, 0x65, 0x4d, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x52, 0x65, 0x71, 0x75, - 0x65, 0x73, 0x74, 0x1a, 0x1c, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x4e, 0x6f, 0x64, - 0x65, 0x4d, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, - 0x65, 0x42, 0x8e, 0x01, 0x0a, 0x0a, 0x63, 0x6f, 0x6d, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, - 0x42, 0x15, 0x4d, 0x65, 0x74, 0x72, 0x69, 0x63, 0x73, 0x43, 0x6f, 0x6c, 0x6c, 0x65, 0x63, 0x74, - 0x6f, 0x72, 0x50, 0x72, 0x6f, 0x74, 0x6f, 0x50, 0x01, 0x5a, 0x30, 0x67, 0x69, 0x74, 0x68, 0x75, - 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x64, 0x65, 0x76, 0x7a, 0x65, 0x72, 0x6f, 0x2d, 0x69, 0x6e, - 0x63, 0x2f, 0x7a, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x65, 0x72, 0x2f, 0x67, 0x65, 0x6e, 0x2f, 0x61, - 0x70, 0x69, 0x2f, 0x76, 0x31, 0x3b, 0x61, 0x70, 0x69, 0x76, 0x31, 0xa2, 0x02, 0x03, 0x41, 0x58, - 0x58, 0xaa, 0x02, 0x06, 0x41, 0x70, 0x69, 0x2e, 0x56, 0x31, 0xca, 0x02, 0x06, 0x41, 0x70, 0x69, - 0x5c, 0x56, 0x31, 0xe2, 0x02, 0x12, 0x41, 0x70, 0x69, 0x5c, 0x56, 0x31, 0x5c, 0x47, 0x50, 0x42, - 0x4d, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0xea, 0x02, 0x07, 0x41, 0x70, 0x69, 0x3a, 0x3a, - 0x56, 0x31, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, + 0x72, 0x69, 0x63, 0x73, 0x12, 0x28, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x53, 0x65, + 0x6e, 0x64, 0x4e, 0x65, 0x74, 0x77, 0x6f, 0x72, 0x6b, 0x54, 0x72, 0x61, 0x66, 0x66, 0x69, 0x63, + 0x4d, 0x65, 0x74, 0x72, 0x69, 0x63, 0x73, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x29, + 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x53, 0x65, 0x6e, 0x64, 0x4e, 0x65, 0x74, 0x77, + 0x6f, 0x72, 0x6b, 0x54, 0x72, 0x61, 0x66, 0x66, 0x69, 0x63, 0x4d, 0x65, 0x74, 0x72, 0x69, 0x63, + 0x73, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x49, 0x0a, 0x0c, 0x4e, 0x6f, 0x64, + 0x65, 0x4d, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x12, 0x1b, 0x2e, 0x61, 0x70, 0x69, 0x2e, + 0x76, 0x31, 0x2e, 0x4e, 0x6f, 0x64, 0x65, 0x4d, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x52, + 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x1c, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, + 0x4e, 0x6f, 0x64, 0x65, 0x4d, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x52, 0x65, 0x73, 0x70, + 0x6f, 0x6e, 0x73, 0x65, 0x42, 0x8e, 0x01, 0x0a, 0x0a, 0x63, 0x6f, 0x6d, 0x2e, 0x61, 0x70, 0x69, + 0x2e, 0x76, 0x31, 0x42, 0x15, 0x4d, 0x65, 0x74, 0x72, 0x69, 0x63, 0x73, 0x43, 0x6f, 0x6c, 0x6c, + 0x65, 0x63, 0x74, 0x6f, 0x72, 0x50, 0x72, 0x6f, 0x74, 0x6f, 0x50, 0x01, 0x5a, 0x30, 0x67, 0x69, + 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x64, 0x65, 0x76, 0x7a, 0x65, 0x72, 0x6f, + 0x2d, 0x69, 0x6e, 0x63, 0x2f, 0x7a, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x65, 0x72, 0x2f, 0x67, 0x65, + 0x6e, 0x2f, 0x61, 0x70, 0x69, 0x2f, 0x76, 0x31, 0x3b, 0x61, 0x70, 0x69, 0x76, 0x31, 0xa2, 0x02, + 0x03, 0x41, 0x58, 0x58, 0xaa, 0x02, 0x06, 0x41, 0x70, 0x69, 0x2e, 0x56, 0x31, 0xca, 0x02, 0x06, + 0x41, 0x70, 0x69, 0x5c, 0x56, 0x31, 0xe2, 0x02, 0x12, 0x41, 0x70, 0x69, 0x5c, 0x56, 0x31, 0x5c, + 0x47, 0x50, 0x42, 0x4d, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0xea, 0x02, 0x07, 0x41, 0x70, + 0x69, 0x3a, 0x3a, 0x56, 0x31, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, } var ( diff --git a/internal/collector/container_resource_collector.go b/internal/collector/container_resource_collector.go index 64e2a046..d41c35df 100644 --- a/internal/collector/container_resource_collector.go +++ b/internal/collector/container_resource_collector.go @@ -55,6 +55,12 @@ type gpuQueryState struct { lastFailed bool } +// throttleTracker tracks last emission time for CPU throttle events per container to avoid duplicates. +type throttleTracker struct { + lastEmitted map[string]time.Time // key: "ns/pod/container" → last emit time + mu sync.Mutex +} + // ContainerResourceCollector collects container resource usage metrics type ContainerResourceCollector struct { k8sClient kubernetes.Interface @@ -76,6 +82,7 @@ type ContainerResourceCollector struct { mu sync.RWMutex gpuQueryErrorState map[string]*gpuQueryState // most of the case we are not deploying zxporter to GPU nodes which cause GPU query to fail infinitely, and we dont want to get that GPU query fails error every minute for every container gpuQueryMu sync.Mutex + throttle throttleTracker // Removed manual cache rsLister appslisters.ReplicaSetLister rsInformer cache.SharedIndexInformer @@ -145,6 +152,7 @@ func NewContainerResourceCollector( metrics: metrics, telemetryLogger: telemetryLogger, gpuQueryErrorState: make(map[string]*gpuQueryState), + throttle: throttleTracker{lastEmitted: make(map[string]time.Time)}, } } @@ -381,6 +389,11 @@ func (c *ContainerResourceCollector) collectAllContainerResources(ctx context.Co throttleFraction = 0 } + // Emit CPU throttle event if fraction exceeds threshold + if throttleFraction > 0.1 { + c.emitCPUThrottleEvent(pod, containerMetrics, throttleFraction) + } + // Fetch I/O metrics for this container if !c.config.DisableNetworkIOMetrics { ioMetrics, err = c.collectContainerIOMetrics(queryCtx, pod, containerMetrics.Name) @@ -776,6 +789,68 @@ func (c *ContainerResourceCollector) collectContainerCPUThrottleMetrics(ctx cont return fraction, nil } +// emitCPUThrottleEvent sends a CPU throttle event through the batch channel with 5-minute deduplication. +func (c *ContainerResourceCollector) emitCPUThrottleEvent(pod *corev1.Pod, containerMetrics metricsv1beta1.ContainerMetrics, throttleFraction float64) { + dedupKey := fmt.Sprintf("%s/%s/%s", pod.Namespace, pod.Name, containerMetrics.Name) + + c.throttle.mu.Lock() + if lastEmit, ok := c.throttle.lastEmitted[dedupKey]; ok && time.Since(lastEmit) < 5*time.Minute { + c.throttle.mu.Unlock() + return + } + c.throttle.lastEmitted[dedupKey] = time.Now() + c.throttle.mu.Unlock() + + // Resolve workload info + workloadKind, workloadName := c.resolveWorkload(pod) + if workloadKind == "" { + workloadKind = "Pod" + workloadName = pod.Name + } + + // Get CPU resources from container spec + var cpuUsageMillis, cpuRequestMillis, cpuLimitMillis int64 + cpuUsageMillis = containerMetrics.Usage.Cpu().MilliValue() + for i := range pod.Spec.Containers { + if pod.Spec.Containers[i].Name == containerMetrics.Name { + if pod.Spec.Containers[i].Resources.Requests != nil { + if cpu := pod.Spec.Containers[i].Resources.Requests.Cpu(); cpu != nil { + cpuRequestMillis = cpu.MilliValue() + } + } + if pod.Spec.Containers[i].Resources.Limits != nil { + if cpu := pod.Spec.Containers[i].Resources.Limits.Cpu(); cpu != nil { + cpuLimitMillis = cpu.MilliValue() + } + } + break + } + } + + // Round timestamp to nearest minute for DB dedup + now := time.Now() + roundedTS := now.Truncate(time.Minute) + + c.batchChan <- CollectedResource{ + ResourceType: ContainerCPUThrottleEvent, + Object: map[string]interface{}{ + "namespace": pod.Namespace, + "workload_name": workloadName, + "workload_kind": workloadKind, + "pod_name": pod.Name, + "container_name": containerMetrics.Name, + "cpu_usage_millicores": cpuUsageMillis, + "cpu_request_millicores": cpuRequestMillis, + "cpu_limit_millicores": cpuLimitMillis, + "throttled_fraction": throttleFraction, + "timestamp": roundedTS.Format(time.RFC3339Nano), + }, + Timestamp: now, + EventType: EventTypeAdd, + Key: fmt.Sprintf("cpu-throttle/%s/%s/%s", pod.Namespace, pod.Name, containerMetrics.Name), + } +} + // collectContainerGPUMetrics collects GPU metrics for a container using Prometheus queries func (c *ContainerResourceCollector) collectContainerGPUMetrics(ctx context.Context, pod *corev1.Pod, containerName string) (map[string]interface{}, error) { metrics := make(map[string]interface{}) diff --git a/internal/collector/interface.go b/internal/collector/interface.go index 793c8028..320eb0e2 100644 --- a/internal/collector/interface.go +++ b/internal/collector/interface.go @@ -146,6 +146,10 @@ const ( WorkloadRecommendation WorkloadRule CNPGCluster + ContainerOOMEvent + ContainerCrashLoopEvent + ContainerStartupLifecycle + ContainerCPUThrottleEvent ) // String returns the string representation of the ResourceType @@ -206,6 +210,10 @@ func (r ResourceType) String() string { WorkloadRecommendation: "workload_recommendation", WorkloadRule: "workload_rule", CNPGCluster: "cnpg_cluster", + ContainerOOMEvent: "container_oom_event", + ContainerCrashLoopEvent: "container_crashloop_event", + ContainerStartupLifecycle: "container_startup_lifecycle", + ContainerCPUThrottleEvent: "container_cpu_throttle_event", } if name, ok := names[r]; ok { @@ -327,6 +335,14 @@ func (r ResourceType) ProtoType() gen.ResourceType { return gen.ResourceType_RESOURCE_TYPE_WORKLOAD_RULE case CNPGCluster: return gen.ResourceType_RESOURCE_TYPE_CNPG_CLUSTER + case ContainerOOMEvent: + return gen.ResourceType_RESOURCE_TYPE_CONTAINER_OOM_EVENT + case ContainerCrashLoopEvent: + return gen.ResourceType_RESOURCE_TYPE_CONTAINER_CRASHLOOP_EVENT + case ContainerStartupLifecycle: + return gen.ResourceType_RESOURCE_TYPE_CONTAINER_STARTUP_LIFECYCLE + case ContainerCPUThrottleEvent: + return gen.ResourceType_RESOURCE_TYPE_CONTAINER_CPU_THROTTLE_EVENT default: return gen.ResourceType_RESOURCE_TYPE_UNSPECIFIED } diff --git a/internal/collector/pod_collector.go b/internal/collector/pod_collector.go index 3429de58..e2475681 100644 --- a/internal/collector/pod_collector.go +++ b/internal/collector/pod_collector.go @@ -19,6 +19,30 @@ import ( "k8s.io/client-go/tools/cache" ) +// startupLifecycleKey uniquely identifies a container startup attempt. +type startupLifecycleKey struct { + podUID types.UID + containerName string + restartCount int32 +} + +// startupLifecycleEntry tracks phase transition timestamps for a single container startup. +type startupLifecycleEntry struct { + namespace string + workloadName string + workloadKind string + podName string + containerName string + restartCount int32 + isRestart bool + pendingAt *time.Time + creatingAt *time.Time + runningAt *time.Time + lastSeen time.Time +} + +const startupTrackerTTL = 30 * time.Minute + // PodCollector watches for pod events and collects pod data type PodCollector struct { client kubernetes.Interface @@ -33,6 +57,10 @@ type PodCollector struct { logger logr.Logger telemetryLogger telemetry_logger.Logger mu sync.RWMutex + + // Startup lifecycle tracking + startupTracker map[startupLifecycleKey]*startupLifecycleEntry + startupTrackerMu sync.Mutex } // NewPodCollector creates a new collector for pod resources @@ -77,6 +105,7 @@ func NewPodCollector( excludedPods: excludedPodsMap, logger: logger.WithName("pod-collector"), telemetryLogger: telemetryLogger, + startupTracker: make(map[startupLifecycleKey]*startupLifecycleEntry), } } @@ -186,6 +215,12 @@ func (c *PodCollector) handlePodEvent(pod *corev1.Pod, eventType EventType) { EventType: eventType, Key: fmt.Sprintf("%s/%s", pod.Namespace, pod.Name), } + + // On ADD (initial sync), emit startup lifecycle snapshots for running containers. + // This ensures lifecycle data is captured even if zxporter restarts. + if eventType == EventTypeAdd { + c.snapshotStartupLifecycles(pod) + } } // handlePodUpdate processes pod update events with special handling for container status changes @@ -197,8 +232,11 @@ func (c *PodCollector) handlePodUpdate(oldPod, newPod *corev1.Pod) { // Send the basic pod update c.handlePodEvent(newPod, EventTypeUpdate) - // Check for container events like OOMKilled + // Check for container events like OOMKilled and CrashLoopBackOff c.checkForContainerEvents(oldPod, newPod) + + // Track startup lifecycle phase transitions + c.trackStartupLifecycle(oldPod, newPod) } // checkForContainerEvents checks for container-specific events like OOMKilled @@ -262,10 +300,21 @@ func (c *PodCollector) checkForContainerEvents(oldPod, newPod *corev1.Pod) { }, ) } + + // Emit structured OOM event for direct path + c.emitContainerOOMEvent(newPod, newStatus) } c.sendContainerEvent(newPod, newStatus.Name, EventTypeContainerRestarted, &newStatus) } + + // Check for CrashLoopBackOff + if newStatus.State.Waiting != nil && newStatus.State.Waiting.Reason == "CrashLoopBackOff" { + // Only emit if this is a new CrashLoop state (wasn't in CrashLoop before) + if !exists || oldStatus.State.Waiting == nil || oldStatus.State.Waiting.Reason != "CrashLoopBackOff" { + c.emitContainerCrashLoopEvent(newPod, newStatus) + } + } } } @@ -370,3 +419,326 @@ func (c *PodCollector) AddResource(resource interface{}) error { c.handlePodEvent(pod, EventTypeAdd) return nil } + +// getWorkloadInfo extracts the top-level workload name and kind from a pod's owner references. +// For pods owned by ReplicaSets (which are owned by Deployments), it returns the Deployment info. +func getWorkloadInfo(pod *corev1.Pod) (name, kind string) { + if len(pod.OwnerReferences) == 0 { + return pod.Name, "Pod" + } + + owner := pod.OwnerReferences[0] + switch owner.Kind { + case "ReplicaSet": + // ReplicaSets created by Deployments have names like "-" + // Strip the hash suffix to get the Deployment name + rsName := owner.Name + if idx := strings.LastIndex(rsName, "-"); idx > 0 { + return rsName[:idx], "Deployment" + } + return rsName, "ReplicaSet" + case "StatefulSet": + return owner.Name, "StatefulSet" + case "DaemonSet": + return owner.Name, "DaemonSet" + case "Job": + return owner.Name, "Job" + default: + return owner.Name, owner.Kind + } +} + +// getContainerResources returns the memory request, limit, and usage for a container. +func getContainerResources(pod *corev1.Pod, containerName string) (requestBytes, limitBytes int64) { + for _, c := range pod.Spec.Containers { + if c.Name == containerName { + if req, ok := c.Resources.Requests[corev1.ResourceMemory]; ok { + requestBytes = req.Value() + } + if lim, ok := c.Resources.Limits[corev1.ResourceMemory]; ok { + limitBytes = lim.Value() + } + return + } + } + return 0, 0 +} + +// emitContainerOOMEvent sends a structured OOM event through the batch channel. +func (c *PodCollector) emitContainerOOMEvent(pod *corev1.Pod, status corev1.ContainerStatus) { + workloadName, workloadKind := getWorkloadInfo(pod) + requestBytes, limitBytes := getContainerResources(pod, status.Name) + + var exitCode int32 + var usageBytes int64 + if status.LastTerminationState.Terminated != nil { + exitCode = status.LastTerminationState.Terminated.ExitCode + } + // Memory usage at OOM time is not directly available from status; + // use the limit as an approximation (OOM means usage >= limit) + usageBytes = limitBytes + + c.batchChan <- CollectedResource{ + ResourceType: ContainerOOMEvent, + Object: map[string]interface{}{ + "namespace": pod.Namespace, + "workload_name": workloadName, + "workload_kind": workloadKind, + "pod_name": pod.Name, + "container_name": status.Name, + "memory_usage_bytes": usageBytes, + "memory_request_bytes": requestBytes, + "memory_limit_bytes": limitBytes, + "restart_count": status.RestartCount, + "exit_code": exitCode, + "timestamp": time.Now().Format(time.RFC3339Nano), + }, + Timestamp: time.Now(), + EventType: EventTypeAdd, + Key: fmt.Sprintf("oom/%s/%s/%s", pod.Namespace, pod.Name, status.Name), + } +} + +// emitContainerCrashLoopEvent sends a structured CrashLoopBackOff event through the batch channel. +func (c *PodCollector) emitContainerCrashLoopEvent(pod *corev1.Pod, status corev1.ContainerStatus) { + workloadName, workloadKind := getWorkloadInfo(pod) + + var lastTerminationReason string + var exitCode int32 + var isOOMRelated bool + if status.LastTerminationState.Terminated != nil { + lastTerminationReason = status.LastTerminationState.Terminated.Reason + exitCode = status.LastTerminationState.Terminated.ExitCode + isOOMRelated = lastTerminationReason == "OOMKilled" + } + + c.logger.Info("Container CrashLoopBackOff detected", + "namespace", pod.Namespace, + "pod", pod.Name, + "container", status.Name, + "restartCount", status.RestartCount, + "isOOMRelated", isOOMRelated) + + c.batchChan <- CollectedResource{ + ResourceType: ContainerCrashLoopEvent, + Object: map[string]interface{}{ + "namespace": pod.Namespace, + "workload_name": workloadName, + "workload_kind": workloadKind, + "pod_name": pod.Name, + "container_name": status.Name, + "restart_count": status.RestartCount, + "last_termination_reason": lastTerminationReason, + "exit_code": exitCode, + "is_oom_related": isOOMRelated, + "timestamp": time.Now().Format(time.RFC3339Nano), + }, + Timestamp: time.Now(), + EventType: EventTypeAdd, + Key: fmt.Sprintf("crashloop/%s/%s/%s", pod.Namespace, pod.Name, status.Name), + } +} + +// trackStartupLifecycle tracks container startup phase transitions and emits lifecycle events. +func (c *PodCollector) trackStartupLifecycle(_, newPod *corev1.Pod) { + now := time.Now() + + // Periodically clean up stale entries + c.cleanupStartupTracker() + + for _, newStatus := range newPod.Status.ContainerStatuses { + key := startupLifecycleKey{ + podUID: newPod.UID, + containerName: newStatus.Name, + restartCount: newStatus.RestartCount, + } + + c.startupTrackerMu.Lock() + entry, exists := c.startupTracker[key] + + if !exists { + // New lifecycle entry — pod is in Pending or later phase + workloadName, workloadKind := getWorkloadInfo(newPod) + entry = &startupLifecycleEntry{ + namespace: newPod.Namespace, + workloadName: workloadName, + workloadKind: workloadKind, + podName: newPod.Name, + containerName: newStatus.Name, + restartCount: newStatus.RestartCount, + isRestart: newStatus.RestartCount > 0, + lastSeen: now, + } + + // Record pending time from pod condition + for _, cond := range newPod.Status.Conditions { + if cond.Type == corev1.PodScheduled && cond.Status == corev1.ConditionTrue { + t := cond.LastTransitionTime.Time + entry.pendingAt = &t + break + } + } + + c.startupTracker[key] = entry + } + + entry.lastSeen = now + + // Track ContainerCreating phase + if newStatus.State.Waiting != nil && newStatus.State.Waiting.Reason == "ContainerCreating" && entry.creatingAt == nil { + entry.creatingAt = &now + } + + // Track Running phase + if newStatus.State.Running != nil && entry.runningAt == nil { + t := newStatus.State.Running.StartedAt.Time + if t.IsZero() { + t = now + } + entry.runningAt = &t + } + + // Track Ready phase — emit the lifecycle event + if newStatus.Ready && entry.runningAt != nil { + // Calculate durations + var timeToRunningMs, timeToReadyMs *int64 + if entry.pendingAt != nil && entry.runningAt != nil { + ms := entry.runningAt.Sub(*entry.pendingAt).Milliseconds() + timeToRunningMs = &ms + } + if entry.pendingAt != nil { + ms := now.Sub(*entry.pendingAt).Milliseconds() + timeToReadyMs = &ms + } + + c.emitStartupLifecycleEvent(entry, &now, timeToRunningMs, timeToReadyMs) + + // Clean up the entry + delete(c.startupTracker, key) + c.startupTrackerMu.Unlock() + continue + } + + c.startupTrackerMu.Unlock() + } +} + +// emitStartupLifecycleEvent sends a completed startup lifecycle event through the batch channel. +func (c *PodCollector) emitStartupLifecycleEvent(entry *startupLifecycleEntry, readyAt *time.Time, timeToRunningMs, timeToReadyMs *int64) { + payload := map[string]interface{}{ + "namespace": entry.namespace, + "workload_name": entry.workloadName, + "workload_kind": entry.workloadKind, + "pod_name": entry.podName, + "container_name": entry.containerName, + "restart_count": entry.restartCount, + "is_restart": entry.isRestart, + "timestamp": time.Now().Format(time.RFC3339Nano), + } + + if entry.pendingAt != nil { + payload["pending_at"] = entry.pendingAt.Format(time.RFC3339Nano) + } + if entry.creatingAt != nil { + payload["container_creating_at"] = entry.creatingAt.Format(time.RFC3339Nano) + } + if entry.runningAt != nil { + payload["running_at"] = entry.runningAt.Format(time.RFC3339Nano) + } + if readyAt != nil { + payload["ready_at"] = readyAt.Format(time.RFC3339Nano) + } + if timeToRunningMs != nil { + payload["time_to_running_ms"] = *timeToRunningMs + } + if timeToReadyMs != nil { + payload["time_to_ready_ms"] = *timeToReadyMs + } + + c.batchChan <- CollectedResource{ + ResourceType: ContainerStartupLifecycle, + Object: payload, + Timestamp: time.Now(), + EventType: EventTypeAdd, + Key: fmt.Sprintf("startup/%s/%s/%s/%d", entry.namespace, entry.podName, entry.containerName, entry.restartCount), + } +} + +// cleanupStartupTracker removes stale entries that never reached Ready state. +func (c *PodCollector) cleanupStartupTracker() { + c.startupTrackerMu.Lock() + defer c.startupTrackerMu.Unlock() + + now := time.Now() + for key, entry := range c.startupTracker { + if now.Sub(entry.lastSeen) > startupTrackerTTL { + delete(c.startupTracker, key) + } + } +} + +// snapshotStartupLifecycles reconstructs and emits startup lifecycle events from a pod's +// current status. Called during initial cache sync (ADD events) so that lifecycle data +// is captured for containers that started before zxporter was running. The DB upsert +// (ON CONFLICT DO NOTHING) ensures no duplicates if the event-driven path already captured it. +func (c *PodCollector) snapshotStartupLifecycles(pod *corev1.Pod) { + // Only emit for pods that have started + if pod.Status.StartTime == nil { + return + } + + // Find the Ready condition for ready_at timestamp + var readyAt *time.Time + for _, cond := range pod.Status.Conditions { + if cond.Type == corev1.ContainersReady && cond.Status == corev1.ConditionTrue { + t := cond.LastTransitionTime.Time + if !t.IsZero() { + readyAt = &t + } + break + } + } + + workloadName, workloadKind := getWorkloadInfo(pod) + pendingAt := pod.Status.StartTime.Time + + for _, cs := range pod.Status.ContainerStatuses { + // Only snapshot containers that are running AND ready. + // Non-ready containers are left for the event-driven path to capture, + // avoiding partial records that would block the full record via DoNothing upsert. + if cs.State.Running == nil || !cs.Ready || readyAt == nil { + continue + } + + runningAt := cs.State.Running.StartedAt.Time + if runningAt.IsZero() { + continue + } + + entry := &startupLifecycleEntry{ + namespace: pod.Namespace, + workloadName: workloadName, + workloadKind: workloadKind, + podName: pod.Name, + containerName: cs.Name, + restartCount: cs.RestartCount, + isRestart: cs.RestartCount > 0, + pendingAt: &pendingAt, + runningAt: &runningAt, + } + + var timeToRunningMs, timeToReadyMs *int64 + + ms := runningAt.Sub(pendingAt).Milliseconds() + if ms > 0 { + timeToRunningMs = &ms + } + + ms = readyAt.Sub(pendingAt).Milliseconds() + if ms > 0 { + timeToReadyMs = &ms + } + + c.emitStartupLifecycleEvent(entry, readyAt, timeToRunningMs, timeToReadyMs) + } +} diff --git a/proto/dakr_proto_descriptor.bin b/proto/dakr_proto_descriptor.bin index 68f9cfd4..372ed000 100644 Binary files a/proto/dakr_proto_descriptor.bin and b/proto/dakr_proto_descriptor.bin differ