From b72cab1a6c49288e78182e84688145acd150a2a6 Mon Sep 17 00:00:00 2001
From: Daniel Canter
Date: Tue, 19 Apr 2022 14:32:20 -0700
Subject: [PATCH 01/12] Optimize Windows container statistics

This change adds an optimization for cases where we query HCS for just
statistics through our internal PropertiesV2 interface. There's an
optimization we can make if the user is asking for statistics only, and
it's due to an inefficient way that HCS calculates the private working
set total for a given container. HCS calls NtQuerySystemInformation with
the class SystemProcessInformation, which returns an array containing
system information for *every* process running on the machine. It then
grabs the pids that are running in the container, filters the entries in
the array down to only what's running in that silo, and starts tallying
up the total. This doesn't scale well, as performance gets worse the more
processes are running on the machine in general, not just in the
container. All of the additional information besides the
WorkingSetPrivateSize field is ignored as well, which is wasted work to
fetch.

HCS only lets you grab statistics in an all-or-nothing fashion, so we
can't just grab the private working set ourselves and ask for everything
else separately. The optimization we can make here is to open the silo
ourselves and do the same queries for the rest of the info, as well as
calculating the private working set in a more efficient manner by:

1. Find the pids running in the silo
2. Get a process handle for every process (only need
   PROCESS_QUERY_LIMITED_INFORMATION access)
3. Call NtQueryInformationProcess on each process with the class
   ProcessVmCounters
4. Tally up the total using the field PrivateWorkingSetSize in
   VM_COUNTERS_EX2.

This change additionally:

* Changes the jobcontainers package to use this new way to calculate the
  private working set.
* Changes the query the StorageStats method in the jobobject package uses
  to grab IO counters to match what HCS queries.
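For reviewers, the core of the new path condenses to the loop below. This
is an illustrative sketch assembled from the helpers this patch adds
(jobobject.Pids, the NtQueryInformationProcess wrapper, ProcessVmCounters,
and VM_COUNTERS_EX2), not the exact committed code; the real
implementation is the PrivateWorkingSet method in
internal/jobobject/jobobject.go:

    // Sketch only: tally the private working set for every pid in a silo.
    // Assumes the internal/jobobject and internal/winapi packages from
    // this patch plus golang.org/x/sys/windows and unsafe are imported.
    func privateWorkingSetSketch(job *jobobject.JobObject) (uint64, error) {
        // 1. Find the pids running in the silo.
        pids, err := job.Pids()
        if err != nil {
            return 0, err
        }
        var total uint64
        for _, pid := range pids {
            // 2. PROCESS_QUERY_LIMITED_INFORMATION is the only access needed.
            h, err := windows.OpenProcess(windows.PROCESS_QUERY_LIMITED_INFORMATION, false, pid)
            if err != nil {
                return 0, err
            }
            // 3. Query this one process's VM counters rather than snapshotting
            // every process on the machine with NtQuerySystemInformation.
            var vm winapi.VM_COUNTERS_EX2
            status := winapi.NtQueryInformationProcess(
                h,
                winapi.ProcessVmCounters,
                uintptr(unsafe.Pointer(&vm)),
                uint32(unsafe.Sizeof(vm)),
                nil,
            )
            windows.Close(h)
            if !winapi.NTSuccess(status) {
                return 0, winapi.RtlNtStatusToDosError(status)
            }
            // 4. Tally up the total from PrivateWorkingSetSize.
            total += uint64(vm.PrivateWorkingSetSize)
        }
        return total, nil
    }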
Signed-off-by: Daniel Canter --- internal/hcs/system.go | 112 +++- internal/jobcontainers/jobcontainer.go | 15 +- internal/jobobject/jobobject.go | 55 +- internal/winapi/jobobject.go | 5 +- internal/winapi/process.go | 57 ++ internal/winapi/zsyscall_windows.go | 7 + .../Microsoft/hcsshim/internal/hcs/system.go | 112 +++- .../hcsshim/internal/jobobject/doc.go | 8 + .../hcsshim/internal/jobobject/iocp.go | 113 ++++ .../hcsshim/internal/jobobject/jobobject.go | 585 ++++++++++++++++++ .../hcsshim/internal/jobobject/limits.go | 317 ++++++++++ .../Microsoft/hcsshim/internal/queue/mq.go | 111 ++++ .../hcsshim/internal/winapi/jobobject.go | 5 +- .../hcsshim/internal/winapi/process.go | 57 ++ .../internal/winapi/zsyscall_windows.go | 7 + test/vendor/modules.txt | 2 + 16 files changed, 1546 insertions(+), 22 deletions(-) create mode 100644 test/vendor/github.com/Microsoft/hcsshim/internal/jobobject/doc.go create mode 100644 test/vendor/github.com/Microsoft/hcsshim/internal/jobobject/iocp.go create mode 100644 test/vendor/github.com/Microsoft/hcsshim/internal/jobobject/jobobject.go create mode 100644 test/vendor/github.com/Microsoft/hcsshim/internal/jobobject/limits.go create mode 100644 test/vendor/github.com/Microsoft/hcsshim/internal/queue/mq.go diff --git a/internal/hcs/system.go b/internal/hcs/system.go index 052d08ccc3..79713756e4 100644 --- a/internal/hcs/system.go +++ b/internal/hcs/system.go @@ -6,18 +6,22 @@ import ( "context" "encoding/json" "errors" + "fmt" "strings" "sync" "syscall" + "time" "github.com/Microsoft/hcsshim/internal/cow" "github.com/Microsoft/hcsshim/internal/hcs/schema1" hcsschema "github.com/Microsoft/hcsshim/internal/hcs/schema2" + "github.com/Microsoft/hcsshim/internal/jobobject" "github.com/Microsoft/hcsshim/internal/log" "github.com/Microsoft/hcsshim/internal/oc" "github.com/Microsoft/hcsshim/internal/timeout" "github.com/Microsoft/hcsshim/internal/vmcompute" "go.opencensus.io/trace" + "golang.org/x/sys/windows" ) type System struct { @@ -31,6 +35,7 @@ type System struct { waitError error exitError error os, typ string + startTime time.Time } func newSystem(id string) *System { @@ -40,6 +45,11 @@ func newSystem(id string) *System { } } +// Implementation detail for silo naming, this should NOT be relied upon very heavily. +func siloNameFmt(containerID string) string { + return fmt.Sprintf(`\Container_%s`, containerID) +} + // CreateComputeSystem creates a new compute system with the given configuration but does not start it. func CreateComputeSystem(ctx context.Context, id string, hcsDocumentInterface interface{}) (_ *System, err error) { operation := "hcs::CreateComputeSystem" @@ -197,7 +207,7 @@ func (computeSystem *System) Start(ctx context.Context) (err error) { if err != nil { return makeSystemError(computeSystem, operation, err, events) } - + computeSystem.startTime = time.Now() return nil } @@ -326,6 +336,74 @@ func (computeSystem *System) Properties(ctx context.Context, types ...schema1.Pr return properties, nil } +// statisticsInProc emulates what HCS does to grab statistics for a given container with a small +// change to make grabbing the private working set total much more efficient. +func (computeSystem *System) statisticsInProc(ctx context.Context) (*hcsschema.Properties, error) { + // See if we'd even be able to open the silo. We'll need to be local + // system/system so check first. 
+ usr, err := windows.GetCurrentProcessToken().GetTokenUser() + if err != nil { + return nil, err + } + if !usr.User.Sid.IsWellKnown(windows.WinLocalSystemSid) { + return nil, errors.New("process does not have the right permissions to open the silo") + } + + jobOptions := &jobobject.Options{ + UseNTVariant: true, + Name: siloNameFmt(computeSystem.id), + } + job, err := jobobject.Open(ctx, jobOptions) + if err != nil { + return nil, err + } + defer job.Close() + + memInfo, err := job.QueryMemoryStats() + if err != nil { + return nil, err + } + + processorInfo, err := job.QueryProcessorStats() + if err != nil { + return nil, err + } + + storageInfo, err := job.QueryStorageStats() + if err != nil { + return nil, err + } + + privateWorkingSet, err := job.PrivateWorkingSet() + if err != nil { + return nil, err + } + + return &hcsschema.Properties{ + Statistics: &hcsschema.Statistics{ + Timestamp: time.Now(), + ContainerStartTime: computeSystem.startTime, + Uptime100ns: uint64(time.Since(computeSystem.startTime)) / 100, + Memory: &hcsschema.MemoryStats{ + MemoryUsageCommitBytes: memInfo.JobMemory, + MemoryUsageCommitPeakBytes: memInfo.PeakJobMemoryUsed, + MemoryUsagePrivateWorkingSetBytes: privateWorkingSet, + }, + Processor: &hcsschema.ProcessorStats{ + RuntimeKernel100ns: uint64(processorInfo.TotalKernelTime), + RuntimeUser100ns: uint64(processorInfo.TotalUserTime), + TotalRuntime100ns: uint64(processorInfo.TotalKernelTime + processorInfo.TotalUserTime), + }, + Storage: &hcsschema.StorageStats{ + ReadCountNormalized: uint64(storageInfo.ReadStats.IoCount), + ReadSizeBytes: storageInfo.ReadStats.TotalSize, + WriteCountNormalized: uint64(storageInfo.WriteStats.IoCount), + WriteSizeBytes: storageInfo.WriteStats.TotalSize, + }, + }, + }, nil +} + // PropertiesV2 returns the requested container properties targeting a V2 schema container. func (computeSystem *System) PropertiesV2(ctx context.Context, types ...hcsschema.PropertyType) (*hcsschema.Properties, error) { computeSystem.handleLock.RLock() @@ -333,6 +411,38 @@ func (computeSystem *System) PropertiesV2(ctx context.Context, types ...hcsschem operation := "hcs::System::PropertiesV2" + // There's an optimization we can make here if the user is asking for Statistics only, and this is due to + // an inefficient way that HCS calculates the private working set total for a given container. HCS + // calls NtQuerySystemInformation with the class SystemProcessInformation which returns an array containing + // system information for *every* process running on the machine. They then grab the pids that are running in + // the container and filter down the entries in the array to only what's running in that silo and start tallying + // up the total. This doesn't scale well, as performance gets worse the more processes are running on the + // machine in general and not just in the container. All of the additional information besides the + // WorkingSetPrivateSize field is ignored as well, which is wasted work to fetch. + // + // HCS only lets you grab statistics in an all-or-nothing fashion, so we can't just grab the private + // working set ourselves and ask for everything else separately. The optimization we can make here is + // to open the silo ourselves and do the same queries for the rest of the info, as well as calculating + // the private working set in a more efficient manner by: + // + // 1. Find the pids running in the silo + // 2. 
Get a process handle for every process (only need PROCESS_QUERY_LIMITED_INFORMATION access) + // 3. Call NtQueryInformationProcess on each process with the class ProcessVmCounters + // 4. Tally up the total using the field PrivateWorkingSetSize in VM_COUNTERS_EX2. + // + // The only caveat is the process invoking this code must be running as SYSTEM to open the silo; + // otherwise we'll fall back to asking HCS for the information. + + // First check if we're only asking for Stats to see if we can apply the above optimization. We should + // only do this for containers and not UVMs. + if (len(types) == 1 && types[0] == hcsschema.PTStatistics) && computeSystem.typ == "container" { + properties, err := computeSystem.statisticsInProc(ctx) + // If there were no errors just return the info as is; if not we'll fall back to the HCS route. + if err == nil { + return properties, nil + } + } + queryBytes, err := json.Marshal(hcsschema.PropertyQuery{PropertyTypes: types}) if err != nil { return nil, makeSystemError(computeSystem, operation, err, nil) diff --git a/internal/jobcontainers/jobcontainer.go b/internal/jobcontainers/jobcontainer.go index f0433102a4..997c680e3a 100644 --- a/internal/jobcontainers/jobcontainer.go +++ b/internal/jobcontainers/jobcontainer.go @@ -435,12 +435,9 @@ func (c *JobContainer) PropertiesV2(ctx context.Context, types ...hcsschema.Prop return nil, errors.Wrap(err, "failed to query for job containers storage information") } - var privateWorkingSet uint64 - err = forEachProcessInfo(c.job, func(procInfo *winapi.SYSTEM_PROCESS_INFORMATION) { - privateWorkingSet += uint64(procInfo.WorkingSetPrivateSize) - }) + privateWorkingSet, err := c.job.PrivateWorkingSet() if err != nil { - return nil, errors.Wrap(err, "failed to get private working set for container") + return nil, fmt.Errorf("failed to get private working set for container: %w", err) } return &hcsschema.Properties{ @@ -459,10 +456,10 @@ func (c *JobContainer) PropertiesV2(ctx context.Context, types ...hcsschema.Prop TotalRuntime100ns: uint64(processorInfo.TotalKernelTime + processorInfo.TotalUserTime), }, Storage: &hcsschema.StorageStats{ - ReadCountNormalized: storageInfo.IoInfo.ReadOperationCount, - ReadSizeBytes: storageInfo.IoInfo.ReadTransferCount, - WriteCountNormalized: storageInfo.IoInfo.WriteOperationCount, - WriteSizeBytes: storageInfo.IoInfo.WriteTransferCount, + ReadCountNormalized: uint64(storageInfo.ReadStats.IoCount), + ReadSizeBytes: storageInfo.ReadStats.TotalSize, + WriteCountNormalized: uint64(storageInfo.WriteStats.IoCount), + WriteSizeBytes: storageInfo.WriteStats.TotalSize, }, }, }, nil diff --git a/internal/jobobject/jobobject.go b/internal/jobobject/jobobject.go index 7f2003f155..6b8ada3c5d 100644 --- a/internal/jobobject/jobobject.go +++ b/internal/jobobject/jobobject.go @@ -430,18 +430,13 @@ func (job *JobObject) QueryProcessorStats() (*winapi.JOBOBJECT_BASIC_ACCOUNTING_ } // QueryStorageStats gets the storage (I/O) stats for the job object. 
-func (job *JobObject) QueryStorageStats() (*winapi.JOBOBJECT_BASIC_AND_IO_ACCOUNTING_INFORMATION, error) { - job.handleLock.RLock() - defer job.handleLock.RUnlock() - - if job.handle == 0 { - return nil, ErrAlreadyClosed +func (job *JobObject) QueryStorageStats() (*winapi.JOBOBJECT_IO_ATTRIBUTION_INFORMATION, error) { + info := winapi.JOBOBJECT_IO_ATTRIBUTION_INFORMATION{ + ControlFlags: winapi.JOBOBJECT_IO_ATTRIBUTION_CONTROL_ENABLE, } - - info := winapi.JOBOBJECT_BASIC_AND_IO_ACCOUNTING_INFORMATION{} if err := winapi.QueryInformationJobObject( job.handle, - winapi.JobObjectBasicAndIoAccountingInformation, + winapi.JobObjectIoAttribution, uintptr(unsafe.Pointer(&info)), uint32(unsafe.Sizeof(info)), nil, @@ -546,3 +541,45 @@ func (job *JobObject) PromoteToSilo() error { func (job *JobObject) isSilo() bool { return atomic.LoadUint32(&job.isAppSilo) == 1 } + +// PrivateWorkingSet returns the private working set size for the job. This is calculated by adding up the +// private working set for every process running in the job. +func (job *JobObject) PrivateWorkingSet() (uint64, error) { + pids, err := job.Pids() + if err != nil { + return 0, err + } + + openAndQuery := func(pid uint32) (uint64, error) { + h, err := windows.OpenProcess(windows.PROCESS_QUERY_LIMITED_INFORMATION, false, pid) + if err != nil { + return 0, fmt.Errorf("failed to open process with pid %d: %w", pid, err) + } + defer func() { + _ = windows.Close(h) + }() + + var vmCounters winapi.VM_COUNTERS_EX2 + status := winapi.NtQueryInformationProcess( + h, + winapi.ProcessVmCounters, + uintptr(unsafe.Pointer(&vmCounters)), + uint32(unsafe.Sizeof(vmCounters)), + nil, + ) + if !winapi.NTSuccess(status) { + return 0, fmt.Errorf("failed to query information for process with pid %d: %w", pid, winapi.RtlNtStatusToDosError(status)) + } + return uint64(vmCounters.PrivateWorkingSetSize), nil + } + + var jobWorkingSetSize uint64 + for _, pid := range pids { + workingSet, err := openAndQuery(pid) + if err != nil { + return 0, err + } + jobWorkingSetSize += workingSet + } + return jobWorkingSetSize, nil +} diff --git a/internal/winapi/jobobject.go b/internal/winapi/jobobject.go index ecf7c1c2e4..e770a570e6 100644 --- a/internal/winapi/jobobject.go +++ b/internal/winapi/jobobject.go @@ -26,7 +26,10 @@ const ( // Access rights for creating or opening job objects. // // https://docs.microsoft.com/en-us/windows/win32/procthread/job-object-security-and-access-rights -const JOB_OBJECT_ALL_ACCESS = 0x1F001F +const ( + JOB_OBJECT_QUERY = 0x0004 + JOB_OBJECT_ALL_ACCESS = 0x1F001F +) // IO limit flags // diff --git a/internal/winapi/process.go b/internal/winapi/process.go index 37839435b9..5f9e03fd28 100644 --- a/internal/winapi/process.go +++ b/internal/winapi/process.go @@ -6,3 +6,60 @@ const ( PROC_THREAD_ATTRIBUTE_PSEUDOCONSOLE = 0x20016 PROC_THREAD_ATTRIBUTE_JOB_LIST = 0x2000D ) + +// ProcessVmCounters corresponds to the _VM_COUNTERS_EX and _VM_COUNTERS_EX2 structures. 
+const ProcessVmCounters = 3 + +// __kernel_entry NTSTATUS NtQueryInformationProcess( +// [in] HANDLE ProcessHandle, +// [in] PROCESSINFOCLASS ProcessInformationClass, +// [out] PVOID ProcessInformation, +// [in] ULONG ProcessInformationLength, +// [out, optional] PULONG ReturnLength +// ); +// +//sys NtQueryInformationProcess(processHandle windows.Handle, processInfoClass uint32, processInfo uintptr, processInfoLength uint32, returnLength *uint32) (status uint32) = ntdll.NtQueryInformationProcess + +// typedef struct _VM_COUNTERS_EX +// { +// SIZE_T PeakVirtualSize; +// SIZE_T VirtualSize; +// ULONG PageFaultCount; +// SIZE_T PeakWorkingSetSize; +// SIZE_T WorkingSetSize; +// SIZE_T QuotaPeakPagedPoolUsage; +// SIZE_T QuotaPagedPoolUsage; +// SIZE_T QuotaPeakNonPagedPoolUsage; +// SIZE_T QuotaNonPagedPoolUsage; +// SIZE_T PagefileUsage; +// SIZE_T PeakPagefileUsage; +// SIZE_T PrivateUsage; +// } VM_COUNTERS_EX, *PVM_COUNTERS_EX; +// +type VM_COUNTERS_EX struct { + PeakVirtualSize uintptr + VirtualSize uintptr + PageFaultCount uint32 + PeakWorkingSetSize uintptr + WorkingSetSize uintptr + QuotaPeakPagedPoolUsage uintptr + QuotaPagedPoolUsage uintptr + QuotaPeakNonPagedPoolUsage uintptr + QuotaNonPagedPoolUsage uintptr + PagefileUsage uintptr + PeakPagefileUsage uintptr + PrivateUsage uintptr +} + +// typedef struct _VM_COUNTERS_EX2 +// { +// VM_COUNTERS_EX CountersEx; +// SIZE_T PrivateWorkingSetSize; +// SIZE_T SharedCommitUsage; +// } VM_COUNTERS_EX2, *PVM_COUNTERS_EX2; +// +type VM_COUNTERS_EX2 struct { + CountersEx VM_COUNTERS_EX + PrivateWorkingSetSize uintptr + SharedCommitUsage uintptr +} diff --git a/internal/winapi/zsyscall_windows.go b/internal/winapi/zsyscall_windows.go index 935219b063..7a1c241389 100644 --- a/internal/winapi/zsyscall_windows.go +++ b/internal/winapi/zsyscall_windows.go @@ -67,6 +67,7 @@ var ( procLogonUserW = modadvapi32.NewProc("LogonUserW") procLocalAlloc = modkernel32.NewProc("LocalAlloc") procLocalFree = modkernel32.NewProc("LocalFree") + procNtQueryInformationProcess = modntdll.NewProc("NtQueryInformationProcess") procGetActiveProcessorCount = modkernel32.NewProc("GetActiveProcessorCount") procCM_Get_Device_ID_List_SizeA = modcfgmgr32.NewProc("CM_Get_Device_ID_List_SizeA") procCM_Get_Device_ID_ListA = modcfgmgr32.NewProc("CM_Get_Device_ID_ListA") @@ -296,6 +297,12 @@ func LocalFree(ptr uintptr) { return } +func NtQueryInformationProcess(processHandle windows.Handle, processInfoClass uint32, processInfo uintptr, processInfoLength uint32, returnLength *uint32) (status uint32) { + r0, _, _ := syscall.Syscall6(procNtQueryInformationProcess.Addr(), 5, uintptr(processHandle), uintptr(processInfoClass), uintptr(processInfo), uintptr(processInfoLength), uintptr(unsafe.Pointer(returnLength)), 0) + status = uint32(r0) + return +} + func GetActiveProcessorCount(groupNumber uint16) (amount uint32) { r0, _, _ := syscall.Syscall(procGetActiveProcessorCount.Addr(), 1, uintptr(groupNumber), 0, 0) amount = uint32(r0) diff --git a/test/vendor/github.com/Microsoft/hcsshim/internal/hcs/system.go b/test/vendor/github.com/Microsoft/hcsshim/internal/hcs/system.go index 052d08ccc3..79713756e4 100644 --- a/test/vendor/github.com/Microsoft/hcsshim/internal/hcs/system.go +++ b/test/vendor/github.com/Microsoft/hcsshim/internal/hcs/system.go @@ -6,18 +6,22 @@ import ( "context" "encoding/json" "errors" + "fmt" "strings" "sync" "syscall" + "time" "github.com/Microsoft/hcsshim/internal/cow" "github.com/Microsoft/hcsshim/internal/hcs/schema1" hcsschema 
"github.com/Microsoft/hcsshim/internal/hcs/schema2" + "github.com/Microsoft/hcsshim/internal/jobobject" "github.com/Microsoft/hcsshim/internal/log" "github.com/Microsoft/hcsshim/internal/oc" "github.com/Microsoft/hcsshim/internal/timeout" "github.com/Microsoft/hcsshim/internal/vmcompute" "go.opencensus.io/trace" + "golang.org/x/sys/windows" ) type System struct { @@ -31,6 +35,7 @@ type System struct { waitError error exitError error os, typ string + startTime time.Time } func newSystem(id string) *System { @@ -40,6 +45,11 @@ func newSystem(id string) *System { } } +// Implementation detail for silo naming, this should NOT be relied upon very heavily. +func siloNameFmt(containerID string) string { + return fmt.Sprintf(`\Container_%s`, containerID) +} + // CreateComputeSystem creates a new compute system with the given configuration but does not start it. func CreateComputeSystem(ctx context.Context, id string, hcsDocumentInterface interface{}) (_ *System, err error) { operation := "hcs::CreateComputeSystem" @@ -197,7 +207,7 @@ func (computeSystem *System) Start(ctx context.Context) (err error) { if err != nil { return makeSystemError(computeSystem, operation, err, events) } - + computeSystem.startTime = time.Now() return nil } @@ -326,6 +336,74 @@ func (computeSystem *System) Properties(ctx context.Context, types ...schema1.Pr return properties, nil } +// statisticsInProc emulates what HCS does to grab statistics for a given container with a small +// change to make grabbing the private working set total much more efficient. +func (computeSystem *System) statisticsInProc(ctx context.Context) (*hcsschema.Properties, error) { + // See if we'd even be able to open the silo. We'll need to be local + // system/system so check first. + usr, err := windows.GetCurrentProcessToken().GetTokenUser() + if err != nil { + return nil, err + } + if !usr.User.Sid.IsWellKnown(windows.WinLocalSystemSid) { + return nil, errors.New("process does not have the right permissions to open the silo") + } + + jobOptions := &jobobject.Options{ + UseNTVariant: true, + Name: siloNameFmt(computeSystem.id), + } + job, err := jobobject.Open(ctx, jobOptions) + if err != nil { + return nil, err + } + defer job.Close() + + memInfo, err := job.QueryMemoryStats() + if err != nil { + return nil, err + } + + processorInfo, err := job.QueryProcessorStats() + if err != nil { + return nil, err + } + + storageInfo, err := job.QueryStorageStats() + if err != nil { + return nil, err + } + + privateWorkingSet, err := job.PrivateWorkingSet() + if err != nil { + return nil, err + } + + return &hcsschema.Properties{ + Statistics: &hcsschema.Statistics{ + Timestamp: time.Now(), + ContainerStartTime: computeSystem.startTime, + Uptime100ns: uint64(time.Since(computeSystem.startTime)) / 100, + Memory: &hcsschema.MemoryStats{ + MemoryUsageCommitBytes: memInfo.JobMemory, + MemoryUsageCommitPeakBytes: memInfo.PeakJobMemoryUsed, + MemoryUsagePrivateWorkingSetBytes: privateWorkingSet, + }, + Processor: &hcsschema.ProcessorStats{ + RuntimeKernel100ns: uint64(processorInfo.TotalKernelTime), + RuntimeUser100ns: uint64(processorInfo.TotalUserTime), + TotalRuntime100ns: uint64(processorInfo.TotalKernelTime + processorInfo.TotalUserTime), + }, + Storage: &hcsschema.StorageStats{ + ReadCountNormalized: uint64(storageInfo.ReadStats.IoCount), + ReadSizeBytes: storageInfo.ReadStats.TotalSize, + WriteCountNormalized: uint64(storageInfo.WriteStats.IoCount), + WriteSizeBytes: storageInfo.WriteStats.TotalSize, + }, + }, + }, nil +} + // PropertiesV2 returns the 
requested container properties targeting a V2 schema container. func (computeSystem *System) PropertiesV2(ctx context.Context, types ...hcsschema.PropertyType) (*hcsschema.Properties, error) { computeSystem.handleLock.RLock() @@ -333,6 +411,38 @@ func (computeSystem *System) PropertiesV2(ctx context.Context, types ...hcsschem operation := "hcs::System::PropertiesV2" + // There's an optimization we can make here if the user is asking for Statistics only, and this is due to + // an inefficient way that HCS calculates the private working set total for a given container. HCS + // calls NtQuerySystemInformation with the class SystemProcessInformation which returns an array containing + // system information for *every* process running on the machine. They then grab the pids that are running in + // the container and filter down the entries in the array to only what's running in that silo and start tallying + // up the total. This doesn't scale well, as performance gets worse the more processes are running on the + // machine in general and not just in the container. All of the additional information besides the + // WorkingSetPrivateSize field is ignored as well, which is wasted work to fetch. + // + // HCS only lets you grab statistics in an all-or-nothing fashion, so we can't just grab the private + // working set ourselves and ask for everything else separately. The optimization we can make here is + // to open the silo ourselves and do the same queries for the rest of the info, as well as calculating + // the private working set in a more efficient manner by: + // + // 1. Find the pids running in the silo + // 2. Get a process handle for every process (only need PROCESS_QUERY_LIMITED_INFORMATION access) + // 3. Call NtQueryInformationProcess on each process with the class ProcessVmCounters + // 4. Tally up the total using the field PrivateWorkingSetSize in VM_COUNTERS_EX2. + // + // The only caveat is the process invoking this code must be running as SYSTEM to open the silo; + // otherwise we'll fall back to asking HCS for the information. + + // First check if we're only asking for Stats to see if we can apply the above optimization. We should + // only do this for containers and not UVMs. + if (len(types) == 1 && types[0] == hcsschema.PTStatistics) && computeSystem.typ == "container" { + properties, err := computeSystem.statisticsInProc(ctx) + // If there were no errors just return the info as is; if not we'll fall back to the HCS route. + if err == nil { + return properties, nil + } + } + queryBytes, err := json.Marshal(hcsschema.PropertyQuery{PropertyTypes: types}) if err != nil { return nil, makeSystemError(computeSystem, operation, err, nil) diff --git a/test/vendor/github.com/Microsoft/hcsshim/internal/jobobject/doc.go b/test/vendor/github.com/Microsoft/hcsshim/internal/jobobject/doc.go new file mode 100644 index 0000000000..34b53d6e48 --- /dev/null +++ b/test/vendor/github.com/Microsoft/hcsshim/internal/jobobject/doc.go @@ -0,0 +1,8 @@ +// This package provides higher level constructs for the win32 job object API. +// Most of the core creation and management functions are already present in "golang.org/x/sys/windows" +// (CreateJobObject, AssignProcessToJobObject, etc.) as well as most of the limit information +// structs and associated limit flags. Whatever is not present from the job object API +// in golang.org/x/sys/windows is located in /internal/winapi. 
+// +// https://docs.microsoft.com/en-us/windows/win32/procthread/job-objects +package jobobject diff --git a/test/vendor/github.com/Microsoft/hcsshim/internal/jobobject/iocp.go b/test/vendor/github.com/Microsoft/hcsshim/internal/jobobject/iocp.go new file mode 100644 index 0000000000..d31a6a1e66 --- /dev/null +++ b/test/vendor/github.com/Microsoft/hcsshim/internal/jobobject/iocp.go @@ -0,0 +1,113 @@ +//go:build windows + +package jobobject + +import ( + "context" + "fmt" + "sync" + "unsafe" + + "github.com/Microsoft/hcsshim/internal/log" + "github.com/Microsoft/hcsshim/internal/queue" + "github.com/Microsoft/hcsshim/internal/winapi" + "github.com/sirupsen/logrus" + "golang.org/x/sys/windows" +) + +var ( + ioInitOnce sync.Once + initIOErr error + // Global iocp handle that will be re-used for every job object + ioCompletionPort windows.Handle + // Mapping of job handle to queue to place notifications in. + jobMap sync.Map +) + +// MsgAllProcessesExited is a type representing a message that every process in a job has exited. +type MsgAllProcessesExited struct{} + +// MsgUnimplemented represents a message that we are aware of, but that isn't implemented currently. +// This should not be treated as an error. +type MsgUnimplemented struct{} + +// pollIOCP polls the io completion port forever. +func pollIOCP(ctx context.Context, iocpHandle windows.Handle) { + var ( + overlapped uintptr + code uint32 + key uintptr + ) + + for { + err := windows.GetQueuedCompletionStatus(iocpHandle, &code, &key, (**windows.Overlapped)(unsafe.Pointer(&overlapped)), windows.INFINITE) + if err != nil { + log.G(ctx).WithError(err).Error("failed to poll for job object message") + continue + } + if val, ok := jobMap.Load(key); ok { + msq, ok := val.(*queue.MessageQueue) + if !ok { + log.G(ctx).WithField("value", msq).Warn("encountered non queue type in job map") + continue + } + notification, err := parseMessage(code, overlapped) + if err != nil { + log.G(ctx).WithFields(logrus.Fields{ + "code": code, + "overlapped": overlapped, + }).Warn("failed to parse job object message") + continue + } + if err := msq.Write(notification); err == queue.ErrQueueClosed { + // Write will only return an error when the queue is closed. + // The only time a queue would ever be closed is when we call `Close` on + // the job it belongs to which also removes it from the jobMap, so something + // went wrong here. We can't return as this is reading messages for all jobs + // so just log it and move on. + log.G(ctx).WithFields(logrus.Fields{ + "code": code, + "overlapped": overlapped, + }).Warn("tried to write to a closed queue") + continue + } + } else { + log.G(ctx).Warn("received a message for a job not present in the mapping") + } + } +} + +func parseMessage(code uint32, overlapped uintptr) (interface{}, error) { + // Check code and parse out relevant information related to that notification + // that we care about. For now all we handle is the message that all processes + // in the job have exited. + switch code { + case winapi.JOB_OBJECT_MSG_ACTIVE_PROCESS_ZERO: + return MsgAllProcessesExited{}, nil + // Other messages for completeness and a check to make sure that if we fall + // into the default case that this is a code we don't know how to handle. 
+ case winapi.JOB_OBJECT_MSG_END_OF_JOB_TIME: + case winapi.JOB_OBJECT_MSG_END_OF_PROCESS_TIME: + case winapi.JOB_OBJECT_MSG_ACTIVE_PROCESS_LIMIT: + case winapi.JOB_OBJECT_MSG_NEW_PROCESS: + case winapi.JOB_OBJECT_MSG_EXIT_PROCESS: + case winapi.JOB_OBJECT_MSG_ABNORMAL_EXIT_PROCESS: + case winapi.JOB_OBJECT_MSG_PROCESS_MEMORY_LIMIT: + case winapi.JOB_OBJECT_MSG_JOB_MEMORY_LIMIT: + case winapi.JOB_OBJECT_MSG_NOTIFICATION_LIMIT: + default: + return nil, fmt.Errorf("unknown job notification type: %d", code) + } + return MsgUnimplemented{}, nil +} + +// Assigns an IO completion port to get notified of events for the registered job +// object. +func attachIOCP(job windows.Handle, iocp windows.Handle) error { + info := winapi.JOBOBJECT_ASSOCIATE_COMPLETION_PORT{ + CompletionKey: job, + CompletionPort: iocp, + } + _, err := windows.SetInformationJobObject(job, windows.JobObjectAssociateCompletionPortInformation, uintptr(unsafe.Pointer(&info)), uint32(unsafe.Sizeof(info))) + return err +} diff --git a/test/vendor/github.com/Microsoft/hcsshim/internal/jobobject/jobobject.go b/test/vendor/github.com/Microsoft/hcsshim/internal/jobobject/jobobject.go new file mode 100644 index 0000000000..6b8ada3c5d --- /dev/null +++ b/test/vendor/github.com/Microsoft/hcsshim/internal/jobobject/jobobject.go @@ -0,0 +1,585 @@ +//go:build windows + +package jobobject + +import ( + "context" + "errors" + "fmt" + "os" + "path/filepath" + "sync" + "sync/atomic" + "unsafe" + + "github.com/Microsoft/hcsshim/internal/queue" + "github.com/Microsoft/hcsshim/internal/winapi" + "golang.org/x/sys/windows" +) + +// JobObject is a high level wrapper around a Windows job object. Holds a handle to +// the job, a queue to receive iocp notifications about the lifecycle +// of the job and a mutex for synchronized handle access. +type JobObject struct { + handle windows.Handle + // All accesses to this MUST be done atomically. 1 signifies that this + // job is currently a silo. + isAppSilo uint32 + mq *queue.MessageQueue + handleLock sync.RWMutex +} + +// JobLimits represents the resource constraints that can be applied to a job object. +type JobLimits struct { + CPULimit uint32 + CPUWeight uint32 + MemoryLimitInBytes uint64 + MaxIOPS int64 + MaxBandwidth int64 +} + +type CPURateControlType uint32 + +const ( + WeightBased CPURateControlType = iota + RateBased +) + +// Processor resource controls +const ( + cpuLimitMin = 1 + cpuLimitMax = 10000 + cpuWeightMin = 1 + cpuWeightMax = 9 +) + +var ( + ErrAlreadyClosed = errors.New("the handle has already been closed") + ErrNotRegistered = errors.New("job is not registered to receive notifications") + ErrNotSilo = errors.New("job is not a silo") +) + +// Options represents the set of configurable options when making or opening a job object. +type Options struct { + // `Name` specifies the name of the job object if a named job object is desired. + Name string + // `Notifications` specifies if the job will be registered to receive notifications. + // Defaults to false. + Notifications bool + // `UseNTVariant` specifies if we should use the `Nt` variant of Open/CreateJobObject. + // Defaults to false. + UseNTVariant bool + // `Silo` specifies to promote the job to a silo. This additionally sets the flag + // JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE as it is required for the upgrade to complete. + Silo bool +} + +// Create creates a job object. +// +// If options.Name is an empty string, the job will not be assigned a name. 
+// +// If options.Notifications are not enabled `PollNotification` will return immediately with error `ErrNotRegistered`. +// +// If `options` is nil, use default option values. +// +// Returns a JobObject structure and an error if there is one. +func Create(ctx context.Context, options *Options) (_ *JobObject, err error) { + if options == nil { + options = &Options{} + } + + var jobName *winapi.UnicodeString + if options.Name != "" { + jobName, err = winapi.NewUnicodeString(options.Name) + if err != nil { + return nil, err + } + } + + var jobHandle windows.Handle + if options.UseNTVariant { + oa := winapi.ObjectAttributes{ + Length: unsafe.Sizeof(winapi.ObjectAttributes{}), + ObjectName: jobName, + Attributes: 0, + } + status := winapi.NtCreateJobObject(&jobHandle, winapi.JOB_OBJECT_ALL_ACCESS, &oa) + if status != 0 { + return nil, winapi.RtlNtStatusToDosError(status) + } + } else { + var jobNameBuf *uint16 + if jobName != nil && jobName.Buffer != nil { + jobNameBuf = jobName.Buffer + } + jobHandle, err = windows.CreateJobObject(nil, jobNameBuf) + if err != nil { + return nil, err + } + } + + defer func() { + if err != nil { + windows.Close(jobHandle) + } + }() + + job := &JobObject{ + handle: jobHandle, + } + + // If the IOCP we'll be using to receive messages for all jobs hasn't been + // created, create it and start polling. + if options.Notifications { + mq, err := setupNotifications(ctx, job) + if err != nil { + return nil, err + } + job.mq = mq + } + + if options.Silo { + // This is a required setting for upgrading to a silo. + if err := job.SetTerminateOnLastHandleClose(); err != nil { + return nil, err + } + if err := job.PromoteToSilo(); err != nil { + return nil, err + } + } + + return job, nil +} + +// Open opens an existing job object with name provided in `options`. If no name is provided +// return an error since we need to know what job object to open. +// +// If options.Notifications is false `PollNotification` will return immediately with error `ErrNotRegistered`. +// +// Returns a JobObject structure and an error if there is one. +func Open(ctx context.Context, options *Options) (_ *JobObject, err error) { + if options == nil || (options != nil && options.Name == "") { + return nil, errors.New("no job object name specified to open") + } + + unicodeJobName, err := winapi.NewUnicodeString(options.Name) + if err != nil { + return nil, err + } + + var jobHandle windows.Handle + if options != nil && options.UseNTVariant { + oa := winapi.ObjectAttributes{ + Length: unsafe.Sizeof(winapi.ObjectAttributes{}), + ObjectName: unicodeJobName, + Attributes: 0, + } + status := winapi.NtOpenJobObject(&jobHandle, winapi.JOB_OBJECT_ALL_ACCESS, &oa) + if status != 0 { + return nil, winapi.RtlNtStatusToDosError(status) + } + } else { + jobHandle, err = winapi.OpenJobObject(winapi.JOB_OBJECT_ALL_ACCESS, false, unicodeJobName.Buffer) + if err != nil { + return nil, err + } + } + + defer func() { + if err != nil { + windows.Close(jobHandle) + } + }() + + job := &JobObject{ + handle: jobHandle, + } + + // If the IOCP we'll be using to receive messages for all jobs hasn't been + // created, create it and start polling. 
+ if options != nil && options.Notifications { + mq, err := setupNotifications(ctx, job) + if err != nil { + return nil, err + } + job.mq = mq + } + + return job, nil +} + +// Helper function to set up notifications for creating/opening a job object. +func setupNotifications(ctx context.Context, job *JobObject) (*queue.MessageQueue, error) { + job.handleLock.RLock() + defer job.handleLock.RUnlock() + + if job.handle == 0 { + return nil, ErrAlreadyClosed + } + + ioInitOnce.Do(func() { + h, err := windows.CreateIoCompletionPort(windows.InvalidHandle, 0, 0, 0xffffffff) + if err != nil { + initIOErr = err + return + } + ioCompletionPort = h + go pollIOCP(ctx, h) + }) + + if initIOErr != nil { + return nil, initIOErr + } + + mq := queue.NewMessageQueue() + jobMap.Store(uintptr(job.handle), mq) + if err := attachIOCP(job.handle, ioCompletionPort); err != nil { + jobMap.Delete(uintptr(job.handle)) + return nil, fmt.Errorf("failed to attach job to IO completion port: %w", err) + } + return mq, nil +} + +// PollNotification will poll for a job object notification. This call should only be called once +// per job (ideally in a goroutine loop) and will block if there is not a notification ready. +// This call will return immediately with error `ErrNotRegistered` if the job was not registered +// to receive notifications during `Create`. Internally, messages will be queued and there +// is no worry of messages being dropped. +func (job *JobObject) PollNotification() (interface{}, error) { + if job.mq == nil { + return nil, ErrNotRegistered + } + return job.mq.ReadOrWait() +} + +// UpdateProcThreadAttribute updates the passed in ProcThreadAttributeList to contain what is necessary to +// launch a process in a job at creation time. This can be used to avoid having to call Assign() after a process +// has already started running. +func (job *JobObject) UpdateProcThreadAttribute(attrList *windows.ProcThreadAttributeListContainer) error { + job.handleLock.RLock() + defer job.handleLock.RUnlock() + + if job.handle == 0 { + return ErrAlreadyClosed + } + + if err := attrList.Update( + winapi.PROC_THREAD_ATTRIBUTE_JOB_LIST, + unsafe.Pointer(&job.handle), + unsafe.Sizeof(job.handle), + ); err != nil { + return fmt.Errorf("failed to update proc thread attributes for job object: %w", err) + } + + return nil +} + +// Close closes the job object handle. +func (job *JobObject) Close() error { + job.handleLock.Lock() + defer job.handleLock.Unlock() + + if job.handle == 0 { + return ErrAlreadyClosed + } + + if err := windows.Close(job.handle); err != nil { + return err + } + + if job.mq != nil { + job.mq.Close() + } + // The handle is now invalid, so if the map entry to receive notifications for this job still + // exists, remove it so we can stop receiving notifications. + if _, ok := jobMap.Load(uintptr(job.handle)); ok { + jobMap.Delete(uintptr(job.handle)) + } + + job.handle = 0 + return nil +} + +// Assign assigns a process to the job object. +func (job *JobObject) Assign(pid uint32) error { + job.handleLock.RLock() + defer job.handleLock.RUnlock() + + if job.handle == 0 { + return ErrAlreadyClosed + } + + if pid == 0 { + return errors.New("invalid pid: 0") + } + hProc, err := windows.OpenProcess(winapi.PROCESS_ALL_ACCESS, true, pid) + if err != nil { + return err + } + defer windows.Close(hProc) + return windows.AssignProcessToJobObject(job.handle, hProc) +} + +// Terminate terminates the job, essentially calls TerminateProcess on every process in the +// job. 
+func (job *JobObject) Terminate(exitCode uint32) error { + job.handleLock.RLock() + defer job.handleLock.RUnlock() + if job.handle == 0 { + return ErrAlreadyClosed + } + return windows.TerminateJobObject(job.handle, exitCode) +} + +// Pids returns all of the process IDs in the job object. +func (job *JobObject) Pids() ([]uint32, error) { + job.handleLock.RLock() + defer job.handleLock.RUnlock() + + if job.handle == 0 { + return nil, ErrAlreadyClosed + } + + info := winapi.JOBOBJECT_BASIC_PROCESS_ID_LIST{} + err := winapi.QueryInformationJobObject( + job.handle, + winapi.JobObjectBasicProcessIdList, + uintptr(unsafe.Pointer(&info)), + uint32(unsafe.Sizeof(info)), + nil, + ) + + // This is either the case where there is only one process or no processes in + // the job. Any other case will result in ERROR_MORE_DATA. Check if info.NumberOfProcessIdsInList + // is 1 and just return this, otherwise return an empty slice. + if err == nil { + if info.NumberOfProcessIdsInList == 1 { + return []uint32{uint32(info.ProcessIdList[0])}, nil + } + // Return empty slice instead of nil to play well with the caller of this. + // Do not return an error if no processes are running inside the job + return []uint32{}, nil + } + + if err != winapi.ERROR_MORE_DATA { + return nil, fmt.Errorf("failed initial query for PIDs in job object: %w", err) + } + + jobBasicProcessIDListSize := unsafe.Sizeof(info) + (unsafe.Sizeof(info.ProcessIdList[0]) * uintptr(info.NumberOfAssignedProcesses-1)) + buf := make([]byte, jobBasicProcessIDListSize) + if err = winapi.QueryInformationJobObject( + job.handle, + winapi.JobObjectBasicProcessIdList, + uintptr(unsafe.Pointer(&buf[0])), + uint32(len(buf)), + nil, + ); err != nil { + return nil, fmt.Errorf("failed to query for PIDs in job object: %w", err) + } + + bufInfo := (*winapi.JOBOBJECT_BASIC_PROCESS_ID_LIST)(unsafe.Pointer(&buf[0])) + pids := make([]uint32, bufInfo.NumberOfProcessIdsInList) + for i, bufPid := range bufInfo.AllPids() { + pids[i] = uint32(bufPid) + } + return pids, nil +} + +// QueryMemoryStats gets the memory stats for the job object. +func (job *JobObject) QueryMemoryStats() (*winapi.JOBOBJECT_MEMORY_USAGE_INFORMATION, error) { + job.handleLock.RLock() + defer job.handleLock.RUnlock() + + if job.handle == 0 { + return nil, ErrAlreadyClosed + } + + info := winapi.JOBOBJECT_MEMORY_USAGE_INFORMATION{} + if err := winapi.QueryInformationJobObject( + job.handle, + winapi.JobObjectMemoryUsageInformation, + uintptr(unsafe.Pointer(&info)), + uint32(unsafe.Sizeof(info)), + nil, + ); err != nil { + return nil, fmt.Errorf("failed to query for job object memory stats: %w", err) + } + return &info, nil +} + +// QueryProcessorStats gets the processor stats for the job object. +func (job *JobObject) QueryProcessorStats() (*winapi.JOBOBJECT_BASIC_ACCOUNTING_INFORMATION, error) { + job.handleLock.RLock() + defer job.handleLock.RUnlock() + + if job.handle == 0 { + return nil, ErrAlreadyClosed + } + + info := winapi.JOBOBJECT_BASIC_ACCOUNTING_INFORMATION{} + if err := winapi.QueryInformationJobObject( + job.handle, + winapi.JobObjectBasicAccountingInformation, + uintptr(unsafe.Pointer(&info)), + uint32(unsafe.Sizeof(info)), + nil, + ); err != nil { + return nil, fmt.Errorf("failed to query for job object process stats: %w", err) + } + return &info, nil +} + +// QueryStorageStats gets the storage (I/O) stats for the job object. 
+func (job *JobObject) QueryStorageStats() (*winapi.JOBOBJECT_IO_ATTRIBUTION_INFORMATION, error) { + info := winapi.JOBOBJECT_IO_ATTRIBUTION_INFORMATION{ + ControlFlags: winapi.JOBOBJECT_IO_ATTRIBUTION_CONTROL_ENABLE, + } + if err := winapi.QueryInformationJobObject( + job.handle, + winapi.JobObjectIoAttribution, + uintptr(unsafe.Pointer(&info)), + uint32(unsafe.Sizeof(info)), + nil, + ); err != nil { + return nil, fmt.Errorf("failed to query for job object storage stats: %w", err) + } + return &info, nil +} + +// ApplyFileBinding makes a file binding using the Bind Filter from target to root. If the job has +// not been upgraded to a silo this call will fail. The binding is only applied and visible for processes +// running in the job; any processes on the host or in another job will not be able to see the binding. +func (job *JobObject) ApplyFileBinding(root, target string, merged bool) error { + job.handleLock.RLock() + defer job.handleLock.RUnlock() + + if job.handle == 0 { + return ErrAlreadyClosed + } + + if !job.isSilo() { + return ErrNotSilo + } + + // The parent directory needs to exist for the bind to work. + if _, err := os.Stat(filepath.Dir(root)); err != nil { + if !os.IsNotExist(err) { + return err + } + if err := os.MkdirAll(filepath.Dir(root), 0); err != nil { + return err + } + } + + rootPtr, err := windows.UTF16PtrFromString(root) + if err != nil { + return err + } + + targetPtr, err := windows.UTF16PtrFromString(target) + if err != nil { + return err + } + + flags := winapi.BINDFLT_FLAG_USE_CURRENT_SILO_MAPPING + if merged { + flags |= winapi.BINDFLT_FLAG_MERGED_BIND_MAPPING + } + + if err := winapi.BfSetupFilterEx( + flags, + job.handle, + nil, + rootPtr, + targetPtr, + nil, + 0, + ); err != nil { + return fmt.Errorf("failed to bind target %q to root %q for job object: %w", target, root, err) + } + return nil +} + +// PromoteToSilo promotes a job object to a silo. There must be no running processes +// in the job for this to succeed. If the job is already a silo this is a no-op. +func (job *JobObject) PromoteToSilo() error { + job.handleLock.RLock() + defer job.handleLock.RUnlock() + + if job.handle == 0 { + return ErrAlreadyClosed + } + + if job.isSilo() { + return nil + } + + pids, err := job.Pids() + if err != nil { + return err + } + + if len(pids) != 0 { + return fmt.Errorf("job cannot have running processes to be promoted to a silo, found %d running processes", len(pids)) + } + + _, err = windows.SetInformationJobObject( + job.handle, + winapi.JobObjectCreateSilo, + 0, + 0, + ) + if err != nil { + return fmt.Errorf("failed to promote job to silo: %w", err) + } + + atomic.StoreUint32(&job.isAppSilo, 1) + return nil +} + +// isSilo returns whether the job object is a silo. +func (job *JobObject) isSilo() bool { + return atomic.LoadUint32(&job.isAppSilo) == 1 +} + +// PrivateWorkingSet returns the private working set size for the job. This is calculated by adding up the +// private working set for every process running in the job. 
+func (job *JobObject) PrivateWorkingSet() (uint64, error) { + pids, err := job.Pids() + if err != nil { + return 0, err + } + + openAndQuery := func(pid uint32) (uint64, error) { + h, err := windows.OpenProcess(windows.PROCESS_QUERY_LIMITED_INFORMATION, false, pid) + if err != nil { + return 0, fmt.Errorf("failed to open process with pid %d: %w", pid, err) + } + defer func() { + _ = windows.Close(h) + }() + + var vmCounters winapi.VM_COUNTERS_EX2 + status := winapi.NtQueryInformationProcess( + h, + winapi.ProcessVmCounters, + uintptr(unsafe.Pointer(&vmCounters)), + uint32(unsafe.Sizeof(vmCounters)), + nil, + ) + if !winapi.NTSuccess(status) { + return 0, fmt.Errorf("failed to query information for process with pid %d: %w", pid, winapi.RtlNtStatusToDosError(status)) + } + return uint64(vmCounters.PrivateWorkingSetSize), nil + } + + var jobWorkingSetSize uint64 + for _, pid := range pids { + workingSet, err := openAndQuery(pid) + if err != nil { + return 0, err + } + jobWorkingSetSize += workingSet + } + return jobWorkingSetSize, nil +} diff --git a/test/vendor/github.com/Microsoft/hcsshim/internal/jobobject/limits.go b/test/vendor/github.com/Microsoft/hcsshim/internal/jobobject/limits.go new file mode 100644 index 0000000000..8c0c979402 --- /dev/null +++ b/test/vendor/github.com/Microsoft/hcsshim/internal/jobobject/limits.go @@ -0,0 +1,317 @@ +//go:build windows + +package jobobject + +import ( + "errors" + "fmt" + "unsafe" + + "github.com/Microsoft/hcsshim/internal/winapi" + "golang.org/x/sys/windows" +) + +const ( + memoryLimitMax uint64 = 0xffffffffffffffff +) + +func isFlagSet(flag, controlFlags uint32) bool { + return (flag & controlFlags) == flag +} + +// SetResourceLimits sets resource limits on the job object (cpu, memory, storage). +func (job *JobObject) SetResourceLimits(limits *JobLimits) error { + // Go through and check what limits were specified and apply them to the job. + if limits.MemoryLimitInBytes != 0 { + if err := job.SetMemoryLimit(limits.MemoryLimitInBytes); err != nil { + return fmt.Errorf("failed to set job object memory limit: %w", err) + } + } + + if limits.CPULimit != 0 { + if err := job.SetCPULimit(RateBased, limits.CPULimit); err != nil { + return fmt.Errorf("failed to set job object cpu limit: %w", err) + } + } else if limits.CPUWeight != 0 { + if err := job.SetCPULimit(WeightBased, limits.CPUWeight); err != nil { + return fmt.Errorf("failed to set job object cpu limit: %w", err) + } + } + + if limits.MaxBandwidth != 0 || limits.MaxIOPS != 0 { + if err := job.SetIOLimit(limits.MaxBandwidth, limits.MaxIOPS); err != nil { + return fmt.Errorf("failed to set io limit on job object: %w", err) + } + } + return nil +} + +// SetTerminateOnLastHandleClose sets the job object flag that specifies that the job should terminate +// all processes in the job on the last open handle being closed. +func (job *JobObject) SetTerminateOnLastHandleClose() error { + info, err := job.getExtendedInformation() + if err != nil { + return err + } + info.BasicLimitInformation.LimitFlags |= windows.JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE + return job.setExtendedInformation(info) +} + +// SetMemoryLimit sets the memory limit of the job object based on the given `memoryLimitInBytes`. 
+func (job *JobObject) SetMemoryLimit(memoryLimitInBytes uint64) error { + if memoryLimitInBytes >= memoryLimitMax { + return errors.New("memory limit specified exceeds the max size") + } + + info, err := job.getExtendedInformation() + if err != nil { + return err + } + + info.JobMemoryLimit = uintptr(memoryLimitInBytes) + info.BasicLimitInformation.LimitFlags |= windows.JOB_OBJECT_LIMIT_JOB_MEMORY + return job.setExtendedInformation(info) +} + +// GetMemoryLimit gets the memory limit in bytes of the job object. +func (job *JobObject) GetMemoryLimit() (uint64, error) { + info, err := job.getExtendedInformation() + if err != nil { + return 0, err + } + return uint64(info.JobMemoryLimit), nil +} + +// SetCPULimit sets the CPU limit depending on the specified `CPURateControlType` to +// `rateControlValue` for the job object. +func (job *JobObject) SetCPULimit(rateControlType CPURateControlType, rateControlValue uint32) error { + cpuInfo, err := job.getCPURateControlInformation() + if err != nil { + return err + } + switch rateControlType { + case WeightBased: + if rateControlValue < cpuWeightMin || rateControlValue > cpuWeightMax { + return fmt.Errorf("processor weight value of `%d` is invalid", rateControlValue) + } + cpuInfo.ControlFlags |= winapi.JOB_OBJECT_CPU_RATE_CONTROL_ENABLE | winapi.JOB_OBJECT_CPU_RATE_CONTROL_WEIGHT_BASED + cpuInfo.Value = rateControlValue + case RateBased: + if rateControlValue < cpuLimitMin || rateControlValue > cpuLimitMax { + return fmt.Errorf("processor rate of `%d` is invalid", rateControlValue) + } + cpuInfo.ControlFlags |= winapi.JOB_OBJECT_CPU_RATE_CONTROL_ENABLE | winapi.JOB_OBJECT_CPU_RATE_CONTROL_HARD_CAP + cpuInfo.Value = rateControlValue + default: + return errors.New("invalid job object cpu rate control type") + } + return job.setCPURateControlInfo(cpuInfo) +} + +// GetCPULimit gets the cpu limits for the job object. +// `rateControlType` is used to indicate what type of cpu limit to query for. +func (job *JobObject) GetCPULimit(rateControlType CPURateControlType) (uint32, error) { + info, err := job.getCPURateControlInformation() + if err != nil { + return 0, err + } + + if !isFlagSet(winapi.JOB_OBJECT_CPU_RATE_CONTROL_ENABLE, info.ControlFlags) { + return 0, errors.New("the job does not have cpu rate control enabled") + } + + switch rateControlType { + case WeightBased: + if !isFlagSet(winapi.JOB_OBJECT_CPU_RATE_CONTROL_WEIGHT_BASED, info.ControlFlags) { + return 0, errors.New("cannot get cpu weight for job object without cpu weight option set") + } + case RateBased: + if !isFlagSet(winapi.JOB_OBJECT_CPU_RATE_CONTROL_HARD_CAP, info.ControlFlags) { + return 0, errors.New("cannot get cpu rate hard cap for job object without cpu rate hard cap option set") + } + default: + return 0, errors.New("invalid job object cpu rate control type") + } + return info.Value, nil +} + +// SetCPUAffinity sets the processor affinity for the job object. +// The affinity is passed in as a bitmask. +func (job *JobObject) SetCPUAffinity(affinityBitMask uint64) error { + info, err := job.getExtendedInformation() + if err != nil { + return err + } + info.BasicLimitInformation.LimitFlags |= uint32(windows.JOB_OBJECT_LIMIT_AFFINITY) + info.BasicLimitInformation.Affinity = uintptr(affinityBitMask) + return job.setExtendedInformation(info) +} + +// GetCPUAffinity gets the processor affinity for the job object. +// The returned affinity is a bitmask. 
+func (job *JobObject) GetCPUAffinity() (uint64, error) { + info, err := job.getExtendedInformation() + if err != nil { + return 0, err + } + return uint64(info.BasicLimitInformation.Affinity), nil +} + +// SetIOLimit sets the IO limits specified on the job object. +func (job *JobObject) SetIOLimit(maxBandwidth, maxIOPS int64) error { + ioInfo, err := job.getIOLimit() + if err != nil { + return err + } + ioInfo.ControlFlags |= winapi.JOB_OBJECT_IO_RATE_CONTROL_ENABLE + if maxBandwidth != 0 { + ioInfo.MaxBandwidth = maxBandwidth + } + if maxIOPS != 0 { + ioInfo.MaxIops = maxIOPS + } + return job.setIORateControlInfo(ioInfo) +} + +// GetIOMaxBandwidthLimit gets the max bandwidth for the job object. +func (job *JobObject) GetIOMaxBandwidthLimit() (int64, error) { + info, err := job.getIOLimit() + if err != nil { + return 0, err + } + return info.MaxBandwidth, nil +} + +// GetIOMaxIopsLimit gets the max iops for the job object. +func (job *JobObject) GetIOMaxIopsLimit() (int64, error) { + info, err := job.getIOLimit() + if err != nil { + return 0, err + } + return info.MaxIops, nil +} + +// Helper function for getting a job object's extended information. +func (job *JobObject) getExtendedInformation() (*windows.JOBOBJECT_EXTENDED_LIMIT_INFORMATION, error) { + job.handleLock.RLock() + defer job.handleLock.RUnlock() + + if job.handle == 0 { + return nil, ErrAlreadyClosed + } + + info := windows.JOBOBJECT_EXTENDED_LIMIT_INFORMATION{} + if err := winapi.QueryInformationJobObject( + job.handle, + windows.JobObjectExtendedLimitInformation, + uintptr(unsafe.Pointer(&info)), + uint32(unsafe.Sizeof(info)), + nil, + ); err != nil { + return nil, fmt.Errorf("query %v returned error: %w", info, err) + } + return &info, nil +} + +// Helper function for getting a job object's CPU rate control information. +func (job *JobObject) getCPURateControlInformation() (*winapi.JOBOBJECT_CPU_RATE_CONTROL_INFORMATION, error) { + job.handleLock.RLock() + defer job.handleLock.RUnlock() + + if job.handle == 0 { + return nil, ErrAlreadyClosed + } + + info := winapi.JOBOBJECT_CPU_RATE_CONTROL_INFORMATION{} + if err := winapi.QueryInformationJobObject( + job.handle, + windows.JobObjectCpuRateControlInformation, + uintptr(unsafe.Pointer(&info)), + uint32(unsafe.Sizeof(info)), + nil, + ); err != nil { + return nil, fmt.Errorf("query %v returned error: %w", info, err) + } + return &info, nil +} + +// Helper function for setting a job object's extended information. +func (job *JobObject) setExtendedInformation(info *windows.JOBOBJECT_EXTENDED_LIMIT_INFORMATION) error { + job.handleLock.RLock() + defer job.handleLock.RUnlock() + + if job.handle == 0 { + return ErrAlreadyClosed + } + + if _, err := windows.SetInformationJobObject( + job.handle, + windows.JobObjectExtendedLimitInformation, + uintptr(unsafe.Pointer(info)), + uint32(unsafe.Sizeof(*info)), + ); err != nil { + return fmt.Errorf("failed to set Extended info %v on job object: %w", info, err) + } + return nil +} + +// Helper function for querying job handle for IO limit information. 
+func (job *JobObject) getIOLimit() (*winapi.JOBOBJECT_IO_RATE_CONTROL_INFORMATION, error) { + job.handleLock.RLock() + defer job.handleLock.RUnlock() + + if job.handle == 0 { + return nil, ErrAlreadyClosed + } + + ioInfo := &winapi.JOBOBJECT_IO_RATE_CONTROL_INFORMATION{} + var blockCount uint32 = 1 + + if _, err := winapi.QueryIoRateControlInformationJobObject( + job.handle, + nil, + &ioInfo, + &blockCount, + ); err != nil { + return nil, fmt.Errorf("query %v returned error: %w", ioInfo, err) + } + + if !isFlagSet(winapi.JOB_OBJECT_IO_RATE_CONTROL_ENABLE, ioInfo.ControlFlags) { + return nil, fmt.Errorf("query %v cannot get IO limits for job object without IO rate control option set", ioInfo) + } + return ioInfo, nil +} + +// Helper function for setting a job object's IO rate control information. +func (job *JobObject) setIORateControlInfo(ioInfo *winapi.JOBOBJECT_IO_RATE_CONTROL_INFORMATION) error { + job.handleLock.RLock() + defer job.handleLock.RUnlock() + + if job.handle == 0 { + return ErrAlreadyClosed + } + + if _, err := winapi.SetIoRateControlInformationJobObject(job.handle, ioInfo); err != nil { + return fmt.Errorf("failed to set IO limit info %v on job object: %w", ioInfo, err) + } + return nil +} + +// Helper function for setting a job object's CPU rate control information. +func (job *JobObject) setCPURateControlInfo(cpuInfo *winapi.JOBOBJECT_CPU_RATE_CONTROL_INFORMATION) error { + job.handleLock.RLock() + defer job.handleLock.RUnlock() + + if job.handle == 0 { + return ErrAlreadyClosed + } + if _, err := windows.SetInformationJobObject( + job.handle, + windows.JobObjectCpuRateControlInformation, + uintptr(unsafe.Pointer(cpuInfo)), + uint32(unsafe.Sizeof(cpuInfo)), + ); err != nil { + return fmt.Errorf("failed to set cpu limit info %v on job object: %w", cpuInfo, err) + } + return nil +} diff --git a/test/vendor/github.com/Microsoft/hcsshim/internal/queue/mq.go b/test/vendor/github.com/Microsoft/hcsshim/internal/queue/mq.go new file mode 100644 index 0000000000..e177c9a629 --- /dev/null +++ b/test/vendor/github.com/Microsoft/hcsshim/internal/queue/mq.go @@ -0,0 +1,111 @@ +package queue + +import ( + "errors" + "sync" +) + +var ( + ErrQueueClosed = errors.New("the queue is closed for reading and writing") + ErrQueueEmpty = errors.New("the queue is empty") +) + +// MessageQueue represents a threadsafe message queue to be used to retrieve or +// write messages to. +type MessageQueue struct { + m *sync.RWMutex + c *sync.Cond + messages []interface{} + closed bool +} + +// NewMessageQueue returns a new MessageQueue. +func NewMessageQueue() *MessageQueue { + m := &sync.RWMutex{} + return &MessageQueue{ + m: m, + c: sync.NewCond(m), + messages: []interface{}{}, + } +} + +// Write writes `msg` to the queue. +func (mq *MessageQueue) Write(msg interface{}) error { + mq.m.Lock() + defer mq.m.Unlock() + + if mq.closed { + return ErrQueueClosed + } + mq.messages = append(mq.messages, msg) + // Signal a waiter that there is now a value available in the queue. + mq.c.Signal() + return nil +} + +// Read will read a value from the queue if available, otherwise return an error. +func (mq *MessageQueue) Read() (interface{}, error) { + mq.m.Lock() + defer mq.m.Unlock() + if mq.closed { + return nil, ErrQueueClosed + } + if mq.isEmpty() { + return nil, ErrQueueEmpty + } + val := mq.messages[0] + mq.messages[0] = nil + mq.messages = mq.messages[1:] + return val, nil +} + +// ReadOrWait will read a value from the queue if available, else it will wait for a +// value to become available. 
+// This blocks until a value is written or the queue is closed.
+func (mq *MessageQueue) ReadOrWait() (interface{}, error) {
+	mq.m.Lock()
+	defer mq.m.Unlock()
+	// Wait until a value is available or the queue is closed. Popping the value
+	// while still holding the lock avoids racing another reader for the message
+	// that woke us up.
+	for !mq.closed && mq.isEmpty() {
+		mq.c.Wait()
+	}
+	if mq.closed {
+		return nil, ErrQueueClosed
+	}
+	val := mq.messages[0]
+	mq.messages[0] = nil
+	mq.messages = mq.messages[1:]
+	return val, nil
+}
+
+// IsEmpty returns whether the queue is empty.
+func (mq *MessageQueue) IsEmpty() bool {
+	mq.m.RLock()
+	defer mq.m.RUnlock()
+	return len(mq.messages) == 0
+}
+
+// Non-exported empty check that doesn't lock, so it can be called from Read and Write.
+func (mq *MessageQueue) isEmpty() bool {
+	return len(mq.messages) == 0
+}
+
+// Close closes the queue for future writes or reads. Any attempts to read or write from the
+// queue after close will return ErrQueueClosed. This is safe to call multiple times.
+func (mq *MessageQueue) Close() {
+	mq.m.Lock()
+	defer mq.m.Unlock()
+	// Already closed
+	if mq.closed {
+		return
+	}
+	mq.messages = nil
+	mq.closed = true
+	// If there's anybody currently waiting on a value from ReadOrWait, we need to
+	// broadcast so the read(s) can return ErrQueueClosed.
+	mq.c.Broadcast()
+}
diff --git a/test/vendor/github.com/Microsoft/hcsshim/internal/winapi/jobobject.go b/test/vendor/github.com/Microsoft/hcsshim/internal/winapi/jobobject.go
index ecf7c1c2e4..e770a570e6 100644
--- a/test/vendor/github.com/Microsoft/hcsshim/internal/winapi/jobobject.go
+++ b/test/vendor/github.com/Microsoft/hcsshim/internal/winapi/jobobject.go
@@ -26,7 +26,10 @@ const (
 // Access rights for creating or opening job objects.
 //
 // https://docs.microsoft.com/en-us/windows/win32/procthread/job-object-security-and-access-rights
-const JOB_OBJECT_ALL_ACCESS = 0x1F001F
+const (
+	JOB_OBJECT_QUERY      = 0x0004
+	JOB_OBJECT_ALL_ACCESS = 0x1F001F
+)
 
 // IO limit flags
 //
diff --git a/test/vendor/github.com/Microsoft/hcsshim/internal/winapi/process.go b/test/vendor/github.com/Microsoft/hcsshim/internal/winapi/process.go
index 37839435b9..5f9e03fd28 100644
--- a/test/vendor/github.com/Microsoft/hcsshim/internal/winapi/process.go
+++ b/test/vendor/github.com/Microsoft/hcsshim/internal/winapi/process.go
@@ -6,3 +6,60 @@ const (
 	PROC_THREAD_ATTRIBUTE_PSEUDOCONSOLE = 0x20016
 	PROC_THREAD_ATTRIBUTE_JOB_LIST      = 0x2000D
 )
+
+// ProcessVmCounters corresponds to the _VM_COUNTERS_EX and _VM_COUNTERS_EX2 structures.
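+//
+// ProcessVmCounters is the PROCESSINFOCLASS value handed to
+// NtQueryInformationProcess below. A minimal usage sketch, mirroring how the
+// jobobject package in this series consumes it (error handling elided):
+//
+//	var vm VM_COUNTERS_EX2
+//	status := NtQueryInformationProcess(h, ProcessVmCounters,
+//		uintptr(unsafe.Pointer(&vm)), uint32(unsafe.Sizeof(vm)), nil)
+//	if !NTSuccess(status) {
+//		return 0, RtlNtStatusToDosError(status)
+//	}
+//	privateWorkingSet := uint64(vm.PrivateWorkingSetSize)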
+const ProcessVmCounters = 3 + +// __kernel_entry NTSTATUS NtQueryInformationProcess( +// [in] HANDLE ProcessHandle, +// [in] PROCESSINFOCLASS ProcessInformationClass, +// [out] PVOID ProcessInformation, +// [in] ULONG ProcessInformationLength, +// [out, optional] PULONG ReturnLength +// ); +// +//sys NtQueryInformationProcess(processHandle windows.Handle, processInfoClass uint32, processInfo uintptr, processInfoLength uint32, returnLength *uint32) (status uint32) = ntdll.NtQueryInformationProcess + +// typedef struct _VM_COUNTERS_EX +// { +// SIZE_T PeakVirtualSize; +// SIZE_T VirtualSize; +// ULONG PageFaultCount; +// SIZE_T PeakWorkingSetSize; +// SIZE_T WorkingSetSize; +// SIZE_T QuotaPeakPagedPoolUsage; +// SIZE_T QuotaPagedPoolUsage; +// SIZE_T QuotaPeakNonPagedPoolUsage; +// SIZE_T QuotaNonPagedPoolUsage; +// SIZE_T PagefileUsage; +// SIZE_T PeakPagefileUsage; +// SIZE_T PrivateUsage; +// } VM_COUNTERS_EX, *PVM_COUNTERS_EX; +// +type VM_COUNTERS_EX struct { + PeakVirtualSize uintptr + VirtualSize uintptr + PageFaultCount uint32 + PeakWorkingSetSize uintptr + WorkingSetSize uintptr + QuotaPeakPagedPoolUsage uintptr + QuotaPagedPoolUsage uintptr + QuotaPeakNonPagedPoolUsage uintptr + QuotaNonPagedPoolUsage uintptr + PagefileUsage uintptr + PeakPagefileUsage uintptr + PrivateUsage uintptr +} + +// typedef struct _VM_COUNTERS_EX2 +// { +// VM_COUNTERS_EX CountersEx; +// SIZE_T PrivateWorkingSetSize; +// SIZE_T SharedCommitUsage; +// } VM_COUNTERS_EX2, *PVM_COUNTERS_EX2; +// +type VM_COUNTERS_EX2 struct { + CountersEx VM_COUNTERS_EX + PrivateWorkingSetSize uintptr + SharedCommitUsage uintptr +} diff --git a/test/vendor/github.com/Microsoft/hcsshim/internal/winapi/zsyscall_windows.go b/test/vendor/github.com/Microsoft/hcsshim/internal/winapi/zsyscall_windows.go index 935219b063..7a1c241389 100644 --- a/test/vendor/github.com/Microsoft/hcsshim/internal/winapi/zsyscall_windows.go +++ b/test/vendor/github.com/Microsoft/hcsshim/internal/winapi/zsyscall_windows.go @@ -67,6 +67,7 @@ var ( procLogonUserW = modadvapi32.NewProc("LogonUserW") procLocalAlloc = modkernel32.NewProc("LocalAlloc") procLocalFree = modkernel32.NewProc("LocalFree") + procNtQueryInformationProcess = modntdll.NewProc("NtQueryInformationProcess") procGetActiveProcessorCount = modkernel32.NewProc("GetActiveProcessorCount") procCM_Get_Device_ID_List_SizeA = modcfgmgr32.NewProc("CM_Get_Device_ID_List_SizeA") procCM_Get_Device_ID_ListA = modcfgmgr32.NewProc("CM_Get_Device_ID_ListA") @@ -296,6 +297,12 @@ func LocalFree(ptr uintptr) { return } +func NtQueryInformationProcess(processHandle windows.Handle, processInfoClass uint32, processInfo uintptr, processInfoLength uint32, returnLength *uint32) (status uint32) { + r0, _, _ := syscall.Syscall6(procNtQueryInformationProcess.Addr(), 5, uintptr(processHandle), uintptr(processInfoClass), uintptr(processInfo), uintptr(processInfoLength), uintptr(unsafe.Pointer(returnLength)), 0) + status = uint32(r0) + return +} + func GetActiveProcessorCount(groupNumber uint16) (amount uint32) { r0, _, _ := syscall.Syscall(procGetActiveProcessorCount.Addr(), 1, uintptr(groupNumber), 0, 0) amount = uint32(r0) diff --git a/test/vendor/modules.txt b/test/vendor/modules.txt index b0d29873d5..c8556db3f2 100644 --- a/test/vendor/modules.txt +++ b/test/vendor/modules.txt @@ -39,6 +39,7 @@ github.com/Microsoft/hcsshim/internal/hcsoci github.com/Microsoft/hcsshim/internal/hns github.com/Microsoft/hcsshim/internal/hooks github.com/Microsoft/hcsshim/internal/interop 
+github.com/Microsoft/hcsshim/internal/jobobject
 github.com/Microsoft/hcsshim/internal/layers
 github.com/Microsoft/hcsshim/internal/lcow
 github.com/Microsoft/hcsshim/internal/log
@@ -54,6 +55,7 @@ github.com/Microsoft/hcsshim/internal/ospath
 github.com/Microsoft/hcsshim/internal/processorinfo
 github.com/Microsoft/hcsshim/internal/protocol/guestrequest
 github.com/Microsoft/hcsshim/internal/protocol/guestresource
+github.com/Microsoft/hcsshim/internal/queue
 github.com/Microsoft/hcsshim/internal/regstate
 github.com/Microsoft/hcsshim/internal/resources
 github.com/Microsoft/hcsshim/internal/runhcs

From c8451c2622d5bb8389db1a23cce75a72865f3cb9 Mon Sep 17 00:00:00 2001
From: Daniel Canter
Date: Wed, 20 Apr 2022 08:53:29 -0700
Subject: [PATCH 02/12] PR Feedback

* Explicitly use nanosecond units for timestamps instead of relying on
  time.Duration being nanoseconds under the hood.
* Add RLock locking back to the QueryStorageStats method.
* Get rid of the SYSTEM check and just bail if the job object open fails.

Signed-off-by: Daniel Canter
---
 internal/hcs/system.go                        | 20 +++++++-------------
 internal/jobcontainers/jobcontainer.go        |  7 +++++--
 internal/jobobject/jobobject.go               | 15 +++++++++++---
 .../Microsoft/hcsshim/internal/hcs/system.go  | 20 +++++++-------------
 .../hcsshim/internal/jobobject/jobobject.go   | 15 +++++++++++---
 5 files changed, 43 insertions(+), 34 deletions(-)

diff --git a/internal/hcs/system.go b/internal/hcs/system.go
index 79713756e4..de0d6ebdd1 100644
--- a/internal/hcs/system.go
+++ b/internal/hcs/system.go
@@ -21,7 +21,6 @@ import (
 	"github.com/Microsoft/hcsshim/internal/timeout"
 	"github.com/Microsoft/hcsshim/internal/vmcompute"
 	"go.opencensus.io/trace"
-	"golang.org/x/sys/windows"
 )
 
 type System struct {
@@ -339,16 +338,11 @@ func (computeSystem *System) Properties(ctx context.Context, types ...schema1.Pr
 // statisticsInProc emulates what HCS does to grab statistics for a given container with a small
 // change to make grabbing the private working set total much more efficient.
 func (computeSystem *System) statisticsInProc(ctx context.Context) (*hcsschema.Properties, error) {
-	// See if we'd even be able to open the silo. We'll need to be local
-	// system/system so check first.
-	usr, err := windows.GetCurrentProcessToken().GetTokenUser()
-	if err != nil {
-		return nil, err
-	}
-	if !usr.User.Sid.IsWellKnown(windows.WinLocalSystemSid) {
-		return nil, errors.New("process does not have the right permissions to open the silo")
-	}
-
+	// Start timestamp for these stats before we grab them to match HCS
+	timestamp := time.Now()
+	// In the future we can make use of some new functionality in the HCS that allows you
+	// to pass a job object for HCS to use for the container. Currently, the only way we'll
+	// be able to open the job/silo is if we're running as SYSTEM.
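+	// A rough sketch of the in-proc flow below, using the jobobject helpers this
+	// series relies on (Open is assumed to resolve the silo by name, and the
+	// caller must be SYSTEM as noted above):
+	//
+	//	job, err := jobobject.Open(ctx, jobOptions)
+	//	defer job.Close()
+	//	memInfo, err := job.QueryMemoryStats()
+	//	processorInfo, err := job.QueryProcessorStats()
+	//	storageInfo, err := job.QueryStorageStats()
+	//	privateWorkingSet, err := job.PrivateWorkingSet()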
jobOptions := &jobobject.Options{ UseNTVariant: true, Name: siloNameFmt(computeSystem.id), @@ -381,9 +375,9 @@ func (computeSystem *System) statisticsInProc(ctx context.Context) (*hcsschema.P return &hcsschema.Properties{ Statistics: &hcsschema.Statistics{ - Timestamp: time.Now(), + Timestamp: timestamp, ContainerStartTime: computeSystem.startTime, - Uptime100ns: uint64(time.Since(computeSystem.startTime)) / 100, + Uptime100ns: uint64(time.Since(computeSystem.startTime).Nanoseconds()) / 100, Memory: &hcsschema.MemoryStats{ MemoryUsageCommitBytes: memInfo.JobMemory, MemoryUsageCommitPeakBytes: memInfo.PeakJobMemoryUsed, diff --git a/internal/jobcontainers/jobcontainer.go b/internal/jobcontainers/jobcontainer.go index 997c680e3a..83d57a1c49 100644 --- a/internal/jobcontainers/jobcontainer.go +++ b/internal/jobcontainers/jobcontainer.go @@ -420,6 +420,9 @@ func (c *JobContainer) PropertiesV2(ctx context.Context, types ...hcsschema.Prop return nil, errors.New("PTStatistics is the only supported property type for job containers") } + // Start timestamp before we grab the stats to match HCS' behavior + timestamp := time.Now() + memInfo, err := c.job.QueryMemoryStats() if err != nil { return nil, errors.Wrap(err, "failed to query for job containers memory information") @@ -442,8 +445,8 @@ func (c *JobContainer) PropertiesV2(ctx context.Context, types ...hcsschema.Prop return &hcsschema.Properties{ Statistics: &hcsschema.Statistics{ - Timestamp: time.Now(), - Uptime100ns: uint64(time.Since(c.startTimestamp)) / 100, + Timestamp: timestamp, + Uptime100ns: uint64(time.Since(c.startTimestamp).Nanoseconds()) / 100, ContainerStartTime: c.startTimestamp, Memory: &hcsschema.MemoryStats{ MemoryUsageCommitBytes: memInfo.JobMemory, diff --git a/internal/jobobject/jobobject.go b/internal/jobobject/jobobject.go index 6b8ada3c5d..f394e1fb71 100644 --- a/internal/jobobject/jobobject.go +++ b/internal/jobobject/jobobject.go @@ -431,6 +431,13 @@ func (job *JobObject) QueryProcessorStats() (*winapi.JOBOBJECT_BASIC_ACCOUNTING_ // QueryStorageStats gets the storage (I/O) stats for the job object. func (job *JobObject) QueryStorageStats() (*winapi.JOBOBJECT_IO_ATTRIBUTION_INFORMATION, error) { + job.handleLock.RLock() + defer job.handleLock.RUnlock() + + if job.handle == 0 { + return nil, ErrAlreadyClosed + } + info := winapi.JOBOBJECT_IO_ATTRIBUTION_INFORMATION{ ControlFlags: winapi.JOBOBJECT_IO_ATTRIBUTION_CONTROL_ENABLE, } @@ -550,10 +557,12 @@ func (job *JobObject) PrivateWorkingSet() (uint64, error) { return 0, err } - openAndQuery := func(pid uint32) (uint64, error) { + openAndQueryWorkingSet := func(pid uint32) (uint64, error) { h, err := windows.OpenProcess(windows.PROCESS_QUERY_LIMITED_INFORMATION, false, pid) if err != nil { - return 0, fmt.Errorf("failed to open process with pid %d: %w", pid, err) + // Match HCS' behavior, just bail if if OpenProcess doesn't return a valid handle (fails). + // Could be to handle the case where one of the pids in the job exited before we open here. 
+ return 0, nil } defer func() { _ = windows.Close(h) @@ -575,7 +584,7 @@ func (job *JobObject) PrivateWorkingSet() (uint64, error) { var jobWorkingSetSize uint64 for _, pid := range pids { - workingSet, err := openAndQuery(pid) + workingSet, err := openAndQueryWorkingSet(pid) if err != nil { return 0, err } diff --git a/test/vendor/github.com/Microsoft/hcsshim/internal/hcs/system.go b/test/vendor/github.com/Microsoft/hcsshim/internal/hcs/system.go index 79713756e4..de0d6ebdd1 100644 --- a/test/vendor/github.com/Microsoft/hcsshim/internal/hcs/system.go +++ b/test/vendor/github.com/Microsoft/hcsshim/internal/hcs/system.go @@ -21,7 +21,6 @@ import ( "github.com/Microsoft/hcsshim/internal/timeout" "github.com/Microsoft/hcsshim/internal/vmcompute" "go.opencensus.io/trace" - "golang.org/x/sys/windows" ) type System struct { @@ -339,16 +338,11 @@ func (computeSystem *System) Properties(ctx context.Context, types ...schema1.Pr // statisticsInProc emulates what HCS does to grab statistics for a given container with a small // change to make grabbing the private working set total much more efficient. func (computeSystem *System) statisticsInProc(ctx context.Context) (*hcsschema.Properties, error) { - // See if we'd even be able to open the silo. We'll need to be local - // system/system so check first. - usr, err := windows.GetCurrentProcessToken().GetTokenUser() - if err != nil { - return nil, err - } - if !usr.User.Sid.IsWellKnown(windows.WinLocalSystemSid) { - return nil, errors.New("process does not have the right permissions to open the silo") - } - + // Start timestamp for these stats before we grab them to match HCS + timestamp := time.Now() + // In the future we can make use of some new functionality in the HCS that allows you + // to pass a job object for HCS to use for the container. Currently, the only way we'll + // be able to open the job/silo is if we're running as SYSTEM. jobOptions := &jobobject.Options{ UseNTVariant: true, Name: siloNameFmt(computeSystem.id), @@ -381,9 +375,9 @@ func (computeSystem *System) statisticsInProc(ctx context.Context) (*hcsschema.P return &hcsschema.Properties{ Statistics: &hcsschema.Statistics{ - Timestamp: time.Now(), + Timestamp: timestamp, ContainerStartTime: computeSystem.startTime, - Uptime100ns: uint64(time.Since(computeSystem.startTime)) / 100, + Uptime100ns: uint64(time.Since(computeSystem.startTime).Nanoseconds()) / 100, Memory: &hcsschema.MemoryStats{ MemoryUsageCommitBytes: memInfo.JobMemory, MemoryUsageCommitPeakBytes: memInfo.PeakJobMemoryUsed, diff --git a/test/vendor/github.com/Microsoft/hcsshim/internal/jobobject/jobobject.go b/test/vendor/github.com/Microsoft/hcsshim/internal/jobobject/jobobject.go index 6b8ada3c5d..f394e1fb71 100644 --- a/test/vendor/github.com/Microsoft/hcsshim/internal/jobobject/jobobject.go +++ b/test/vendor/github.com/Microsoft/hcsshim/internal/jobobject/jobobject.go @@ -431,6 +431,13 @@ func (job *JobObject) QueryProcessorStats() (*winapi.JOBOBJECT_BASIC_ACCOUNTING_ // QueryStorageStats gets the storage (I/O) stats for the job object. 
func (job *JobObject) QueryStorageStats() (*winapi.JOBOBJECT_IO_ATTRIBUTION_INFORMATION, error) { + job.handleLock.RLock() + defer job.handleLock.RUnlock() + + if job.handle == 0 { + return nil, ErrAlreadyClosed + } + info := winapi.JOBOBJECT_IO_ATTRIBUTION_INFORMATION{ ControlFlags: winapi.JOBOBJECT_IO_ATTRIBUTION_CONTROL_ENABLE, } @@ -550,10 +557,12 @@ func (job *JobObject) PrivateWorkingSet() (uint64, error) { return 0, err } - openAndQuery := func(pid uint32) (uint64, error) { + openAndQueryWorkingSet := func(pid uint32) (uint64, error) { h, err := windows.OpenProcess(windows.PROCESS_QUERY_LIMITED_INFORMATION, false, pid) if err != nil { - return 0, fmt.Errorf("failed to open process with pid %d: %w", pid, err) + // Match HCS' behavior, just bail if if OpenProcess doesn't return a valid handle (fails). + // Could be to handle the case where one of the pids in the job exited before we open here. + return 0, nil } defer func() { _ = windows.Close(h) @@ -575,7 +584,7 @@ func (job *JobObject) PrivateWorkingSet() (uint64, error) { var jobWorkingSetSize uint64 for _, pid := range pids { - workingSet, err := openAndQuery(pid) + workingSet, err := openAndQueryWorkingSet(pid) if err != nil { return 0, err } From 9a05474cf27a610703d170acd84c5a57a43574e7 Mon Sep 17 00:00:00 2001 From: Daniel Canter Date: Thu, 21 Apr 2022 16:29:05 -0700 Subject: [PATCH 03/12] PR Feedback Add check to make sure we don't count a processes working set if its pid was re-used. Signed-off-by: Daniel Canter --- internal/jobobject/jobobject.go | 24 +++++++++++++++++-- .../hcsshim/internal/jobobject/jobobject.go | 24 +++++++++++++++++-- 2 files changed, 44 insertions(+), 4 deletions(-) diff --git a/internal/jobobject/jobobject.go b/internal/jobobject/jobobject.go index f394e1fb71..70d6f4cd03 100644 --- a/internal/jobobject/jobobject.go +++ b/internal/jobobject/jobobject.go @@ -10,6 +10,7 @@ import ( "path/filepath" "sync" "sync/atomic" + "time" "unsafe" "github.com/Microsoft/hcsshim/internal/queue" @@ -552,6 +553,14 @@ func (job *JobObject) isSilo() bool { // PrivateWorkingSet returns the private working set size for the job. This is calculated by adding up the // private working set for every process running in the job. func (job *JobObject) PrivateWorkingSet() (uint64, error) { + // Grab a timestamp before we call pids. This will be used below to handle a rare case where a pid + // in the job could exit and get re-used for another process and we end up reporting its + // memory info instead. We'll compare the creation time for the process we open against this + // timestamp to see if it was created after; if it is, we'll ignore the working set reading and + // move on to the next process. This isn't perfect, and the timestamp could be before or after + // we call pids, both with drawbacks, but this should be rare enough that missing a processes + // reading or missing a newly started process should be acceptable. + t := time.Now() pids, err := job.Pids() if err != nil { return 0, err @@ -560,14 +569,25 @@ func (job *JobObject) PrivateWorkingSet() (uint64, error) { openAndQueryWorkingSet := func(pid uint32) (uint64, error) { h, err := windows.OpenProcess(windows.PROCESS_QUERY_LIMITED_INFORMATION, false, pid) if err != nil { - // Match HCS' behavior, just bail if if OpenProcess doesn't return a valid handle (fails). - // Could be to handle the case where one of the pids in the job exited before we open here. + // Bail if OpenProcess doesn't return a valid handle (fails). 
Handles a + // case where one of the pids in the job exited before we open. return 0, nil } defer func() { _ = windows.Close(h) }() + var createTime, exitTime, kernelTime, userTime windows.Filetime + if err := windows.GetProcessTimes(h, &createTime, &exitTime, &kernelTime, &userTime); err != nil { + return 0, fmt.Errorf("failed to get creation time for pid %d: %w", pid, err) + } + + // If the process was created after our timestamp we took before Pids(), don't report the total. Pid + // may have been re-used for another process. + if createTime.Nanoseconds() > t.UnixNano() { + return 0, nil + } + var vmCounters winapi.VM_COUNTERS_EX2 status := winapi.NtQueryInformationProcess( h, diff --git a/test/vendor/github.com/Microsoft/hcsshim/internal/jobobject/jobobject.go b/test/vendor/github.com/Microsoft/hcsshim/internal/jobobject/jobobject.go index f394e1fb71..70d6f4cd03 100644 --- a/test/vendor/github.com/Microsoft/hcsshim/internal/jobobject/jobobject.go +++ b/test/vendor/github.com/Microsoft/hcsshim/internal/jobobject/jobobject.go @@ -10,6 +10,7 @@ import ( "path/filepath" "sync" "sync/atomic" + "time" "unsafe" "github.com/Microsoft/hcsshim/internal/queue" @@ -552,6 +553,14 @@ func (job *JobObject) isSilo() bool { // PrivateWorkingSet returns the private working set size for the job. This is calculated by adding up the // private working set for every process running in the job. func (job *JobObject) PrivateWorkingSet() (uint64, error) { + // Grab a timestamp before we call pids. This will be used below to handle a rare case where a pid + // in the job could exit and get re-used for another process and we end up reporting its + // memory info instead. We'll compare the creation time for the process we open against this + // timestamp to see if it was created after; if it is, we'll ignore the working set reading and + // move on to the next process. This isn't perfect, and the timestamp could be before or after + // we call pids, both with drawbacks, but this should be rare enough that missing a processes + // reading or missing a newly started process should be acceptable. + t := time.Now() pids, err := job.Pids() if err != nil { return 0, err @@ -560,14 +569,25 @@ func (job *JobObject) PrivateWorkingSet() (uint64, error) { openAndQueryWorkingSet := func(pid uint32) (uint64, error) { h, err := windows.OpenProcess(windows.PROCESS_QUERY_LIMITED_INFORMATION, false, pid) if err != nil { - // Match HCS' behavior, just bail if if OpenProcess doesn't return a valid handle (fails). - // Could be to handle the case where one of the pids in the job exited before we open here. + // Bail if OpenProcess doesn't return a valid handle (fails). Handles a + // case where one of the pids in the job exited before we open. return 0, nil } defer func() { _ = windows.Close(h) }() + var createTime, exitTime, kernelTime, userTime windows.Filetime + if err := windows.GetProcessTimes(h, &createTime, &exitTime, &kernelTime, &userTime); err != nil { + return 0, fmt.Errorf("failed to get creation time for pid %d: %w", pid, err) + } + + // If the process was created after our timestamp we took before Pids(), don't report the total. Pid + // may have been re-used for another process. 
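+		// Note that Filetime.Nanoseconds() converts the Windows FILETIME epoch
+		// (1601) to nanoseconds since the Unix epoch, so both sides of the
+		// comparison below share a time base.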
+ if createTime.Nanoseconds() > t.UnixNano() { + return 0, nil + } + var vmCounters winapi.VM_COUNTERS_EX2 status := winapi.NtQueryInformationProcess( h, From 5c485cd5bc8ca91b9f4c25d9e05a2ffedf24c62b Mon Sep 17 00:00:00 2001 From: Daniel Canter Date: Fri, 29 Apr 2022 16:22:04 -0700 Subject: [PATCH 04/12] PR Feedback Get rid of timestamp approach in favor of just calling IsProcessInJob after opening the process handle. Signed-off-by: Daniel Canter --- internal/jobobject/jobobject.go | 57 ++++++++----------- internal/winapi/jobobject.go | 2 +- internal/winapi/zsyscall_windows.go | 2 +- .../hcsshim/internal/jobobject/jobobject.go | 57 ++++++++----------- .../hcsshim/internal/winapi/jobobject.go | 2 +- .../internal/winapi/zsyscall_windows.go | 2 +- 6 files changed, 50 insertions(+), 72 deletions(-) diff --git a/internal/jobobject/jobobject.go b/internal/jobobject/jobobject.go index 70d6f4cd03..36d6e993a4 100644 --- a/internal/jobobject/jobobject.go +++ b/internal/jobobject/jobobject.go @@ -10,7 +10,6 @@ import ( "path/filepath" "sync" "sync/atomic" - "time" "unsafe" "github.com/Microsoft/hcsshim/internal/queue" @@ -553,41 +552,12 @@ func (job *JobObject) isSilo() bool { // PrivateWorkingSet returns the private working set size for the job. This is calculated by adding up the // private working set for every process running in the job. func (job *JobObject) PrivateWorkingSet() (uint64, error) { - // Grab a timestamp before we call pids. This will be used below to handle a rare case where a pid - // in the job could exit and get re-used for another process and we end up reporting its - // memory info instead. We'll compare the creation time for the process we open against this - // timestamp to see if it was created after; if it is, we'll ignore the working set reading and - // move on to the next process. This isn't perfect, and the timestamp could be before or after - // we call pids, both with drawbacks, but this should be rare enough that missing a processes - // reading or missing a newly started process should be acceptable. - t := time.Now() pids, err := job.Pids() if err != nil { return 0, err } - openAndQueryWorkingSet := func(pid uint32) (uint64, error) { - h, err := windows.OpenProcess(windows.PROCESS_QUERY_LIMITED_INFORMATION, false, pid) - if err != nil { - // Bail if OpenProcess doesn't return a valid handle (fails). Handles a - // case where one of the pids in the job exited before we open. - return 0, nil - } - defer func() { - _ = windows.Close(h) - }() - - var createTime, exitTime, kernelTime, userTime windows.Filetime - if err := windows.GetProcessTimes(h, &createTime, &exitTime, &kernelTime, &userTime); err != nil { - return 0, fmt.Errorf("failed to get creation time for pid %d: %w", pid, err) - } - - // If the process was created after our timestamp we took before Pids(), don't report the total. Pid - // may have been re-used for another process. 
- if createTime.Nanoseconds() > t.UnixNano() { - return 0, nil - } - + queryProcessWorkingSet := func(h windows.Handle) (uint64, error) { var vmCounters winapi.VM_COUNTERS_EX2 status := winapi.NtQueryInformationProcess( h, @@ -597,18 +567,37 @@ func (job *JobObject) PrivateWorkingSet() (uint64, error) { nil, ) if !winapi.NTSuccess(status) { - return 0, fmt.Errorf("failed to query information for process with pid %d: %w", pid, winapi.RtlNtStatusToDosError(status)) + return 0, fmt.Errorf("failed to query information for process: %w", winapi.RtlNtStatusToDosError(status)) } return uint64(vmCounters.PrivateWorkingSetSize), nil } var jobWorkingSetSize uint64 for _, pid := range pids { - workingSet, err := openAndQueryWorkingSet(pid) + h, err := windows.OpenProcess(windows.PROCESS_QUERY_LIMITED_INFORMATION, false, pid) if err != nil { + // Continue to the next if OpenProcess doesn't return a valid handle (fails). Handles a + // case where one of the pids in the job exited before we open. + continue + } + // Check if the process is actually running in the job still. There's a small chance + // that the process could have exited and had its pid re-used between grabbing the pids + // in the job and opening the handle to it above. + var inJob int32 + if err := winapi.IsProcessInJob(h, job.handle, &inJob); err != nil { + windows.Close(h) + // This shouldn't fail unless we have incorrect access rights which we control + // here so probably best to error out if this failed. return 0, err } - jobWorkingSetSize += workingSet + if inJob == 1 { + workingSet, err := queryProcessWorkingSet(h) + if err != nil { + return 0, err + } + jobWorkingSetSize += workingSet + } } + return jobWorkingSetSize, nil } diff --git a/internal/winapi/jobobject.go b/internal/winapi/jobobject.go index e770a570e6..4ada2f53cc 100644 --- a/internal/winapi/jobobject.go +++ b/internal/winapi/jobobject.go @@ -168,7 +168,7 @@ type JOBOBJECT_ASSOCIATE_COMPLETION_PORT struct { // PBOOL Result // ); // -//sys IsProcessInJob(procHandle windows.Handle, jobHandle windows.Handle, result *bool) (err error) = kernel32.IsProcessInJob +//sys IsProcessInJob(procHandle windows.Handle, jobHandle windows.Handle, result *int32) (err error) = kernel32.IsProcessInJob // BOOL QueryInformationJobObject( // HANDLE hJob, diff --git a/internal/winapi/zsyscall_windows.go b/internal/winapi/zsyscall_windows.go index 7a1c241389..ea4eb27e62 100644 --- a/internal/winapi/zsyscall_windows.go +++ b/internal/winapi/zsyscall_windows.go @@ -193,7 +193,7 @@ func CreateRemoteThread(process windows.Handle, sa *windows.SecurityAttributes, return } -func IsProcessInJob(procHandle windows.Handle, jobHandle windows.Handle, result *bool) (err error) { +func IsProcessInJob(procHandle windows.Handle, jobHandle windows.Handle, result *int32) (err error) { r1, _, e1 := syscall.Syscall(procIsProcessInJob.Addr(), 3, uintptr(procHandle), uintptr(jobHandle), uintptr(unsafe.Pointer(result))) if r1 == 0 { if e1 != 0 { diff --git a/test/vendor/github.com/Microsoft/hcsshim/internal/jobobject/jobobject.go b/test/vendor/github.com/Microsoft/hcsshim/internal/jobobject/jobobject.go index 70d6f4cd03..36d6e993a4 100644 --- a/test/vendor/github.com/Microsoft/hcsshim/internal/jobobject/jobobject.go +++ b/test/vendor/github.com/Microsoft/hcsshim/internal/jobobject/jobobject.go @@ -10,7 +10,6 @@ import ( "path/filepath" "sync" "sync/atomic" - "time" "unsafe" "github.com/Microsoft/hcsshim/internal/queue" @@ -553,41 +552,12 @@ func (job *JobObject) isSilo() bool { // PrivateWorkingSet returns the 
private working set size for the job. This is calculated by adding up the // private working set for every process running in the job. func (job *JobObject) PrivateWorkingSet() (uint64, error) { - // Grab a timestamp before we call pids. This will be used below to handle a rare case where a pid - // in the job could exit and get re-used for another process and we end up reporting its - // memory info instead. We'll compare the creation time for the process we open against this - // timestamp to see if it was created after; if it is, we'll ignore the working set reading and - // move on to the next process. This isn't perfect, and the timestamp could be before or after - // we call pids, both with drawbacks, but this should be rare enough that missing a processes - // reading or missing a newly started process should be acceptable. - t := time.Now() pids, err := job.Pids() if err != nil { return 0, err } - openAndQueryWorkingSet := func(pid uint32) (uint64, error) { - h, err := windows.OpenProcess(windows.PROCESS_QUERY_LIMITED_INFORMATION, false, pid) - if err != nil { - // Bail if OpenProcess doesn't return a valid handle (fails). Handles a - // case where one of the pids in the job exited before we open. - return 0, nil - } - defer func() { - _ = windows.Close(h) - }() - - var createTime, exitTime, kernelTime, userTime windows.Filetime - if err := windows.GetProcessTimes(h, &createTime, &exitTime, &kernelTime, &userTime); err != nil { - return 0, fmt.Errorf("failed to get creation time for pid %d: %w", pid, err) - } - - // If the process was created after our timestamp we took before Pids(), don't report the total. Pid - // may have been re-used for another process. - if createTime.Nanoseconds() > t.UnixNano() { - return 0, nil - } - + queryProcessWorkingSet := func(h windows.Handle) (uint64, error) { var vmCounters winapi.VM_COUNTERS_EX2 status := winapi.NtQueryInformationProcess( h, @@ -597,18 +567,37 @@ func (job *JobObject) PrivateWorkingSet() (uint64, error) { nil, ) if !winapi.NTSuccess(status) { - return 0, fmt.Errorf("failed to query information for process with pid %d: %w", pid, winapi.RtlNtStatusToDosError(status)) + return 0, fmt.Errorf("failed to query information for process: %w", winapi.RtlNtStatusToDosError(status)) } return uint64(vmCounters.PrivateWorkingSetSize), nil } var jobWorkingSetSize uint64 for _, pid := range pids { - workingSet, err := openAndQueryWorkingSet(pid) + h, err := windows.OpenProcess(windows.PROCESS_QUERY_LIMITED_INFORMATION, false, pid) if err != nil { + // Continue to the next if OpenProcess doesn't return a valid handle (fails). Handles a + // case where one of the pids in the job exited before we open. + continue + } + // Check if the process is actually running in the job still. There's a small chance + // that the process could have exited and had its pid re-used between grabbing the pids + // in the job and opening the handle to it above. + var inJob int32 + if err := winapi.IsProcessInJob(h, job.handle, &inJob); err != nil { + windows.Close(h) + // This shouldn't fail unless we have incorrect access rights which we control + // here so probably best to error out if this failed. 
return 0, err } - jobWorkingSetSize += workingSet + if inJob == 1 { + workingSet, err := queryProcessWorkingSet(h) + if err != nil { + return 0, err + } + jobWorkingSetSize += workingSet + } } + return jobWorkingSetSize, nil } diff --git a/test/vendor/github.com/Microsoft/hcsshim/internal/winapi/jobobject.go b/test/vendor/github.com/Microsoft/hcsshim/internal/winapi/jobobject.go index e770a570e6..4ada2f53cc 100644 --- a/test/vendor/github.com/Microsoft/hcsshim/internal/winapi/jobobject.go +++ b/test/vendor/github.com/Microsoft/hcsshim/internal/winapi/jobobject.go @@ -168,7 +168,7 @@ type JOBOBJECT_ASSOCIATE_COMPLETION_PORT struct { // PBOOL Result // ); // -//sys IsProcessInJob(procHandle windows.Handle, jobHandle windows.Handle, result *bool) (err error) = kernel32.IsProcessInJob +//sys IsProcessInJob(procHandle windows.Handle, jobHandle windows.Handle, result *int32) (err error) = kernel32.IsProcessInJob // BOOL QueryInformationJobObject( // HANDLE hJob, diff --git a/test/vendor/github.com/Microsoft/hcsshim/internal/winapi/zsyscall_windows.go b/test/vendor/github.com/Microsoft/hcsshim/internal/winapi/zsyscall_windows.go index 7a1c241389..ea4eb27e62 100644 --- a/test/vendor/github.com/Microsoft/hcsshim/internal/winapi/zsyscall_windows.go +++ b/test/vendor/github.com/Microsoft/hcsshim/internal/winapi/zsyscall_windows.go @@ -193,7 +193,7 @@ func CreateRemoteThread(process windows.Handle, sa *windows.SecurityAttributes, return } -func IsProcessInJob(procHandle windows.Handle, jobHandle windows.Handle, result *bool) (err error) { +func IsProcessInJob(procHandle windows.Handle, jobHandle windows.Handle, result *int32) (err error) { r1, _, e1 := syscall.Syscall(procIsProcessInJob.Addr(), 3, uintptr(procHandle), uintptr(jobHandle), uintptr(unsafe.Pointer(result))) if r1 == 0 { if e1 != 0 { From 03dd92bc0332e091ba43f3fd0551247ee894b850 Mon Sep 17 00:00:00 2001 From: Daniel Canter Date: Fri, 29 Apr 2022 16:50:42 -0700 Subject: [PATCH 05/12] PR feedback Don't leak process handles. Signed-off-by: Daniel Canter --- internal/jobobject/jobobject.go | 48 +++++++++++-------- .../hcsshim/internal/jobobject/jobobject.go | 48 +++++++++++-------- 2 files changed, 54 insertions(+), 42 deletions(-) diff --git a/internal/jobobject/jobobject.go b/internal/jobobject/jobobject.go index 36d6e993a4..41299b6d63 100644 --- a/internal/jobobject/jobobject.go +++ b/internal/jobobject/jobobject.go @@ -557,7 +557,31 @@ func (job *JobObject) PrivateWorkingSet() (uint64, error) { return 0, err } - queryProcessWorkingSet := func(h windows.Handle) (uint64, error) { + openAndQueryWorkingSet := func(pid uint32) (uint64, error) { + h, err := windows.OpenProcess(windows.PROCESS_QUERY_LIMITED_INFORMATION, false, pid) + if err != nil { + // Continue to the next if OpenProcess doesn't return a valid handle (fails). Handles a + // case where one of the pids in the job exited before we open. + return 0, nil + } + defer func() { + _ = windows.Close(h) + }() + // Check if the process is actually running in the job still. There's a small chance + // that the process could have exited and had its pid re-used between grabbing the pids + // in the job and opening the handle to it above. + var inJob int32 + if err := winapi.IsProcessInJob(h, job.handle, &inJob); err != nil { + // This shouldn't fail unless we have incorrect access rights which we control + // here so probably best to error out if this failed. + return 0, err + } + // Don't report stats for this process as it's not running in the job. 
This shouldn't be + // an error condition though. + if inJob == 0 { + return 0, nil + } + var vmCounters winapi.VM_COUNTERS_EX2 status := winapi.NtQueryInformationProcess( h, @@ -574,29 +598,11 @@ func (job *JobObject) PrivateWorkingSet() (uint64, error) { var jobWorkingSetSize uint64 for _, pid := range pids { - h, err := windows.OpenProcess(windows.PROCESS_QUERY_LIMITED_INFORMATION, false, pid) + workingSet, err := openAndQueryWorkingSet(pid) if err != nil { - // Continue to the next if OpenProcess doesn't return a valid handle (fails). Handles a - // case where one of the pids in the job exited before we open. - continue - } - // Check if the process is actually running in the job still. There's a small chance - // that the process could have exited and had its pid re-used between grabbing the pids - // in the job and opening the handle to it above. - var inJob int32 - if err := winapi.IsProcessInJob(h, job.handle, &inJob); err != nil { - windows.Close(h) - // This shouldn't fail unless we have incorrect access rights which we control - // here so probably best to error out if this failed. return 0, err } - if inJob == 1 { - workingSet, err := queryProcessWorkingSet(h) - if err != nil { - return 0, err - } - jobWorkingSetSize += workingSet - } + jobWorkingSetSize += workingSet } return jobWorkingSetSize, nil diff --git a/test/vendor/github.com/Microsoft/hcsshim/internal/jobobject/jobobject.go b/test/vendor/github.com/Microsoft/hcsshim/internal/jobobject/jobobject.go index 36d6e993a4..41299b6d63 100644 --- a/test/vendor/github.com/Microsoft/hcsshim/internal/jobobject/jobobject.go +++ b/test/vendor/github.com/Microsoft/hcsshim/internal/jobobject/jobobject.go @@ -557,7 +557,31 @@ func (job *JobObject) PrivateWorkingSet() (uint64, error) { return 0, err } - queryProcessWorkingSet := func(h windows.Handle) (uint64, error) { + openAndQueryWorkingSet := func(pid uint32) (uint64, error) { + h, err := windows.OpenProcess(windows.PROCESS_QUERY_LIMITED_INFORMATION, false, pid) + if err != nil { + // Continue to the next if OpenProcess doesn't return a valid handle (fails). Handles a + // case where one of the pids in the job exited before we open. + return 0, nil + } + defer func() { + _ = windows.Close(h) + }() + // Check if the process is actually running in the job still. There's a small chance + // that the process could have exited and had its pid re-used between grabbing the pids + // in the job and opening the handle to it above. + var inJob int32 + if err := winapi.IsProcessInJob(h, job.handle, &inJob); err != nil { + // This shouldn't fail unless we have incorrect access rights which we control + // here so probably best to error out if this failed. + return 0, err + } + // Don't report stats for this process as it's not running in the job. This shouldn't be + // an error condition though. + if inJob == 0 { + return 0, nil + } + var vmCounters winapi.VM_COUNTERS_EX2 status := winapi.NtQueryInformationProcess( h, @@ -574,29 +598,11 @@ func (job *JobObject) PrivateWorkingSet() (uint64, error) { var jobWorkingSetSize uint64 for _, pid := range pids { - h, err := windows.OpenProcess(windows.PROCESS_QUERY_LIMITED_INFORMATION, false, pid) + workingSet, err := openAndQueryWorkingSet(pid) if err != nil { - // Continue to the next if OpenProcess doesn't return a valid handle (fails). Handles a - // case where one of the pids in the job exited before we open. - continue - } - // Check if the process is actually running in the job still. 
-		// There's a small chance
-		// that the process could have exited and had its pid re-used between grabbing the pids
-		// in the job and opening the handle to it above.
-		var inJob int32
-		if err := winapi.IsProcessInJob(h, job.handle, &inJob); err != nil {
-			windows.Close(h)
-			// This shouldn't fail unless we have incorrect access rights which we control
-			// here so probably best to error out if this failed.
 			return 0, err
 		}
-		if inJob == 1 {
-			workingSet, err := queryProcessWorkingSet(h)
-			if err != nil {
-				return 0, err
-			}
-			jobWorkingSetSize += workingSet
-		}
+		jobWorkingSetSize += workingSet
 	}
 
 	return jobWorkingSetSize, nil
 }

From 7001abde47a016c130a7aab2642135e4a5a16046 Mon Sep 17 00:00:00 2001
From: Daniel Canter
Date: Thu, 5 May 2022 16:47:05 -0700
Subject: [PATCH 06/12] PR feedback

* Don't special case if the client asks for only stats; reach out to HCS for
  the rest.
* Log if we hit the fallback path.
* Move the explanation of the private working set query to where the function
  is called.

Signed-off-by: Daniel Canter
---
 internal/hcs/system.go                        | 132 ++++++++++--------
 .../Microsoft/hcsshim/internal/hcs/system.go  | 132 ++++++++++--------
 2 files changed, 154 insertions(+), 110 deletions(-)

diff --git a/internal/hcs/system.go b/internal/hcs/system.go
index de0d6ebdd1..87b36adacf 100644
--- a/internal/hcs/system.go
+++ b/internal/hcs/system.go
@@ -17,6 +17,7 @@ import (
 	hcsschema "github.com/Microsoft/hcsshim/internal/hcs/schema2"
 	"github.com/Microsoft/hcsshim/internal/jobobject"
 	"github.com/Microsoft/hcsshim/internal/log"
+	"github.com/Microsoft/hcsshim/internal/logfields"
 	"github.com/Microsoft/hcsshim/internal/oc"
 	"github.com/Microsoft/hcsshim/internal/timeout"
 	"github.com/Microsoft/hcsshim/internal/vmcompute"
 	"go.opencensus.io/trace"
@@ -33,7 +34,7 @@ type System struct {
 	waitBlock chan struct{}
 	waitError error
 	exitError error
-	os, typ   string
+	os, typ, owner string
 	startTime time.Time
 }
 
@@ -138,6 +139,7 @@ func (computeSystem *System) getCachedProperties(ctx context.Context) error {
 	}
 	computeSystem.typ = strings.ToLower(props.SystemType)
 	computeSystem.os = strings.ToLower(props.RuntimeOSType)
+	computeSystem.owner = strings.ToLower(props.Owner)
 	if computeSystem.os == "" && computeSystem.typ == "container" {
 		// Pre-RS5 HCS did not return the OS, but it only supported containers
 		// that ran Windows.
@@ -337,9 +339,10 @@ func (computeSystem *System) Properties(ctx context.Context, types ...schema1.Pr
 
 // statisticsInProc emulates what HCS does to grab statistics for a given container with a small
 // change to make grabbing the private working set total much more efficient.
-func (computeSystem *System) statisticsInProc(ctx context.Context) (*hcsschema.Properties, error) {
+func (computeSystem *System) statisticsInProc(ctx context.Context) (*hcsschema.Statistics, error) {
 	// Start timestamp for these stats before we grab them to match HCS
 	timestamp := time.Now()
+
 	// In the future we can make use of some new functionality in the HCS that allows you
 	// to pass a job object for HCS to use for the container. Currently, the only way we'll
 	// be able to open the job/silo is if we're running as SYSTEM.
@@ -368,72 +371,86 @@ func (computeSystem *System) statisticsInProc(ctx context.Context) (*hcsschema.P
 		return nil, err
 	}
 
+	// This calculates the private working set more efficiently than HCS does. HCS calls NtQuerySystemInformation
+	// with the class SystemProcessInformation which returns an array containing system information for *every*
+	// process running on the machine.
+	// They then grab the pids that are running in the container and filter down
+	// the entries in the array to only what's running in that silo and start tallying up the total. This doesn't
+	// work well, as performance gets worse as more processes are running on the machine in general and not
+	// just in the container. All of the additional information besides the WorkingSetPrivateSize field is ignored
+	// as well, which isn't great and is wasted work to fetch.
+	//
+	// HCS only lets you grab statistics in an all-or-nothing fashion, so we can't just grab the private
+	// working set ourselves and ask for everything else separately. The optimization we can make here is
+	// to open the silo ourselves and do the same queries for the rest of the info, as well as calculating
+	// the private working set in a more efficient manner by:
+	//
+	// 1. Find the pids running in the silo
+	// 2. Get a process handle for every process (only need PROCESS_QUERY_LIMITED_INFORMATION access)
+	// 3. Call NtQueryInformationProcess on each process with the class ProcessVmCounters
+	// 4. Tally up the total using the field PrivateWorkingSetSize in VM_COUNTERS_EX2.
 	privateWorkingSet, err := job.PrivateWorkingSet()
 	if err != nil {
 		return nil, err
 	}
 
-	return &hcsschema.Properties{
-		Statistics: &hcsschema.Statistics{
-			Timestamp:          timestamp,
-			ContainerStartTime: computeSystem.startTime,
-			Uptime100ns:        uint64(time.Since(computeSystem.startTime).Nanoseconds()) / 100,
-			Memory: &hcsschema.MemoryStats{
-				MemoryUsageCommitBytes:            memInfo.JobMemory,
-				MemoryUsageCommitPeakBytes:        memInfo.PeakJobMemoryUsed,
-				MemoryUsagePrivateWorkingSetBytes: privateWorkingSet,
-			},
-			Processor: &hcsschema.ProcessorStats{
-				RuntimeKernel100ns: uint64(processorInfo.TotalKernelTime),
-				RuntimeUser100ns:   uint64(processorInfo.TotalUserTime),
-				TotalRuntime100ns:  uint64(processorInfo.TotalKernelTime + processorInfo.TotalUserTime),
-			},
-			Storage: &hcsschema.StorageStats{
-				ReadCountNormalized:  uint64(storageInfo.ReadStats.IoCount),
-				ReadSizeBytes:        storageInfo.ReadStats.TotalSize,
-				WriteCountNormalized: uint64(storageInfo.WriteStats.IoCount),
-				WriteSizeBytes:       storageInfo.WriteStats.TotalSize,
-			},
+	return &hcsschema.Statistics{
+		Timestamp:          timestamp,
+		ContainerStartTime: computeSystem.startTime,
+		Uptime100ns:        uint64(time.Since(computeSystem.startTime).Nanoseconds()) / 100,
+		Memory: &hcsschema.MemoryStats{
+			MemoryUsageCommitBytes:            memInfo.JobMemory,
+			MemoryUsageCommitPeakBytes:        memInfo.PeakJobMemoryUsed,
+			MemoryUsagePrivateWorkingSetBytes: privateWorkingSet,
+		},
+		Processor: &hcsschema.ProcessorStats{
+			RuntimeKernel100ns: uint64(processorInfo.TotalKernelTime),
+			RuntimeUser100ns:   uint64(processorInfo.TotalUserTime),
+			TotalRuntime100ns:  uint64(processorInfo.TotalKernelTime + processorInfo.TotalUserTime),
+		},
+		Storage: &hcsschema.StorageStats{
+			ReadCountNormalized:  uint64(storageInfo.ReadStats.IoCount),
+			ReadSizeBytes:        storageInfo.ReadStats.TotalSize,
+			WriteCountNormalized: uint64(storageInfo.WriteStats.IoCount),
+			WriteSizeBytes:       storageInfo.WriteStats.TotalSize,
 		},
 	}, nil
 }
 
 // PropertiesV2 returns the requested container properties targeting a V2 schema container.
-func (computeSystem *System) PropertiesV2(ctx context.Context, types ...hcsschema.PropertyType) (*hcsschema.Properties, error) { +func (computeSystem *System) PropertiesV2(ctx context.Context, types ...hcsschema.PropertyType) (_ *hcsschema.Properties, err error) { computeSystem.handleLock.RLock() defer computeSystem.handleLock.RUnlock() operation := "hcs::System::PropertiesV2" - // There's an optimization we can make here if the user is asking for Statistics only, and this is due to - // an inefficient way that HCS calculates the private working set total for a given container. HCS - // calls NtQuerySystemInformation with the class SystemProcessInformation which returns an array containing - // system information for *every* process running on the machine. They then grab the pids that are running in - // the container and filter down the entries in the array to only what's running in that silo and start tallying - // up the total. This doesn't work well as performance should get worse if more processess are running on the - // machine in general and not just in the container. All of the additional information besides the - // WorkingSetPrivateSize field is ignored as well which isn't great and is wasted work to fetch. - // - // HCS only let's you grab statistics in an all or nothing fashion, so we can't just grab the private - // working set ourselves and ask for everything else seperately. The optimization we can make here is - // to open the silo ourselves and do the same queries for the rest of the info, as well as calculating - // the private working set in a more efficient manner by: - // - // 1. Find the pids running in the silo - // 2. Get a process handle for every process (only need PROCESS_QUERY_LIMITED_INFORMATION access) - // 3. Call NtQueryInformationProcess on each process with the class ProcessVmCounters - // 4. Tally up the total using the field PrivateWorkingSetSize in VM_COUNTERS_EX2. - // - // The only caveat is the process invoking this code must be running as SYSTEM to open the silo, - // otherwise we'll fallback to asking HCS for the information. - - // First check if we're only asking for Stats to see if we can apply the above optimization. We should - // only do this for containers and not UVMs. - if (len(types) == 1 && types[0] == hcsschema.PTStatistics) && computeSystem.typ == "container" { - properties, err := computeSystem.statisticsInProc(ctx) - // If no errors just return the info as is, if not we'll fallback to the HCS route. + // Check if any of the queries are for stats. We can grab these in process and skip the hop to + // HCS. + var statsPresent, onlyStats bool + for i, prop := range types { + if prop == hcsschema.PTStatistics { + statsPresent = true + onlyStats = len(types) == 1 + // Remove stats from the query types. + types = append(types[:i], types[i+1:]...) + } + } + + properties := &hcsschema.Properties{} + if statsPresent && computeSystem.typ == "container" { + properties.Statistics, err = computeSystem.statisticsInProc(ctx) if err == nil { - return properties, nil + // Early return if this was the only thing we were querying for. 
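+			// The identity fields below come from the values cached by
+			// getCachedProperties, so the early return still resembles a full
+			// HCS properties response.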
+ if onlyStats { + properties.Id = computeSystem.id + properties.SystemType = computeSystem.typ + properties.RuntimeOsType = computeSystem.os + properties.Owner = computeSystem.owner + return properties, nil + } + } else { + logTxt := "failed to grab statistics in process - falling back to HCS" + log.G(ctx).WithError(err).WithField(logfields.ContainerID, computeSystem.id).Warn(logTxt) + types = append(types, hcsschema.PTStatistics) } } @@ -451,12 +468,17 @@ func (computeSystem *System) PropertiesV2(ctx context.Context, types ...hcsschem if propertiesJSON == "" { return nil, ErrUnexpectedValue } - properties := &hcsschema.Properties{} - if err := json.Unmarshal([]byte(propertiesJSON), properties); err != nil { + hcsProps := &hcsschema.Properties{} + if err := json.Unmarshal([]byte(propertiesJSON), hcsProps); err != nil { return nil, makeSystemError(computeSystem, operation, err, nil) } + // Copy over stats if we might've failed the inProc query. + if properties.Statistics != nil { + hcsProps.Statistics = properties.Statistics + hcsProps.Owner = computeSystem.owner + } - return properties, nil + return hcsProps, nil } // Pause pauses the execution of the computeSystem. This feature is not enabled in TP5. diff --git a/test/vendor/github.com/Microsoft/hcsshim/internal/hcs/system.go b/test/vendor/github.com/Microsoft/hcsshim/internal/hcs/system.go index de0d6ebdd1..87b36adacf 100644 --- a/test/vendor/github.com/Microsoft/hcsshim/internal/hcs/system.go +++ b/test/vendor/github.com/Microsoft/hcsshim/internal/hcs/system.go @@ -17,6 +17,7 @@ import ( hcsschema "github.com/Microsoft/hcsshim/internal/hcs/schema2" "github.com/Microsoft/hcsshim/internal/jobobject" "github.com/Microsoft/hcsshim/internal/log" + "github.com/Microsoft/hcsshim/internal/logfields" "github.com/Microsoft/hcsshim/internal/oc" "github.com/Microsoft/hcsshim/internal/timeout" "github.com/Microsoft/hcsshim/internal/vmcompute" @@ -33,7 +34,7 @@ type System struct { waitBlock chan struct{} waitError error exitError error - os, typ string + os, typ, owner string startTime time.Time } @@ -138,6 +139,7 @@ func (computeSystem *System) getCachedProperties(ctx context.Context) error { } computeSystem.typ = strings.ToLower(props.SystemType) computeSystem.os = strings.ToLower(props.RuntimeOSType) + computeSystem.owner = strings.ToLower(props.Owner) if computeSystem.os == "" && computeSystem.typ == "container" { // Pre-RS5 HCS did not return the OS, but it only supported containers // that ran Windows. @@ -337,9 +339,10 @@ func (computeSystem *System) Properties(ctx context.Context, types ...schema1.Pr // statisticsInProc emulates what HCS does to grab statistics for a given container with a small // change to make grabbing the private working set total much more efficient. -func (computeSystem *System) statisticsInProc(ctx context.Context) (*hcsschema.Properties, error) { +func (computeSystem *System) statisticsInProc(ctx context.Context) (*hcsschema.Statistics, error) { // Start timestamp for these stats before we grab them to match HCS timestamp := time.Now() + // In the future we can make use of some new functionality in the HCS that allows you // to pass a job object for HCS to use for the container. Currently, the only way we'll // be able to open the job/silo is if we're running as SYSTEM. @@ -368,72 +371,86 @@ func (computeSystem *System) statisticsInProc(ctx context.Context) (*hcsschema.P return nil, err } + // This calculates the private working set more efficiently than HCS does. 
+	// HCS calls NtQuerySystemInformation
+	// with the class SystemProcessInformation which returns an array containing system information for *every*
+	// process running on the machine. They then grab the pids that are running in the container and filter down
+	// the entries in the array to only what's running in that silo and start tallying up the total. This doesn't
+	// work well, as performance gets worse as more processes are running on the machine in general and not
+	// just in the container. All of the additional information besides the WorkingSetPrivateSize field is ignored
+	// as well, which isn't great and is wasted work to fetch.
+	//
+	// HCS only lets you grab statistics in an all-or-nothing fashion, so we can't just grab the private
+	// working set ourselves and ask for everything else separately. The optimization we can make here is
+	// to open the silo ourselves and do the same queries for the rest of the info, as well as calculating
+	// the private working set in a more efficient manner by:
+	//
+	// 1. Find the pids running in the silo
+	// 2. Get a process handle for every process (only need PROCESS_QUERY_LIMITED_INFORMATION access)
+	// 3. Call NtQueryInformationProcess on each process with the class ProcessVmCounters
+	// 4. Tally up the total using the field PrivateWorkingSetSize in VM_COUNTERS_EX2.
 	privateWorkingSet, err := job.PrivateWorkingSet()
 	if err != nil {
 		return nil, err
 	}
 
-	return &hcsschema.Properties{
-		Statistics: &hcsschema.Statistics{
-			Timestamp:          timestamp,
-			ContainerStartTime: computeSystem.startTime,
-			Uptime100ns:        uint64(time.Since(computeSystem.startTime).Nanoseconds()) / 100,
-			Memory: &hcsschema.MemoryStats{
-				MemoryUsageCommitBytes:            memInfo.JobMemory,
-				MemoryUsageCommitPeakBytes:        memInfo.PeakJobMemoryUsed,
-				MemoryUsagePrivateWorkingSetBytes: privateWorkingSet,
-			},
-			Processor: &hcsschema.ProcessorStats{
-				RuntimeKernel100ns: uint64(processorInfo.TotalKernelTime),
-				RuntimeUser100ns:   uint64(processorInfo.TotalUserTime),
-				TotalRuntime100ns:  uint64(processorInfo.TotalKernelTime + processorInfo.TotalUserTime),
-			},
-			Storage: &hcsschema.StorageStats{
-				ReadCountNormalized:  uint64(storageInfo.ReadStats.IoCount),
-				ReadSizeBytes:        storageInfo.ReadStats.TotalSize,
-				WriteCountNormalized: uint64(storageInfo.WriteStats.IoCount),
-				WriteSizeBytes:       storageInfo.WriteStats.TotalSize,
-			},
+	return &hcsschema.Statistics{
+		Timestamp:          timestamp,
+		ContainerStartTime: computeSystem.startTime,
+		Uptime100ns:        uint64(time.Since(computeSystem.startTime).Nanoseconds()) / 100,
+		Memory: &hcsschema.MemoryStats{
+			MemoryUsageCommitBytes:            memInfo.JobMemory,
+			MemoryUsageCommitPeakBytes:        memInfo.PeakJobMemoryUsed,
+			MemoryUsagePrivateWorkingSetBytes: privateWorkingSet,
+		},
+		Processor: &hcsschema.ProcessorStats{
+			RuntimeKernel100ns: uint64(processorInfo.TotalKernelTime),
+			RuntimeUser100ns:   uint64(processorInfo.TotalUserTime),
+			TotalRuntime100ns:  uint64(processorInfo.TotalKernelTime + processorInfo.TotalUserTime),
+		},
+		Storage: &hcsschema.StorageStats{
+			ReadCountNormalized:  uint64(storageInfo.ReadStats.IoCount),
+			ReadSizeBytes:        storageInfo.ReadStats.TotalSize,
+			WriteCountNormalized: uint64(storageInfo.WriteStats.IoCount),
+			WriteSizeBytes:       storageInfo.WriteStats.TotalSize,
 		},
 	}, nil
 }
 
 // PropertiesV2 returns the requested container properties targeting a V2 schema container.
-func (computeSystem *System) PropertiesV2(ctx context.Context, types ...hcsschema.PropertyType) (*hcsschema.Properties, error) { +func (computeSystem *System) PropertiesV2(ctx context.Context, types ...hcsschema.PropertyType) (_ *hcsschema.Properties, err error) { computeSystem.handleLock.RLock() defer computeSystem.handleLock.RUnlock() operation := "hcs::System::PropertiesV2" - // There's an optimization we can make here if the user is asking for Statistics only, and this is due to - // an inefficient way that HCS calculates the private working set total for a given container. HCS - // calls NtQuerySystemInformation with the class SystemProcessInformation which returns an array containing - // system information for *every* process running on the machine. They then grab the pids that are running in - // the container and filter down the entries in the array to only what's running in that silo and start tallying - // up the total. This doesn't work well as performance should get worse if more processess are running on the - // machine in general and not just in the container. All of the additional information besides the - // WorkingSetPrivateSize field is ignored as well which isn't great and is wasted work to fetch. - // - // HCS only let's you grab statistics in an all or nothing fashion, so we can't just grab the private - // working set ourselves and ask for everything else seperately. The optimization we can make here is - // to open the silo ourselves and do the same queries for the rest of the info, as well as calculating - // the private working set in a more efficient manner by: - // - // 1. Find the pids running in the silo - // 2. Get a process handle for every process (only need PROCESS_QUERY_LIMITED_INFORMATION access) - // 3. Call NtQueryInformationProcess on each process with the class ProcessVmCounters - // 4. Tally up the total using the field PrivateWorkingSetSize in VM_COUNTERS_EX2. - // - // The only caveat is the process invoking this code must be running as SYSTEM to open the silo, - // otherwise we'll fallback to asking HCS for the information. - - // First check if we're only asking for Stats to see if we can apply the above optimization. We should - // only do this for containers and not UVMs. - if (len(types) == 1 && types[0] == hcsschema.PTStatistics) && computeSystem.typ == "container" { - properties, err := computeSystem.statisticsInProc(ctx) - // If no errors just return the info as is, if not we'll fallback to the HCS route. + // Check if any of the queries are for stats. We can grab these in process and skip the hop to + // HCS. + var statsPresent, onlyStats bool + for i, prop := range types { + if prop == hcsschema.PTStatistics { + statsPresent = true + onlyStats = len(types) == 1 + // Remove stats from the query types. + types = append(types[:i], types[i+1:]...) + } + } + + properties := &hcsschema.Properties{} + if statsPresent && computeSystem.typ == "container" { + properties.Statistics, err = computeSystem.statisticsInProc(ctx) if err == nil { - return properties, nil + // Early return if this was the only thing we were querying for. 
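+			// The identity fields below come from the values cached by
+			// getCachedProperties, so the early return still resembles a full
+			// HCS properties response.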
+			if onlyStats {
+				properties.Id = computeSystem.id
+				properties.SystemType = computeSystem.typ
+				properties.RuntimeOsType = computeSystem.os
+				properties.Owner = computeSystem.owner
+				return properties, nil
+			}
+		} else {
+			logTxt := "failed to grab statistics in process - falling back to HCS"
+			log.G(ctx).WithError(err).WithField(logfields.ContainerID, computeSystem.id).Warn(logTxt)
+			types = append(types, hcsschema.PTStatistics)
+		}
 	}
 
@@ -451,12 +468,17 @@ func (computeSystem *System) PropertiesV2(ctx context.Context, types ...hcsschem
 	if propertiesJSON == "" {
 		return nil, ErrUnexpectedValue
 	}
-	properties := &hcsschema.Properties{}
-	if err := json.Unmarshal([]byte(propertiesJSON), properties); err != nil {
+	hcsProps := &hcsschema.Properties{}
+	if err := json.Unmarshal([]byte(propertiesJSON), hcsProps); err != nil {
 		return nil, makeSystemError(computeSystem, operation, err, nil)
 	}
+	// Copy over stats if we might've failed the inProc query.
+	if properties.Statistics != nil {
+		hcsProps.Statistics = properties.Statistics
+		hcsProps.Owner = computeSystem.owner
+	}
 
-	return properties, nil
+	return hcsProps, nil
 }
 
 // Pause pauses the execution of the computeSystem. This feature is not enabled in TP5.

From 683a4cdd30db93b4b2d35d421d88ebc1572a5b88 Mon Sep 17 00:00:00 2001
From: Daniel Canter
Date: Thu, 5 May 2022 18:58:13 -0700
Subject: [PATCH 07/12] pr feedback

PrivateWorkingSet -> QueryPrivateWorkingSet to match the other job object
methods.

Signed-off-by: Daniel Canter
---
 internal/hcs/system.go                                  | 2 +-
 internal/jobcontainers/jobcontainer.go                  | 2 +-
 internal/jobobject/jobobject.go                         | 4 ++--
 .../github.com/Microsoft/hcsshim/internal/hcs/system.go | 2 +-
 .../Microsoft/hcsshim/internal/jobobject/jobobject.go   | 4 ++--
 5 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/internal/hcs/system.go b/internal/hcs/system.go
index 87b36adacf..1d86f0c594 100644
--- a/internal/hcs/system.go
+++ b/internal/hcs/system.go
@@ -388,7 +388,7 @@ func (computeSystem *System) statisticsInProc(ctx context.Context) (*hcsschema.S
 	// 2. Get a process handle for every process (only need PROCESS_QUERY_LIMITED_INFORMATION access)
 	// 3. Call NtQueryInformationProcess on each process with the class ProcessVmCounters
 	// 4. Tally up the total using the field PrivateWorkingSetSize in VM_COUNTERS_EX2.
-	privateWorkingSet, err := job.PrivateWorkingSet()
+	privateWorkingSet, err := job.QueryPrivateWorkingSet()
 	if err != nil {
 		return nil, err
 	}
diff --git a/internal/jobcontainers/jobcontainer.go b/internal/jobcontainers/jobcontainer.go
index 83d57a1c49..bf89257b0c 100644
--- a/internal/jobcontainers/jobcontainer.go
+++ b/internal/jobcontainers/jobcontainer.go
@@ -438,7 +438,7 @@ func (c *JobContainer) PropertiesV2(ctx context.Context, types ...hcsschema.Prop
 		return nil, errors.Wrap(err, "failed to query for job containers storage information")
 	}
 
-	privateWorkingSet, err := c.job.PrivateWorkingSet()
+	privateWorkingSet, err := c.job.QueryPrivateWorkingSet()
 	if err != nil {
 		return nil, fmt.Errorf("failed to get private working set for container: %w", err)
 	}
diff --git a/internal/jobobject/jobobject.go b/internal/jobobject/jobobject.go
index 41299b6d63..de62e1b3ab 100644
--- a/internal/jobobject/jobobject.go
+++ b/internal/jobobject/jobobject.go
@@ -549,9 +549,9 @@ func (job *JobObject) isSilo() bool {
 	return atomic.LoadUint32(&job.isAppSilo) == 1
 }
 
-// PrivateWorkingSet returns the private working set size for the job. This is calculated by adding up the
+// QueryPrivateWorkingSet returns the private working set size for the job. This is calculated by adding up the
 // private working set for every process running in the job.
-func (job *JobObject) PrivateWorkingSet() (uint64, error) {
+func (job *JobObject) QueryPrivateWorkingSet() (uint64, error) {
 	pids, err := job.Pids()
 	if err != nil {
 		return 0, err
 	}
diff --git a/test/vendor/github.com/Microsoft/hcsshim/internal/hcs/system.go b/test/vendor/github.com/Microsoft/hcsshim/internal/hcs/system.go
index 87b36adacf..1d86f0c594 100644
--- a/test/vendor/github.com/Microsoft/hcsshim/internal/hcs/system.go
+++ b/test/vendor/github.com/Microsoft/hcsshim/internal/hcs/system.go
@@ -388,7 +388,7 @@ func (computeSystem *System) statisticsInProc(ctx context.Context) (*hcsschema.S
 	// 2. Get a process handle for every process (only need PROCESS_QUERY_LIMITED_INFORMATION access)
 	// 3. Call NtQueryInformationProcess on each process with the class ProcessVmCounters
 	// 4. Tally up the total using the field PrivateWorkingSetSize in VM_COUNTERS_EX2.
-	privateWorkingSet, err := job.PrivateWorkingSet()
+	privateWorkingSet, err := job.QueryPrivateWorkingSet()
 	if err != nil {
 		return nil, err
 	}
diff --git a/test/vendor/github.com/Microsoft/hcsshim/internal/jobobject/jobobject.go b/test/vendor/github.com/Microsoft/hcsshim/internal/jobobject/jobobject.go
index 41299b6d63..de62e1b3ab 100644
--- a/test/vendor/github.com/Microsoft/hcsshim/internal/jobobject/jobobject.go
+++ b/test/vendor/github.com/Microsoft/hcsshim/internal/jobobject/jobobject.go
@@ -549,9 +549,9 @@ func (job *JobObject) isSilo() bool {
 	return atomic.LoadUint32(&job.isAppSilo) == 1
 }
 
-// PrivateWorkingSet returns the private working set size for the job. This is calculated by adding up the
+// QueryPrivateWorkingSet returns the private working set size for the job. This is calculated by adding up the
 // private working set for every process running in the job.
-func (job *JobObject) PrivateWorkingSet() (uint64, error) {
+func (job *JobObject) QueryPrivateWorkingSet() (uint64, error) {
 	pids, err := job.Pids()
 	if err != nil {
 		return 0, err

From 956ae54c4cfa0c23dac634d624ff031253680ee6 Mon Sep 17 00:00:00 2001
From: Daniel Canter
Date: Thu, 5 May 2022 19:41:50 -0700
Subject: [PATCH 08/12] pr feedback

* Check len(types) == 0 instead

Signed-off-by: Daniel Canter
---
 internal/hcs/system.go                                  | 7 +++----
 .../github.com/Microsoft/hcsshim/internal/hcs/system.go | 7 +++----
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/internal/hcs/system.go b/internal/hcs/system.go
index 1d86f0c594..83c974ac22 100644
--- a/internal/hcs/system.go
+++ b/internal/hcs/system.go
@@ -425,11 +425,10 @@ func (computeSystem *System) PropertiesV2(ctx context.Context, types ...hcsschem
 
 	// Check if any of the queries are for stats. We can grab these in process and skip the hop to
 	// HCS.
-	var statsPresent, onlyStats bool
+	var statsPresent bool
 	for i, prop := range types {
 		if prop == hcsschema.PTStatistics {
 			statsPresent = true
-			onlyStats = len(types) == 1
 			// Remove stats from the query types.
 			types = append(types[:i], types[i+1:]...)
 		}
@@ -440,7 +439,7 @@ func (computeSystem *System) PropertiesV2(ctx context.Context, types ...hcsschem
 		properties.Statistics, err = computeSystem.statisticsInProc(ctx)
 		if err == nil {
 			// Early return if this was the only thing we were querying for.
-			if onlyStats {
+			if len(types) == 0 {
 				properties.Id = computeSystem.id
 				properties.SystemType = computeSystem.typ
 				properties.RuntimeOsType = computeSystem.os
@@ -472,7 +471,7 @@ func (computeSystem *System) PropertiesV2(ctx context.Context, types ...hcsschem
 	if err := json.Unmarshal([]byte(propertiesJSON), hcsProps); err != nil {
 		return nil, makeSystemError(computeSystem, operation, err, nil)
 	}
-	// Copy over stats if we might've failed the inProc query.
+	// Copy over stats if we grabbed these in proc
 	if properties.Statistics != nil {
 		hcsProps.Statistics = properties.Statistics
 		hcsProps.Owner = computeSystem.owner
diff --git a/test/vendor/github.com/Microsoft/hcsshim/internal/hcs/system.go b/test/vendor/github.com/Microsoft/hcsshim/internal/hcs/system.go
index 1d86f0c594..83c974ac22 100644
--- a/test/vendor/github.com/Microsoft/hcsshim/internal/hcs/system.go
+++ b/test/vendor/github.com/Microsoft/hcsshim/internal/hcs/system.go
@@ -425,11 +425,10 @@ func (computeSystem *System) PropertiesV2(ctx context.Context, types ...hcsschem
 
 	// Check if any of the queries are for stats. We can grab these in process and skip the hop to
 	// HCS.
-	var statsPresent, onlyStats bool
+	var statsPresent bool
 	for i, prop := range types {
 		if prop == hcsschema.PTStatistics {
 			statsPresent = true
-			onlyStats = len(types) == 1
 			// Remove stats from the query types.
 			types = append(types[:i], types[i+1:]...)
 		}
@@ -440,7 +439,7 @@ func (computeSystem *System) PropertiesV2(ctx context.Context, types ...hcsschem
 		properties.Statistics, err = computeSystem.statisticsInProc(ctx)
 		if err == nil {
 			// Early return if this was the only thing we were querying for.
-			if onlyStats {
+			if len(types) == 0 {
 				properties.Id = computeSystem.id
 				properties.SystemType = computeSystem.typ
 				properties.RuntimeOsType = computeSystem.os
@@ -472,7 +471,7 @@ func (computeSystem *System) PropertiesV2(ctx context.Context, types ...hcsschem
 	if err := json.Unmarshal([]byte(propertiesJSON), hcsProps); err != nil {
 		return nil, makeSystemError(computeSystem, operation, err, nil)
 	}
-	// Copy over stats if we might've failed the inProc query.
+	// Copy over stats if we grabbed these in proc
 	if properties.Statistics != nil {
 		hcsProps.Statistics = properties.Statistics
 		hcsProps.Owner = computeSystem.owner

From 527b0e75fb4dddaf7fdca1d578c311f2b23c4f14 Mon Sep 17 00:00:00 2001
From: Daniel Canter
Date: Tue, 10 May 2022 00:16:10 -0700
Subject: [PATCH 09/12] pr feedback

Rearrange in process queries to account for other query types in the future.

Signed-off-by: Daniel Canter
---
 internal/hcs/system.go                        | 143 ++++++++++++------
 .../Microsoft/hcsshim/internal/hcs/system.go  | 143 ++++++++++++------
 2 files changed, 190 insertions(+), 96 deletions(-)

diff --git a/internal/hcs/system.go b/internal/hcs/system.go
index 83c974ac22..f52724fa4b 100644
--- a/internal/hcs/system.go
+++ b/internal/hcs/system.go
@@ -21,6 +21,7 @@ import (
 	"github.com/Microsoft/hcsshim/internal/oc"
 	"github.com/Microsoft/hcsshim/internal/timeout"
 	"github.com/Microsoft/hcsshim/internal/vmcompute"
+	"github.com/sirupsen/logrus"
 	"go.opencensus.io/trace"
 )
 
@@ -337,12 +338,12 @@ func (computeSystem *System) Properties(ctx context.Context, types ...schema1.Pr
 	return properties, nil
 }
 
-// statisticsInProc emulates what HCS does to grab statistics for a given container with a small
-// change to make grabbing the private working set total much more efficient.
-func (computeSystem *System) statisticsInProc(ctx context.Context) (*hcsschema.Statistics, error) {
-	// Start timestamp for these stats before we grab them to match HCS
-	timestamp := time.Now()
-
+// queryInProc handles querying for container properties without reaching out to HCS. `props`
+// will be updated to contain any data returned from the queries present in `types`. If any properties
+// failed to be queried they will be tallied up and returned as the first return value. Failures on
+// query are NOT considered errors; the only failure case for this method is if the container's job object
+// cannot be opened.
+func (computeSystem *System) queryInProc(ctx context.Context, props *hcsschema.Properties, types []hcsschema.PropertyType) ([]hcsschema.PropertyType, error) {
 	// In the future we can make use of some new functionality in the HCS that allows you
 	// to pass a job object for HCS to use for the container. Currently, the only way we'll
 	// be able to open the job/silo is if we're running as SYSTEM.
@@ -356,6 +357,33 @@ func (computeSystem *System) statisticsInProc(ctx context.Context) (*hcsschema.S
 	}
 	defer job.Close()
 
+	var fallbackQueryTypes []hcsschema.PropertyType
+	for _, propType := range types {
+		switch propType {
+		case hcsschema.PTStatistics:
+			// Handle a bad caller asking for the same type twice. No use in re-querying if this is
+			// filled in already.
+			if props.Statistics == nil {
+				props.Statistics, err = computeSystem.statisticsInProc(job)
+				if err != nil {
+					fallbackQueryTypes = append(fallbackQueryTypes, propType)
+				}
+			}
+		default:
+			fallbackQueryTypes = append(fallbackQueryTypes, propType)
+		}
+	}
+
+	return fallbackQueryTypes, nil
+
+}
+
+// statisticsInProc emulates what HCS does to grab statistics for a given container with a small
+// change to make grabbing the private working set total much more efficient.
+func (computeSystem *System) statisticsInProc(job *jobobject.JobObject) (*hcsschema.Statistics, error) {
+	// Start timestamp for these stats before we grab them to match HCS
+	timestamp := time.Now()
+
 	memInfo, err := job.QueryMemoryStats()
 	if err != nil {
 		return nil, err
@@ -416,43 +444,10 @@ func (computeSystem *System) statisticsInProc(ctx context.Context) (*hcsschema.S
 	}, nil
 }
 
-// PropertiesV2 returns the requested container properties targeting a V2 schema container.
-func (computeSystem *System) PropertiesV2(ctx context.Context, types ...hcsschema.PropertyType) (_ *hcsschema.Properties, err error) {
-	computeSystem.handleLock.RLock()
-	defer computeSystem.handleLock.RUnlock()
-
+// hcsPropertiesV2Query is a helper to make a HcsGetComputeSystemProperties call using the V2 schema property types.
- if len(types) == 0 { - properties.Id = computeSystem.id - properties.SystemType = computeSystem.typ - properties.RuntimeOsType = computeSystem.os - properties.Owner = computeSystem.owner - return properties, nil - } - } else { - logTxt := "failed to grab statistics in process - falling back to HCS" - log.G(ctx).WithError(err).WithField(logfields.ContainerID, computeSystem.id).Warn(logTxt) - types = append(types, hcsschema.PTStatistics) - } - } - queryBytes, err := json.Marshal(hcsschema.PropertyQuery{PropertyTypes: types}) if err != nil { return nil, makeSystemError(computeSystem, operation, err, nil) @@ -467,17 +462,69 @@ func (computeSystem *System) PropertiesV2(ctx context.Context, types ...hcsschem if propertiesJSON == "" { return nil, ErrUnexpectedValue } - hcsProps := &hcsschema.Properties{} - if err := json.Unmarshal([]byte(propertiesJSON), hcsProps); err != nil { + props := &hcsschema.Properties{} + if err := json.Unmarshal([]byte(propertiesJSON), props); err != nil { return nil, makeSystemError(computeSystem, operation, err, nil) } - // Copy over stats if we grabbed these in proc - if properties.Statistics != nil { - hcsProps.Statistics = properties.Statistics - hcsProps.Owner = computeSystem.owner + + return props, nil +} + +// PropertiesV2 returns the requested container properties targeting a V2 schema container. +func (computeSystem *System) PropertiesV2(ctx context.Context, types ...hcsschema.PropertyType) (_ *hcsschema.Properties, err error) { + computeSystem.handleLock.RLock() + defer computeSystem.handleLock.RUnlock() + + // Let HCS tally up the total for VM based queries instead of querying ourselves. + if computeSystem.typ != "container" { + return computeSystem.hcsPropertiesV2Query(ctx, types) } - return hcsProps, nil + // Define a starter Properties struct with the default fields returned from every + // query. Owner is only returned from Statistics but it's harmless to include. + properties := &hcsschema.Properties{ + Id: computeSystem.id, + SystemType: computeSystem.typ, + RuntimeOsType: computeSystem.os, + Owner: computeSystem.owner, + } + + logEntry := log.G(ctx) + // First lets try and query ourselves without reaching to HCS. If any of the queries fail + // we'll take note and fallback to querying HCS for any of the failed types. + fallbackTypes, err := computeSystem.queryInProc(ctx, properties, types) + if err == nil && len(fallbackTypes) == 0 { + return properties, nil + } else if err != nil { + logEntry.WithError(err) + fallbackTypes = types + } + + logEntry.WithFields(logrus.Fields{ + logfields.ContainerID: computeSystem.id, + "propertyTypes": fallbackTypes, + }).Warn("falling back to HCS for property type queries") + + hcsProperties, err := computeSystem.hcsPropertiesV2Query(ctx, fallbackTypes) + if err != nil { + return nil, err + } + + // Now loop through the fallback types and fill in any fields on the Properties struct. These are the set of + // queries that we have defined types for and that are handled in the V2 schema query code path in HCS. + for _, propType := range fallbackTypes { + switch propType { + case hcsschema.PTStatistics: + properties.Statistics = hcsProperties.Statistics + case hcsschema.PTProcessList: + properties.ProcessList = hcsProperties.ProcessList + case hcsschema.PTTerminateOnLastHandleClosed: + properties.TerminateOnLastHandleClosed = hcsProperties.TerminateOnLastHandleClosed + default: + } + } + + return properties, nil } // Pause pauses the execution of the computeSystem. This feature is not enabled in TP5. 
diff --git a/test/vendor/github.com/Microsoft/hcsshim/internal/hcs/system.go b/test/vendor/github.com/Microsoft/hcsshim/internal/hcs/system.go
index 83c974ac22..f52724fa4b 100644
--- a/test/vendor/github.com/Microsoft/hcsshim/internal/hcs/system.go
+++ b/test/vendor/github.com/Microsoft/hcsshim/internal/hcs/system.go
@@ -21,6 +21,7 @@ import (
 	"github.com/Microsoft/hcsshim/internal/oc"
 	"github.com/Microsoft/hcsshim/internal/timeout"
 	"github.com/Microsoft/hcsshim/internal/vmcompute"
+	"github.com/sirupsen/logrus"
 	"go.opencensus.io/trace"
 )
 
@@ -337,12 +338,12 @@ func (computeSystem *System) Properties(ctx context.Context, types ...schema1.Pr
 	return properties, nil
 }
 
-// statisticsInProc emulates what HCS does to grab statistics for a given container with a small
-// change to make grabbing the private working set total much more efficient.
-func (computeSystem *System) statisticsInProc(ctx context.Context) (*hcsschema.Statistics, error) {
-	// Start timestamp for these stats before we grab them to match HCS
-	timestamp := time.Now()
-
+// queryInProc handles querying for container properties without reaching out to HCS. `props`
+// will be updated to contain any data returned from the queries present in `types`. If any properties
+// failed to be queried they will be tallied up and returned as the first return value. Failures on
+// query are NOT considered errors; the only failure case for this method is if the container's job object
+// cannot be opened.
+func (computeSystem *System) queryInProc(ctx context.Context, props *hcsschema.Properties, types []hcsschema.PropertyType) ([]hcsschema.PropertyType, error) {
 	// In the future we can make use of some new functionality in the HCS that allows you
 	// to pass a job object for HCS to use for the container. Currently, the only way we'll
 	// be able to open the job/silo is if we're running as SYSTEM.
@@ -356,6 +357,33 @@ func (computeSystem *System) statisticsInProc(ctx context.Context) (*hcsschema.S
 	}
 	defer job.Close()
 
+	var fallbackQueryTypes []hcsschema.PropertyType
+	for _, propType := range types {
+		switch propType {
+		case hcsschema.PTStatistics:
+			// Handle a bad caller asking for the same type twice. No use in re-querying if this is
+			// filled in already.
+			if props.Statistics == nil {
+				props.Statistics, err = computeSystem.statisticsInProc(job)
+				if err != nil {
+					fallbackQueryTypes = append(fallbackQueryTypes, propType)
+				}
+			}
+		default:
+			fallbackQueryTypes = append(fallbackQueryTypes, propType)
+		}
+	}
+
+	return fallbackQueryTypes, nil
+
+}
+
+// statisticsInProc emulates what HCS does to grab statistics for a given container with a small
+// change to make grabbing the private working set total much more efficient.
+func (computeSystem *System) statisticsInProc(job *jobobject.JobObject) (*hcsschema.Statistics, error) {
+	// Start timestamp for these stats before we grab them to match HCS
+	timestamp := time.Now()
+
 	memInfo, err := job.QueryMemoryStats()
 	if err != nil {
 		return nil, err
@@ -416,43 +444,10 @@ func (computeSystem *System) statisticsInProc(ctx context.Context) (*hcsschema.S
 	}, nil
 }
 
-// PropertiesV2 returns the requested container properties targeting a V2 schema container.
-func (computeSystem *System) PropertiesV2(ctx context.Context, types ...hcsschema.PropertyType) (_ *hcsschema.Properties, err error) {
-	computeSystem.handleLock.RLock()
-	defer computeSystem.handleLock.RUnlock()
-
+// hcsPropertiesV2Query is a helper to make a HcsGetComputeSystemProperties call using the V2 schema property types.
+func (computeSystem *System) hcsPropertiesV2Query(ctx context.Context, types []hcsschema.PropertyType) (*hcsschema.Properties, error) {
 	operation := "hcs::System::PropertiesV2"
 
-	// Check if any of the queries are for stats. We can grab these in process and skip the hop to
-	// HCS.
-	var statsPresent bool
-	for i, prop := range types {
-		if prop == hcsschema.PTStatistics {
-			statsPresent = true
-			// Remove stats from the query types.
-			types = append(types[:i], types[i+1:]...)
-		}
-	}
-
-	properties := &hcsschema.Properties{}
-	if statsPresent && computeSystem.typ == "container" {
-		properties.Statistics, err = computeSystem.statisticsInProc(ctx)
-		if err == nil {
-			// Early return if this was the only thing we were querying for.
-			if len(types) == 0 {
-				properties.Id = computeSystem.id
-				properties.SystemType = computeSystem.typ
-				properties.RuntimeOsType = computeSystem.os
-				properties.Owner = computeSystem.owner
-				return properties, nil
-			}
-		} else {
-			logTxt := "failed to grab statistics in process - falling back to HCS"
-			log.G(ctx).WithError(err).WithField(logfields.ContainerID, computeSystem.id).Warn(logTxt)
-			types = append(types, hcsschema.PTStatistics)
-		}
-	}
-
 	queryBytes, err := json.Marshal(hcsschema.PropertyQuery{PropertyTypes: types})
 	if err != nil {
 		return nil, makeSystemError(computeSystem, operation, err, nil)
@@ -467,17 +462,69 @@ func (computeSystem *System) PropertiesV2(ctx context.Context, types ...hcsschem
 	if propertiesJSON == "" {
 		return nil, ErrUnexpectedValue
 	}
-	hcsProps := &hcsschema.Properties{}
-	if err := json.Unmarshal([]byte(propertiesJSON), hcsProps); err != nil {
+	props := &hcsschema.Properties{}
+	if err := json.Unmarshal([]byte(propertiesJSON), props); err != nil {
 		return nil, makeSystemError(computeSystem, operation, err, nil)
 	}
-	// Copy over stats if we grabbed these in proc
-	if properties.Statistics != nil {
-		hcsProps.Statistics = properties.Statistics
-		hcsProps.Owner = computeSystem.owner
+
+	return props, nil
+}
+
+// PropertiesV2 returns the requested container properties targeting a V2 schema container.
+func (computeSystem *System) PropertiesV2(ctx context.Context, types ...hcsschema.PropertyType) (_ *hcsschema.Properties, err error) {
+	computeSystem.handleLock.RLock()
+	defer computeSystem.handleLock.RUnlock()
+
+	// Let HCS tally up the total for VM based queries instead of querying ourselves.
+	if computeSystem.typ != "container" {
+		return computeSystem.hcsPropertiesV2Query(ctx, types)
 	}
 
-	return hcsProps, nil
+	// Define a starter Properties struct with the default fields returned from every
+	// query. Owner is only returned from Statistics but it's harmless to include.
+	properties := &hcsschema.Properties{
+		Id:            computeSystem.id,
+		SystemType:    computeSystem.typ,
+		RuntimeOsType: computeSystem.os,
+		Owner:         computeSystem.owner,
+	}
+
+	logEntry := log.G(ctx)
+	// First let's try to query ourselves without reaching out to HCS. If any of the queries fail
+	// we'll take note and fall back to querying HCS for any of the failed types.
+	fallbackTypes, err := computeSystem.queryInProc(ctx, properties, types)
+	if err == nil && len(fallbackTypes) == 0 {
+		return properties, nil
+	} else if err != nil {
+		logEntry.WithError(err)
+		fallbackTypes = types
+	}
+
+	logEntry.WithFields(logrus.Fields{
+		logfields.ContainerID: computeSystem.id,
+		"propertyTypes":       fallbackTypes,
+	}).Warn("falling back to HCS for property type queries")
+
+	hcsProperties, err := computeSystem.hcsPropertiesV2Query(ctx, fallbackTypes)
+	if err != nil {
+		return nil, err
+	}
+
+	// Now loop through the fallback types and fill in any fields on the Properties struct. These are the set of
+	// queries that we have defined types for and that are handled in the V2 schema query code path in HCS.
+	for _, propType := range fallbackTypes {
+		switch propType {
+		case hcsschema.PTStatistics:
+			properties.Statistics = hcsProperties.Statistics
+		case hcsschema.PTProcessList:
+			properties.ProcessList = hcsProperties.ProcessList
+		case hcsschema.PTTerminateOnLastHandleClosed:
+			properties.TerminateOnLastHandleClosed = hcsProperties.TerminateOnLastHandleClosed
+		default:
+		}
+	}
+
+	return properties, nil
 }
 
 // Pause pauses the execution of the computeSystem. This feature is not enabled in TP5.

From df077394d468854f1ddfecbe2d4a5648039f9508 Mon Sep 17 00:00:00 2001
From: Daniel Canter
Date: Thu, 12 May 2022 15:49:22 -0700
Subject: [PATCH 10/12] PR Feedback

* Log if we failed to grab stats
* Add more context for when the in-proc query fails, as it would otherwise be hard to tell what happened.
* Rephrase PropertiesV2 comment to not say "container" as it works on virtual machines as well.
* Return an error if we encounter an unknown fallback property type.

Signed-off-by: Daniel Canter
---
 internal/hcs/system.go                                  | 8 +++++---
 .../github.com/Microsoft/hcsshim/internal/hcs/system.go | 8 +++++---
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/internal/hcs/system.go b/internal/hcs/system.go
index f52724fa4b..e8e7fc7c89 100644
--- a/internal/hcs/system.go
+++ b/internal/hcs/system.go
@@ -366,6 +366,8 @@ func (computeSystem *System) queryInProc(ctx context.Context, props *hcsschema.P
 			if props.Statistics == nil {
 				props.Statistics, err = computeSystem.statisticsInProc(job)
 				if err != nil {
+					log.G(ctx).WithError(err).Warn("failed to get statistics in-proc")
+
 					fallbackQueryTypes = append(fallbackQueryTypes, propType)
 				}
 			}
@@ -375,7 +377,6 @@ func (computeSystem *System) queryInProc(ctx context.Context, props *hcsschema.P
 	}
 
 	return fallbackQueryTypes, nil
-
 }
 
 // statisticsInProc emulates what HCS does to grab statistics for a given container with a small
@@ -470,7 +471,7 @@ func (computeSystem *System) hcsPropertiesV2Query(ctx context.Context, types []h
 	return props, nil
 }
 
-// PropertiesV2 returns the requested container properties targeting a V2 schema container.
+// PropertiesV2 returns the requested compute system's properties targeting a V2 schema compute system.
 func (computeSystem *System) PropertiesV2(ctx context.Context, types ...hcsschema.PropertyType) (_ *hcsschema.Properties, err error) {
 	computeSystem.handleLock.RLock()
 	defer computeSystem.handleLock.RUnlock()
@@ -496,7 +497,7 @@ func (computeSystem *System) PropertiesV2(ctx context.Context, types ...hcsschem
 	if err == nil && len(fallbackTypes) == 0 {
 		return properties, nil
 	} else if err != nil {
-		logEntry.WithError(err)
+		logEntry.WithError(fmt.Errorf("failed to query compute system properties in-proc: %s", err))
 		fallbackTypes = types
 	}
 
@@ -521,6 +522,7 @@ func (computeSystem *System) PropertiesV2(ctx context.Context, types ...hcsschem
 		case hcsschema.PTTerminateOnLastHandleClosed:
 			properties.TerminateOnLastHandleClosed = hcsProperties.TerminateOnLastHandleClosed
 		default:
+			return nil, fmt.Errorf("unknown property type encountered %q", propType)
 		}
 	}
 
diff --git a/test/vendor/github.com/Microsoft/hcsshim/internal/hcs/system.go b/test/vendor/github.com/Microsoft/hcsshim/internal/hcs/system.go
index f52724fa4b..e8e7fc7c89 100644
--- a/test/vendor/github.com/Microsoft/hcsshim/internal/hcs/system.go
+++ b/test/vendor/github.com/Microsoft/hcsshim/internal/hcs/system.go
@@ -366,6 +366,8 @@ func (computeSystem *System) queryInProc(ctx context.Context, props *hcsschema.P
 			if props.Statistics == nil {
 				props.Statistics, err = computeSystem.statisticsInProc(job)
 				if err != nil {
+					log.G(ctx).WithError(err).Warn("failed to get statistics in-proc")
+
 					fallbackQueryTypes = append(fallbackQueryTypes, propType)
 				}
 			}
@@ -375,7 +377,6 @@ func (computeSystem *System) queryInProc(ctx context.Context, props *hcsschema.P
 	}
 
 	return fallbackQueryTypes, nil
-
 }
 
 // statisticsInProc emulates what HCS does to grab statistics for a given container with a small
@@ -470,7 +471,7 @@ func (computeSystem *System) hcsPropertiesV2Query(ctx context.Context, types []h
 	return props, nil
 }
 
-// PropertiesV2 returns the requested container properties targeting a V2 schema container.
+// PropertiesV2 returns the requested compute system's properties targeting a V2 schema compute system.
 func (computeSystem *System) PropertiesV2(ctx context.Context, types ...hcsschema.PropertyType) (_ *hcsschema.Properties, err error) {
 	computeSystem.handleLock.RLock()
 	defer computeSystem.handleLock.RUnlock()
@@ -496,7 +497,7 @@ func (computeSystem *System) PropertiesV2(ctx context.Context, types ...hcsschem
 	if err == nil && len(fallbackTypes) == 0 {
 		return properties, nil
 	} else if err != nil {
-		logEntry.WithError(err)
+		logEntry.WithError(fmt.Errorf("failed to query compute system properties in-proc: %s", err))
 		fallbackTypes = types
 	}
 
@@ -521,6 +522,7 @@ func (computeSystem *System) PropertiesV2(ctx context.Context, types ...hcsschem
 		case hcsschema.PTTerminateOnLastHandleClosed:
 			properties.TerminateOnLastHandleClosed = hcsProperties.TerminateOnLastHandleClosed
 		default:
+			return nil, fmt.Errorf("unknown property type encountered %q", propType)
 		}
 	}
 

From db60db206c2eee9fd5d1a20f1d3a47af171c295e Mon Sep 17 00:00:00 2001
From: Daniel Canter
Date: Thu, 12 May 2022 16:16:31 -0700
Subject: [PATCH 11/12] PR feedback

* Return the properties returned from HCS and fill in what we've gathered in-proc instead of vice versa.
Signed-off-by: Daniel Canter
---
 internal/hcs/system.go                        | 23 ++++++++-----------
 .../Microsoft/hcsshim/internal/hcs/system.go  | 23 ++++++++-----------
 2 files changed, 18 insertions(+), 28 deletions(-)

diff --git a/internal/hcs/system.go b/internal/hcs/system.go
index e8e7fc7c89..3f99087537 100644
--- a/internal/hcs/system.go
+++ b/internal/hcs/system.go
@@ -511,22 +511,17 @@ func (computeSystem *System) PropertiesV2(ctx context.Context, types ...hcsschem
 		return nil, err
 	}
 
-	// Now loop through the fallback types and fill in any fields on the Properties struct. These are the set of
-	// queries that we have defined types for and that are handled in the V2 schema query code path in HCS.
-	for _, propType := range fallbackTypes {
-		switch propType {
-		case hcsschema.PTStatistics:
-			properties.Statistics = hcsProperties.Statistics
-		case hcsschema.PTProcessList:
-			properties.ProcessList = hcsProperties.ProcessList
-		case hcsschema.PTTerminateOnLastHandleClosed:
-			properties.TerminateOnLastHandleClosed = hcsProperties.TerminateOnLastHandleClosed
-		default:
-			return nil, fmt.Errorf("unknown property type encountered %q", propType)
-		}
+	// Now add in anything that we might have successfully queried in process.
+	if properties.Statistics != nil {
+		hcsProperties.Statistics = properties.Statistics
+		hcsProperties.Owner = properties.Owner
 	}
 
-	return properties, nil
+	if properties.ProcessList != nil {
+		hcsProperties.ProcessList = properties.ProcessList
+	}
+
+	return hcsProperties, nil
 }
 
 // Pause pauses the execution of the computeSystem. This feature is not enabled in TP5.
diff --git a/test/vendor/github.com/Microsoft/hcsshim/internal/hcs/system.go b/test/vendor/github.com/Microsoft/hcsshim/internal/hcs/system.go
index e8e7fc7c89..3f99087537 100644
--- a/test/vendor/github.com/Microsoft/hcsshim/internal/hcs/system.go
+++ b/test/vendor/github.com/Microsoft/hcsshim/internal/hcs/system.go
@@ -511,22 +511,17 @@ func (computeSystem *System) PropertiesV2(ctx context.Context, types ...hcsschem
 		return nil, err
 	}
 
-	// Now loop through the fallback types and fill in any fields on the Properties struct. These are the set of
-	// queries that we have defined types for and that are handled in the V2 schema query code path in HCS.
-	for _, propType := range fallbackTypes {
-		switch propType {
-		case hcsschema.PTStatistics:
-			properties.Statistics = hcsProperties.Statistics
-		case hcsschema.PTProcessList:
-			properties.ProcessList = hcsProperties.ProcessList
-		case hcsschema.PTTerminateOnLastHandleClosed:
-			properties.TerminateOnLastHandleClosed = hcsProperties.TerminateOnLastHandleClosed
-		default:
-			return nil, fmt.Errorf("unknown property type encountered %q", propType)
-		}
+	// Now add in anything that we might have successfully queried in process.
+	if properties.Statistics != nil {
+		hcsProperties.Statistics = properties.Statistics
+		hcsProperties.Owner = properties.Owner
 	}
 
-	return properties, nil
+	if properties.ProcessList != nil {
+		hcsProperties.ProcessList = properties.ProcessList
+	}
+
+	return hcsProperties, nil
 }
 
 // Pause pauses the execution of the computeSystem. This feature is not enabled in TP5.

From f8c8a9a6da3e4941a88158d2c27f3ce5ac5ed217 Mon Sep 17 00:00:00 2001
From: Daniel Canter
Date: Fri, 13 May 2022 11:20:41 -0700
Subject: [PATCH 12/12] pr feedback

* Use %w instead of %s on the wrapped error (see the note below).
* Change warning log to Info as it's expected to happen for some types.
* Add a comment noting that in-proc process list querying is for the future.
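As a note on the first bullet: the %w verb, unlike %s, keeps the wrapped error available to callers through errors.Is/errors.As:

	inner := errors.New("access is denied")
	wrapped := fmt.Errorf("failed to query compute system properties in-proc: %w", inner)
	fmt.Println(errors.Is(wrapped, inner)) // true; with %s the error chain is lost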
Signed-off-by: Daniel Canter
---
 internal/hcs/system.go                                  | 5 +++--
 .../github.com/Microsoft/hcsshim/internal/hcs/system.go | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/internal/hcs/system.go b/internal/hcs/system.go
index 3f99087537..a9f92e681d 100644
--- a/internal/hcs/system.go
+++ b/internal/hcs/system.go
@@ -497,14 +497,14 @@ func (computeSystem *System) PropertiesV2(ctx context.Context, types ...hcsschem
 	if err == nil && len(fallbackTypes) == 0 {
 		return properties, nil
 	} else if err != nil {
-		logEntry.WithError(fmt.Errorf("failed to query compute system properties in-proc: %s", err))
+		logEntry.WithError(fmt.Errorf("failed to query compute system properties in-proc: %w", err))
 		fallbackTypes = types
 	}
 
 	logEntry.WithFields(logrus.Fields{
 		logfields.ContainerID: computeSystem.id,
 		"propertyTypes":       fallbackTypes,
-	}).Warn("falling back to HCS for property type queries")
+	}).Info("falling back to HCS for property type queries")
 
 	hcsProperties, err := computeSystem.hcsPropertiesV2Query(ctx, fallbackTypes)
 	if err != nil {
@@ -517,6 +517,7 @@ func (computeSystem *System) PropertiesV2(ctx context.Context, types ...hcsschem
 		hcsProperties.Owner = properties.Owner
 	}
 
+	// For future support of querying the process list in-proc as well.
 	if properties.ProcessList != nil {
 		hcsProperties.ProcessList = properties.ProcessList
 	}
diff --git a/test/vendor/github.com/Microsoft/hcsshim/internal/hcs/system.go b/test/vendor/github.com/Microsoft/hcsshim/internal/hcs/system.go
index 3f99087537..a9f92e681d 100644
--- a/test/vendor/github.com/Microsoft/hcsshim/internal/hcs/system.go
+++ b/test/vendor/github.com/Microsoft/hcsshim/internal/hcs/system.go
@@ -497,14 +497,14 @@ func (computeSystem *System) PropertiesV2(ctx context.Context, types ...hcsschem
 	if err == nil && len(fallbackTypes) == 0 {
 		return properties, nil
 	} else if err != nil {
-		logEntry.WithError(fmt.Errorf("failed to query compute system properties in-proc: %s", err))
+		logEntry.WithError(fmt.Errorf("failed to query compute system properties in-proc: %w", err))
 		fallbackTypes = types
 	}
 
 	logEntry.WithFields(logrus.Fields{
 		logfields.ContainerID: computeSystem.id,
 		"propertyTypes":       fallbackTypes,
-	}).Warn("falling back to HCS for property type queries")
+	}).Info("falling back to HCS for property type queries")
 
 	hcsProperties, err := computeSystem.hcsPropertiesV2Query(ctx, fallbackTypes)
 	if err != nil {
@@ -517,6 +517,7 @@ func (computeSystem *System) PropertiesV2(ctx context.Context, types ...hcsschem
 		hcsProperties.Owner = properties.Owner
 	}
 
+	// For future support of querying the process list in-proc as well.
 	if properties.ProcessList != nil {
 		hcsProperties.ProcessList = properties.ProcessList
 	}
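As a closing illustration of the final behavior, a minimal usage sketch (assuming a *System obtained from this package and surrounding ctx/error handling; when the shim runs as SYSTEM the statistics are gathered in-proc, and the same call transparently falls back to HCS otherwise):

	// Query only statistics for a container.
	props, err := computeSystem.PropertiesV2(ctx, hcsschema.PTStatistics)
	if err != nil {
		return err
	}
	// The private working set is now tallied per silo process via
	// NtQueryInformationProcess/ProcessVmCounters rather than HCS's
	// machine-wide SystemProcessInformation scan.
	workingSet := props.Statistics.Memory.MemoryUsagePrivateWorkingSetBytes
	_ = workingSet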