From ee4b8f6a3c4ef84fba8230105d9bf9e695e0127d Mon Sep 17 00:00:00 2001 From: "pangjiping.pjp" Date: Thu, 23 Jun 2022 15:54:00 +0800 Subject: [PATCH 01/14] fix npd oom caused by listing pods --- pkg/systemlogmonitor/log_monitor.go | 156 +++++++++++++++++++++++----- 1 file changed, 132 insertions(+), 24 deletions(-) diff --git a/pkg/systemlogmonitor/log_monitor.go b/pkg/systemlogmonitor/log_monitor.go index b182585ec..f8f80c065 100644 --- a/pkg/systemlogmonitor/log_monitor.go +++ b/pkg/systemlogmonitor/log_monitor.go @@ -19,7 +19,12 @@ package systemlogmonitor import ( "encoding/json" "io/ioutil" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/meta" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/watch" clientset "k8s.io/client-go/kubernetes" + v12 "k8s.io/client-go/kubernetes/typed/core/v1" "k8s.io/heapster/common/kubernetes" "k8s.io/node-problem-detector/cmd/options" "net/url" @@ -27,6 +32,7 @@ import ( "path/filepath" "regexp" "strings" + "sync" "fmt" "time" @@ -54,6 +60,7 @@ const ( var ( uuidRegx *regexp.Regexp k8sClient *clientset.Clientset + nodeName string ) func init() { @@ -69,14 +76,18 @@ func init() { } type logMonitor struct { - configPath string - watcher watchertypes.LogWatcher - buffer LogBuffer - config MonitorConfig - conditions []types.Condition - logCh <-chan *logtypes.Log - output chan *types.Status - tomb *tomb.Tomb + configPath string + watcher watchertypes.LogWatcher + buffer LogBuffer + config MonitorConfig + conditions []types.Condition + logCh <-chan *logtypes.Log + output chan *types.Status + tomb *tomb.Tomb + podList []v1.Pod + rwLock sync.RWMutex + listOptions metav1.ListOptions + listerWatcher v12.PodInterface } func InitK8sClientOrDie(options *options.NodeProblemDetectorOptions) *clientset.Clientset { @@ -87,6 +98,7 @@ func InitK8sClientOrDie(options *options.NodeProblemDetectorOptions) *clientset. } cfg.UserAgent = fmt.Sprintf("%s/%s", filepath.Base(os.Args[0]), version.Version()) k8sClient = clientset.NewForConfigOrDie(cfg) + nodeName = options.NodeName return k8sClient } @@ -95,6 +107,12 @@ func NewLogMonitorOrDie(configPath string) types.Monitor { l := &logMonitor{ configPath: configPath, tomb: tomb.NewTomb(), + rwLock: sync.RWMutex{}, + listOptions: metav1.ListOptions{ + FieldSelector: fmt.Sprintf("spec.nodeName=%s", nodeName), + ResourceVersion: "0", + }, + listerWatcher: k8sClient.CoreV1().Pods(""), } f, err := ioutil.ReadFile(configPath) @@ -149,6 +167,7 @@ func (l *logMonitor) Start() (<-chan *types.Status, error) { if err != nil { return nil, err } + go l.listAndWatch() go l.monitorLoop() return l.output, nil } @@ -202,32 +221,32 @@ func (l *logMonitor) generateStatus(logs []*logtypes.Log, rule systemlogtypes.Ru // We use the timestamp of the first log line as the timestamp of the status. timestamp := logs[0].Timestamp message := generateMessage(logs) - if rule.Reason == OOMREASON && k8sClient != nil { + if rule.Reason == OOMREASON && l.listerWatcher != nil { uuid := string(uuidRegx.Find([]byte(message))) - uuid = strings.ReplaceAll(uuid, "_", "-") - pl, err := k8sClient.CoreV1().Pods("").List(metav1.ListOptions{}) - if err != nil { - glog.Error("Error in getting pods: %v", err.Error()) - } else { - for _, pod := range pl.Items { - if string(pod.UID) == uuid { - message = fmt.Sprintf("pod was OOM killed. 
node:%s pod:%s namespace:%s uuid:%s", - pod.Spec.NodeName, pod.Name, pod.Namespace, uuid) - break - } + + // visit cache to get podList + l.rwLock.RLock() + // if cache is empty, list pods + if l.podList == nil || len(l.podList) <= 0 { + if err := l.list(); err != nil { + glog.Errorf("Failed to list pods, err:%v", err) } } + + message = l.generateOOMMessage(uuid) + l.rwLock.RUnlock() } + var events []types.Event var changedConditions []*types.Condition if rule.Type == types.Temp { // For temporary error only generate event events = append(events, types.Event{ - Severity: types.Warn, - Timestamp: timestamp, - Reason: rule.Reason, - Message: message, + Severity: types.Warn, + Timestamp: timestamp, + Reason: rule.Reason, + Message: message, }) } else { // For permanent error changes the condition @@ -293,6 +312,95 @@ func (l *logMonitor) initializeStatus() { } } +// listAndWatch watch and watch handle. +func (l *logMonitor) listAndWatch() { + if err := l.list(); err != nil { + glog.Errorf("Failed to init podList, err:%v", err) + } + + for { + w, err := l.listerWatcher.Watch(l.listOptions) + if err != nil { + glog.Errorf("Failed to watch pod, err:%v", err) + return + } + + select { + case event := <-w.ResultChan(): + switch event.Type { + case watch.Added: + if err := l.list(); err != nil { + glog.Errorf("Failed to list pods, err:%v", err) + } + w.Stop() + case watch.Modified: + if err := l.list(); err != nil { + glog.Errorf("Failed to list pods, err:%v", err) + } + w.Stop() + case watch.Deleted: + if err := l.list(); err != nil { + glog.Errorf("Failed to list pods, err:%v", err) + } + w.Stop() + default: + glog.Errorf("Invalid watch event %v", event.Type) + w.Stop() + } + case <-l.tomb.Stopping(): + w.Stop() + glog.Infof("Log monitor stopped at %s", time.Now().Format("2006-01-02 15:04:05 MST Mon")) + return + } + } +} + +// syncWith sync logMonitor's podList cache. +// write lock inside. +func (l *logMonitor) syncWith(obj runtime.Object, resourceVersion string) error { + list, ok := obj.(*v1.PodList) + if !ok { + return fmt.Errorf("cannot convert obj %v to podList", obj) + } + + l.rwLock.Lock() + defer l.rwLock.Unlock() + l.podList = list.Items + l.listOptions.ResourceVersion = resourceVersion + return nil +} + +// list lists pods with FieldSelector and resourceVersion. +func (l *logMonitor) list() error { + var ( + list runtime.Object + err error + listMetaInterface meta.List + ) + if list, err = l.listerWatcher.List(l.listOptions); err != nil { + return err + } + if listMetaInterface, err = meta.ListAccessor(list); err != nil { + return err + } + resourceVersion := listMetaInterface.GetResourceVersion() + + err = l.syncWith(list, resourceVersion) + return err +} + +// generateOOMMessage get oom pod message. +// need read lock before use. +func (l *logMonitor) generateOOMMessage(uuid string) string { + for _, pod := range l.podList { + if string(pod.UID) == uuid { + return fmt.Sprintf("pod was OOM killed. 
node:%s pod:%s namespace:%s uuid:%s", + pod.Spec.NodeName, pod.Name, pod.Namespace, uuid) + } + } + return "" +} + func initialConditions(defaults []types.Condition) []types.Condition { conditions := make([]types.Condition, len(defaults)) copy(conditions, defaults) From cb6d04560c9091c6ba97847ee3b08e47919a243f Mon Sep 17 00:00:00 2001 From: "pangjiping.pjp" Date: Wed, 29 Jun 2022 18:10:41 +0800 Subject: [PATCH 02/14] fix --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 6ba59cc73..73aaa5a98 100644 --- a/Makefile +++ b/Makefile @@ -247,8 +247,8 @@ build-binaries: $(ALL_BINARIES) build-container: build-binaries Dockerfile docker build -t $(IMAGE) --build-arg BASEIMAGE=$(BASEIMAGE) --build-arg LOGCOUNTER=$(LOGCOUNTER) . --progress=plain --no-cache -$(TARBALL): ./bin/node-problem-odetector ./bin/log-counter ./bin/health-checker ./test/bin/problem-maker - tar -zcvf $(TARBALL) bin/ cnfig/ test/e2e-install.sh test/bin/problem-maker +$(TARBALL): ./bin/node-problem-detector ./bin/log-counter ./bin/health-checker ./test/bin/problem-maker + tar -zcvf $(TARBALL) bin/ config/ test/e2e-install.sh test/bin/problem-maker sha1sum $(TARBALL) md5sum $(TARBALL) From 6f9959454cfb87dcc1968445560b3466b7bb9dda Mon Sep 17 00:00:00 2001 From: "pangjiping.pjp" Date: Wed, 29 Jun 2022 20:16:08 +0800 Subject: [PATCH 03/14] fix --- pkg/systemlogmonitor/log_monitor.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/systemlogmonitor/log_monitor.go b/pkg/systemlogmonitor/log_monitor.go index f8f80c065..f142a7b16 100644 --- a/pkg/systemlogmonitor/log_monitor.go +++ b/pkg/systemlogmonitor/log_monitor.go @@ -112,7 +112,6 @@ func NewLogMonitorOrDie(configPath string) types.Monitor { FieldSelector: fmt.Sprintf("spec.nodeName=%s", nodeName), ResourceVersion: "0", }, - listerWatcher: k8sClient.CoreV1().Pods(""), } f, err := ioutil.ReadFile(configPath) @@ -163,6 +162,7 @@ func initializeProblemMetricsOrDie(rules []systemlogtypes.Rule) { func (l *logMonitor) Start() (<-chan *types.Status, error) { glog.Infof("Start log monitor %s", l.configPath) var err error + l.listerWatcher = k8sClient.CoreV1().Pods("") l.logCh, err = l.watcher.Watch() if err != nil { return nil, err From 6f7a3dadc6792f9ebcda16eda673223b4428fb18 Mon Sep 17 00:00:00 2001 From: "pangjiping.pjp" Date: Mon, 4 Jul 2022 16:15:12 +0800 Subject: [PATCH 04/14] fix timeout --- pkg/systemlogmonitor/log_monitor.go | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pkg/systemlogmonitor/log_monitor.go b/pkg/systemlogmonitor/log_monitor.go index f142a7b16..0bf92ec72 100644 --- a/pkg/systemlogmonitor/log_monitor.go +++ b/pkg/systemlogmonitor/log_monitor.go @@ -58,9 +58,10 @@ const ( ) var ( - uuidRegx *regexp.Regexp - k8sClient *clientset.Clientset - nodeName string + uuidRegx *regexp.Regexp + k8sClient *clientset.Clientset + nodeName string + podEventTimeout = int64(3) // 3 minutes ) func init() { @@ -111,6 +112,7 @@ func NewLogMonitorOrDie(configPath string) types.Monitor { listOptions: metav1.ListOptions{ FieldSelector: fmt.Sprintf("spec.nodeName=%s", nodeName), ResourceVersion: "0", + TimeoutSeconds: &podEventTimeout, }, } @@ -344,7 +346,7 @@ func (l *logMonitor) listAndWatch() { } w.Stop() default: - glog.Errorf("Invalid watch event %v", event.Type) + // glog.Errorf("Invalid watch event %v", event.Type) w.Stop() } case <-l.tomb.Stopping(): From cbb6aab472ac0dac3bb317be36d6c3a922dfdc9d Mon Sep 17 00:00:00 2001 From: "pangjiping.pjp" Date: Mon, 4 Jul 2022 16:39:59 +0800 
Subject: [PATCH 05/14] fix timeout --- pkg/systemlogmonitor/log_monitor.go | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/pkg/systemlogmonitor/log_monitor.go b/pkg/systemlogmonitor/log_monitor.go index 0bf92ec72..8e940636e 100644 --- a/pkg/systemlogmonitor/log_monitor.go +++ b/pkg/systemlogmonitor/log_monitor.go @@ -114,6 +114,7 @@ func NewLogMonitorOrDie(configPath string) types.Monitor { ResourceVersion: "0", TimeoutSeconds: &podEventTimeout, }, + podList: make([]v1.Pod, 0), } f, err := ioutil.ReadFile(configPath) @@ -240,6 +241,10 @@ func (l *logMonitor) generateStatus(logs []*logtypes.Log, rule systemlogtypes.Ru l.rwLock.RUnlock() } + if l.listerWatcher == nil { + glog.Errorf("Empty ListAndWatcher!") + } + var events []types.Event var changedConditions []*types.Condition if rule.Type == types.Temp { @@ -331,17 +336,17 @@ func (l *logMonitor) listAndWatch() { case event := <-w.ResultChan(): switch event.Type { case watch.Added: - if err := l.list(); err != nil { + if err = l.list(); err != nil { glog.Errorf("Failed to list pods, err:%v", err) } w.Stop() case watch.Modified: - if err := l.list(); err != nil { + if err = l.list(); err != nil { glog.Errorf("Failed to list pods, err:%v", err) } w.Stop() case watch.Deleted: - if err := l.list(); err != nil { + if err = l.list(); err != nil { glog.Errorf("Failed to list pods, err:%v", err) } w.Stop() @@ -394,6 +399,8 @@ func (l *logMonitor) list() error { // generateOOMMessage get oom pod message. // need read lock before use. func (l *logMonitor) generateOOMMessage(uuid string) string { + l.rwLock.RLock() + defer l.rwLock.RUnlock() for _, pod := range l.podList { if string(pod.UID) == uuid { return fmt.Sprintf("pod was OOM killed. node:%s pod:%s namespace:%s uuid:%s", From 04856d1c5405094a23e4ade101b4711c115b805a Mon Sep 17 00:00:00 2001 From: "pangjiping.pjp" Date: Mon, 4 Jul 2022 16:57:44 +0800 Subject: [PATCH 06/14] fix --- pkg/systemlogmonitor/log_monitor.go | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/pkg/systemlogmonitor/log_monitor.go b/pkg/systemlogmonitor/log_monitor.go index 8e940636e..626069bf5 100644 --- a/pkg/systemlogmonitor/log_monitor.go +++ b/pkg/systemlogmonitor/log_monitor.go @@ -109,12 +109,7 @@ func NewLogMonitorOrDie(configPath string) types.Monitor { configPath: configPath, tomb: tomb.NewTomb(), rwLock: sync.RWMutex{}, - listOptions: metav1.ListOptions{ - FieldSelector: fmt.Sprintf("spec.nodeName=%s", nodeName), - ResourceVersion: "0", - TimeoutSeconds: &podEventTimeout, - }, - podList: make([]v1.Pod, 0), + podList: make([]v1.Pod, 0), } f, err := ioutil.ReadFile(configPath) @@ -165,7 +160,16 @@ func initializeProblemMetricsOrDie(rules []systemlogtypes.Rule) { func (l *logMonitor) Start() (<-chan *types.Status, error) { glog.Infof("Start log monitor %s", l.configPath) var err error + + // init listAndWatcher l.listerWatcher = k8sClient.CoreV1().Pods("") + l.listOptions = metav1.ListOptions{ + FieldSelector: fmt.Sprintf("spec.nodeName=%s", nodeName), + ResourceVersion: "0", + TimeoutSeconds: &podEventTimeout, + } + glog.V(4).Infof("init k8sClient %v and listOptions %v on node %s", l.listerWatcher, l.listOptions, nodeName) + l.logCh, err = l.watcher.Watch() if err != nil { return nil, err @@ -331,26 +335,31 @@ func (l *logMonitor) listAndWatch() { glog.Errorf("Failed to watch pod, err:%v", err) return } + glog.V(4).Infof("Start watch pod on Node %s", nodeName) select { case event := <-w.ResultChan(): switch event.Type { case watch.Added: + 
glog.V(4).Infof("Watch event Added") if err = l.list(); err != nil { glog.Errorf("Failed to list pods, err:%v", err) } w.Stop() case watch.Modified: + glog.V(4).Infof("Watch event Modified") if err = l.list(); err != nil { glog.Errorf("Failed to list pods, err:%v", err) } w.Stop() case watch.Deleted: + glog.V(4).Infof("Watch event Deleted") if err = l.list(); err != nil { glog.Errorf("Failed to list pods, err:%v", err) } w.Stop() default: + glog.V(4).Infof("Watch event %v", event.Type) // glog.Errorf("Invalid watch event %v", event.Type) w.Stop() } @@ -374,6 +383,7 @@ func (l *logMonitor) syncWith(obj runtime.Object, resourceVersion string) error defer l.rwLock.Unlock() l.podList = list.Items l.listOptions.ResourceVersion = resourceVersion + glog.V(4).Infof("syncWith logMonitor cache, length of podList: %v", len(l.podList)) return nil } @@ -399,8 +409,6 @@ func (l *logMonitor) list() error { // generateOOMMessage get oom pod message. // need read lock before use. func (l *logMonitor) generateOOMMessage(uuid string) string { - l.rwLock.RLock() - defer l.rwLock.RUnlock() for _, pod := range l.podList { if string(pod.UID) == uuid { return fmt.Sprintf("pod was OOM killed. node:%s pod:%s namespace:%s uuid:%s", From f369bfd6ef3fb754e0f7d9c17207f91541a6bc3f Mon Sep 17 00:00:00 2001 From: "pangjiping.pjp" Date: Mon, 4 Jul 2022 17:07:31 +0800 Subject: [PATCH 07/14] fix --- pkg/systemlogmonitor/log_monitor.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/systemlogmonitor/log_monitor.go b/pkg/systemlogmonitor/log_monitor.go index 626069bf5..07efa0218 100644 --- a/pkg/systemlogmonitor/log_monitor.go +++ b/pkg/systemlogmonitor/log_monitor.go @@ -415,7 +415,7 @@ func (l *logMonitor) generateOOMMessage(uuid string) string { pod.Spec.NodeName, pod.Name, pod.Namespace, uuid) } } - return "" + return fmt.Sprintf("pod was OOM killed. 
node:%s pod:Unknown namespace:Unknown uuid:%s", nodeName, uuid)
 }
 
 func initialConditions(defaults []types.Condition) []types.Condition {

From 79b89464713c827a435b03738be4a00e7b0304cf Mon Sep 17 00:00:00 2001
From: "pangjiping.pjp" 
Date: Thu, 11 Aug 2022 11:42:10 +0800
Subject: [PATCH 08/14] fix oom by cache

---
 go.mod                                        |    1 +
 go.sum                                        |    2 +
 pkg/systemlogmonitor/log_monitor.go           |  240 ++--
 pkg/util/helpers.go                           |   25 +
 .../patrickmn/go-cache/CONTRIBUTORS           |    9 +
 vendor/github.com/patrickmn/go-cache/LICENSE  |   19 +
 .../github.com/patrickmn/go-cache/README.md   |   83 ++
 vendor/github.com/patrickmn/go-cache/cache.go | 1161 +++++++++++++++++
 .../github.com/patrickmn/go-cache/sharded.go  |  192 +++
 vendor/modules.txt                            |    3 +
 10 files changed, 1603 insertions(+), 132 deletions(-)
 create mode 100644 vendor/github.com/patrickmn/go-cache/CONTRIBUTORS
 create mode 100644 vendor/github.com/patrickmn/go-cache/LICENSE
 create mode 100644 vendor/github.com/patrickmn/go-cache/README.md
 create mode 100644 vendor/github.com/patrickmn/go-cache/cache.go
 create mode 100644 vendor/github.com/patrickmn/go-cache/sharded.go

diff --git a/go.mod b/go.mod
index 54a325938..e780269a8 100644
--- a/go.mod
+++ b/go.mod
@@ -18,6 +18,7 @@ require (
 	github.com/hpcloud/tail v1.0.0
 	github.com/onsi/ginkgo v1.10.3
 	github.com/onsi/gomega v1.7.1
+	github.com/patrickmn/go-cache v2.1.0+incompatible
 	github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4
 	github.com/prometheus/common v0.4.1
 	github.com/prometheus/procfs v0.2.0

diff --git a/go.sum b/go.sum
index bb19e81d6..70f861cfd 100644
--- a/go.sum
+++ b/go.sum
@@ -330,6 +330,8 @@ github.com/opencontainers/runtime-tools v0.0.0-20181011054405-1d69bd0f9c39/go.mo
 github.com/opencontainers/selinux v1.3.3/go.mod h1:yTcKuYAh6R95iDpefGLQaPaRwJFwyzAJufJyiTt7s0g=
 github.com/openzipkin/zipkin-go v0.1.1/go.mod h1:NtoC/o8u3JlF1lSlyPNswIbeQH9bJTmOf0Erfk+hxe8=
 github.com/openzipkin/zipkin-go v0.1.6/go.mod h1:QgAqvLzwWbR/WpD4A3cGpPtJrZXNIiJc5AZX7/PBEpw=
+github.com/patrickmn/go-cache v2.1.0+incompatible h1:HRMgzkcYKYpi3C8ajMPV8OFXaaRUnok+kx1WdO15EQc=
+github.com/patrickmn/go-cache v2.1.0+incompatible/go.mod h1:3Qf8kWWT7OJRJbdiICTKqZju1ZixQ/KpMGzzAfe6+WQ=
 github.com/pborman/uuid v0.0.0-20150824212802-cccd189d45f7/go.mod h1:VyrYX9gd7irzKovcSS6BIIEwPRkP2Wm2m9ufcdFSJ34=
 github.com/pborman/uuid v0.0.0-20170612153648-e790cca94e6c/go.mod h1:VyrYX9gd7irzKovcSS6BIIEwPRkP2Wm2m9ufcdFSJ34=
 github.com/pborman/uuid v1.2.0/go.mod h1:X/NO0urCmaxf9VXbdlT7C2Yzkj2IKimNn4k+gtPdI/k=

diff --git a/pkg/systemlogmonitor/log_monitor.go b/pkg/systemlogmonitor/log_monitor.go
index 07efa0218..9c85a6296 100644
--- a/pkg/systemlogmonitor/log_monitor.go
+++ b/pkg/systemlogmonitor/log_monitor.go
@@ -18,13 +18,12 @@ package systemlogmonitor
 
 import (
 	"encoding/json"
+	"fmt"
+	"github.com/patrickmn/go-cache"
 	"io/ioutil"
 	v1 "k8s.io/api/core/v1"
-	"k8s.io/apimachinery/pkg/api/meta"
-	"k8s.io/apimachinery/pkg/runtime"
-	"k8s.io/apimachinery/pkg/watch"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	clientset "k8s.io/client-go/kubernetes"
-	v12 "k8s.io/client-go/kubernetes/typed/core/v1"
 	"k8s.io/heapster/common/kubernetes"
 	"k8s.io/node-problem-detector/cmd/options"
 	"net/url"
@@ -32,14 +31,9 @@ import (
 	"path/filepath"
 	"regexp"
 	"strings"
-	"sync"
-
-	"fmt"
 	"time"
 
 	"github.com/golang/glog"
-
-	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/node-problem-detector/pkg/problemdaemon"
 	"k8s.io/node-problem-detector/pkg/problemmetrics"
 	"k8s.io/node-problem-detector/pkg/systemlogmonitor/logwatchers"
@@ -58,10 +52,13 @@ const (
 )
 
 var (
-	uuidRegx
*regexp.Regexp
-	k8sClient       *clientset.Clientset
-	nodeName        string
-	podEventTimeout = int64(3) // 3 seconds
+	uuidRegx  *regexp.Regexp
+	k8sClient *clientset.Clientset
+	nodeName  string
+
+	// cache settings
+	cacheExpireDuration = time.Minute * 30  // cache entries expire after 30min
+	cacheCleanupInterval = time.Minute * 60 // expired entries are purged every 60min
 )
 
 func init() {
@@ -77,18 +74,19 @@ func init() {
 }
 
 type logMonitor struct {
-	configPath    string
-	watcher       watchertypes.LogWatcher
-	buffer        LogBuffer
-	config        MonitorConfig
-	conditions    []types.Condition
-	logCh         <-chan *logtypes.Log
-	output        chan *types.Status
-	tomb          *tomb.Tomb
-	podList       []v1.Pod
-	rwLock        sync.RWMutex
-	listOptions   metav1.ListOptions
-	listerWatcher v12.PodInterface
+	configPath string
+	watcher    watchertypes.LogWatcher
+	buffer     LogBuffer
+	config     MonitorConfig
+	conditions []types.Condition
+	logCh      <-chan *logtypes.Log
+	output     chan *types.Status
+	tomb       *tomb.Tomb
+
+	// cache-key: pod uid
+	// cache-value: string of the form pod_name@pod_namespace
+	// thread-safe; reads and writes need no extra locking
+	cache *cache.Cache
 }
 
 func InitK8sClientOrDie(options *options.NodeProblemDetectorOptions) *clientset.Clientset {
@@ -109,12 +106,7 @@ func NewLogMonitorOrDie(configPath string) types.Monitor {
 	l := &logMonitor{
 		configPath: configPath,
 		tomb:       tomb.NewTomb(),
-		rwLock:     sync.RWMutex{},
-		podList:    make([]v1.Pod, 0),
+		cache:      cache.New(cacheExpireDuration, cacheCleanupInterval),
 	}
 
 	f, err := ioutil.ReadFile(configPath)
@@ -160,21 +157,10 @@ func initializeProblemMetricsOrDie(rules []systemlogtypes.Rule) {
 func (l *logMonitor) Start() (<-chan *types.Status, error) {
 	glog.Infof("Start log monitor %s", l.configPath)
 	var err error
-
-	// init listAndWatcher
-	l.listerWatcher = k8sClient.CoreV1().Pods("")
-	l.listOptions = metav1.ListOptions{
-		FieldSelector: fmt.Sprintf("spec.nodeName=%s", nodeName),
-		ResourceVersion: "0",
-		TimeoutSeconds: &podEventTimeout,
-	}
-	glog.V(4).Infof("init k8sClient %v and listOptions %v on node %s", l.listerWatcher, l.listOptions, nodeName)
-
 	l.logCh, err = l.watcher.Watch()
 	if err != nil {
 		return nil, err
 	}
-	go l.listAndWatch()
 	go l.monitorLoop()
 	return l.output, nil
 }
@@ -228,25 +214,20 @@ func (l *logMonitor) generateStatus(logs []*logtypes.Log, rule systemlogtypes.Ru
 	// We use the timestamp of the first log line as the timestamp of the status.
 	timestamp := logs[0].Timestamp
 	message := generateMessage(logs)
-	if rule.Reason == OOMREASON && l.listerWatcher != nil {
+	if rule.Reason == OOMREASON && k8sClient != nil {
 		uuid := string(uuidRegx.Find([]byte(message)))
+		uuid = strings.ReplaceAll(uuid, "_", "-")
 
-		// visit cache to get podList
-		l.rwLock.RLock()
-		// if cache is empty, list pods
-		if l.podList == nil || len(l.podList) <= 0 {
-			if err := l.list(); err != nil {
-				glog.Errorf("Failed to list pods, err:%v", err)
-			}
+		// check cache
+		if val, ok := l.cache.Get(uuid); ok {
+			strs := strings.Split(val.(string), "@")
+			message = fmt.Sprintf("pod was OOM killed. node:%s pod:%s namespace:%s uuid:%s",
+				nodeName, strs[0], strs[1], uuid)
+			l.cache.Delete(uuid)
+		} else {
+			message = l.cachePod(uuid)
 		}
-
-		message = l.generateOOMMessage(uuid)
-		l.rwLock.RUnlock()
-	}
-
-	if l.listerWatcher == nil {
-		glog.Errorf("Empty ListAndWatcher!")
 	}
 
 	var events []types.Event
@@ -323,99 +304,94 @@ func (l *logMonitor) initializeStatus() {
 	}
 }
 
-// listAndWatch watch and watch handle.
-func (l *logMonitor) listAndWatch() {
-	if err := l.list(); err != nil {
-		glog.Errorf("Failed to init podList, err:%v", err)
+// Each node only runs 60-odd pods at most, right?
+// Is a paginated list strategy really necessary?
+func (l *logMonitor) cachePod(uuid string) string {
+	msgChan := make(chan string)
+	defer close(msgChan)
+
+	pl, err := k8sClient.CoreV1().Pods("").List(metav1.ListOptions{
+		ResourceVersion: "0",
+		FieldSelector:   fmt.Sprintf("spec.nodeName=%s", nodeName),
+	})
+	if err != nil {
+		glog.Errorf("Error in listing pods: %v", err.Error())
+		return ""
 	}
 
-	for {
-		w, err := l.listerWatcher.Watch(l.listOptions)
-		if err != nil {
-			glog.Errorf("Failed to watch pod, err:%v", err)
-			return
+	// update cache
+	go func(pods []v1.Pod) {
+		defer util.Recovery()
+		for _, pod := range pods {
+			if _, ok := l.cache.Get(string(pod.UID)); ok {
+				// TODO: for stable pods (ones unlikely to be OOM killed),
+				// should the cache entry be refreshed here, or is a lazy-update strategy good enough?
+			} else {
+				l.cache.Set(string(pod.UID), fmt.Sprintf("%s@%s", pod.Name, pod.Namespace), cache.DefaultExpiration+util.RandomCacheDuration())
+			}
 		}
-		glog.V(4).Infof("Start watch pod on Node %s", nodeName)
-
-		select {
-		case event := <-w.ResultChan():
-			switch event.Type {
-			case watch.Added:
-				glog.V(4).Infof("Watch event Added")
-				if err = l.list(); err != nil {
-					glog.Errorf("Failed to list pods, err:%v", err)
-				}
-				w.Stop()
-			case watch.Modified:
-				glog.V(4).Infof("Watch event Modified")
-				if err = l.list(); err != nil {
-					glog.Errorf("Failed to list pods, err:%v", err)
-				}
-				w.Stop()
-			case watch.Deleted:
-				glog.V(4).Infof("Watch event Deleted")
-				if err = l.list(); err != nil {
-					glog.Errorf("Failed to list pods, err:%v", err)
-				}
-				w.Stop()
-			default:
-				glog.V(4).Infof("Watch event %v", event.Type)
-				// glog.Errorf("Invalid watch event %v", event.Type)
-				w.Stop()
+	}(pl.Items)
+
+	go func(pods []v1.Pod) {
+		defer util.Recovery()
+		for _, pod := range pods {
+			if string(pod.UID) == uuid {
+				l.cache.Delete(uuid)
+				msgChan <- fmt.Sprintf("pod was OOM killed. node:%s pod:%s namespace:%s uuid:%s",
+					pod.Spec.NodeName, pod.Name, pod.Namespace, uuid)
+				return
 			}
-		case <-l.tomb.Stopping():
-			w.Stop()
-			glog.Infof("Log monitor stopped at %s", time.Now().Format("2006-01-02 15:04:05 MST Mon"))
-			return
 		}
+		msgChan <- ""
+	}(pl.Items)
+
+	select {
+	case msg := <-msgChan:
+		return msg
+	case <-time.After(time.Millisecond * 500):
+		return ""
 	}
 }
 
-// syncWith sync logMonitor's podList cache.
-// write lock inside.
-func (l *logMonitor) syncWith(obj runtime.Object, resourceVersion string) error {
-	list, ok := obj.(*v1.PodList)
-	if !ok {
-		return fmt.Errorf("cannot convert obj %v to podList", obj)
-	}
+func (l *logMonitor) cachePodWithPagination(uuid string) (message string) {
+	var nextToken = ""
+	var step int64 = 50
 
-	l.rwLock.Lock()
-	defer l.rwLock.Unlock()
-	l.podList = list.Items
-	l.listOptions.ResourceVersion = resourceVersion
-	glog.V(4).Infof("syncWith logMonitor cache, length of podList: %v", len(l.podList))
-	return nil
-}
+	// pagination
+	for {
+		pods, err := k8sClient.CoreV1().Pods("").List(metav1.ListOptions{
+			ResourceVersion: "0",
+			FieldSelector:   fmt.Sprintf("spec.nodeName=%s", nodeName),
+			Limit:           step,
+			Continue:        nextToken,
+		})
+		if err != nil {
+			glog.Errorf("Error in listing pods: %v", err.Error())
+			return
+		}

-// list lists pods with FieldSelector and resourceVersion.
-func (l *logMonitor) list() error {
-	var (
-		list runtime.Object
-		err error
-		listMetaInterface meta.List
-	)
-	if list, err = l.listerWatcher.List(l.listOptions); err != nil {
-		return err
-	}
-	if listMetaInterface, err = meta.ListAccessor(list); err != nil {
-		return err
-	}
-	resourceVersion := listMetaInterface.GetResourceVersion()
+		for _, pod := range pods.Items {
+			if string(pod.UID) == uuid {
+				message = fmt.Sprintf("pod was OOM killed. node:%s pod:%s namespace:%s uuid:%s",
+					pod.Spec.NodeName, pod.Name, pod.Namespace, uuid)
+				continue
+			}
 
-	err = l.syncWith(list, resourceVersion)
-	return err
-}
+			// cache the pod if it is not already present
+			if _, ok := l.cache.Get(string(pod.UID)); !ok {
+				l.cache.Set(string(pod.UID), fmt.Sprintf("%s@%s", pod.Name, pod.Namespace), cache.DefaultExpiration+util.RandomCacheDuration())
+			}
+		}
 
-// generateOOMMessage get oom pod message.
-// need read lock before use.
-func (l *logMonitor) generateOOMMessage(uuid string) string {
-	for _, pod := range l.podList {
-		if string(pod.UID) == uuid {
-			return fmt.Sprintf("pod was OOM killed. node:%s pod:%s namespace:%s uuid:%s",
-				pod.Spec.NodeName, pod.Name, pod.Namespace, uuid)
+		if pods.Continue == "" {
+			break
 		}
+		nextToken = pods.Continue
 	}
-	return fmt.Sprintf("pod was OOM killed. node:%s pod:Unknown namespace:Unknown uuid:%s", nodeName, uuid)
+
+	return
 }
 
 func initialConditions(defaults []types.Condition) []types.Condition {

diff --git a/pkg/util/helpers.go b/pkg/util/helpers.go
index e8889e938..fe78ee466 100644
--- a/pkg/util/helpers.go
+++ b/pkg/util/helpers.go
@@ -17,7 +17,10 @@ package util
 
 import (
 	"fmt"
+	"github.com/golang/glog"
+	"math/rand"
 	"regexp"
+	"runtime"
 	"time"
 
 	"k8s.io/node-problem-detector/pkg/types"
@@ -64,3 +67,25 @@ func GetStartTime(now time.Time, uptimeDuration time.Duration, lookbackStr strin
 
 	return startTime, nil
 }
+
+func Recovery() error {
+	if err := recover(); err != nil {
+		var e error
+		switch r := err.(type) {
+		case error:
+			e = r
+		default:
+			e = fmt.Errorf("%v", r)
+		}
+		stack := make([]byte, 2048)
+		length := runtime.Stack(stack, true)
+		glog.Errorf("[%s] %s %s", "PANIC RECOVER", e, stack[:length])
+		return e
+	}
+	return nil
+}
+
+func RandomCacheDuration() time.Duration {
+	rand.Seed(time.Now().Unix())
+	return time.Duration(rand.Int63n(10) * int64(time.Minute))
+}
diff --git a/vendor/github.com/patrickmn/go-cache/CONTRIBUTORS b/vendor/github.com/patrickmn/go-cache/CONTRIBUTORS
new file mode 100644
index 000000000..2b16e9974
--- /dev/null
+++ b/vendor/github.com/patrickmn/go-cache/CONTRIBUTORS
@@ -0,0 +1,9 @@
+This is a list of people who have contributed code to go-cache. They, or their
+employers, are the copyright holders of the contributed code. Contributed code
+is subject to the license restrictions listed in LICENSE (as they were when the
+code was contributed.)
+ +Dustin Sallings +Jason Mooberry +Sergey Shepelev +Alex Edwards diff --git a/vendor/github.com/patrickmn/go-cache/LICENSE b/vendor/github.com/patrickmn/go-cache/LICENSE new file mode 100644 index 000000000..db9903c75 --- /dev/null +++ b/vendor/github.com/patrickmn/go-cache/LICENSE @@ -0,0 +1,19 @@ +Copyright (c) 2012-2017 Patrick Mylund Nielsen and the go-cache contributors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/vendor/github.com/patrickmn/go-cache/README.md b/vendor/github.com/patrickmn/go-cache/README.md new file mode 100644 index 000000000..c5789cc66 --- /dev/null +++ b/vendor/github.com/patrickmn/go-cache/README.md @@ -0,0 +1,83 @@ +# go-cache + +go-cache is an in-memory key:value store/cache similar to memcached that is +suitable for applications running on a single machine. Its major advantage is +that, being essentially a thread-safe `map[string]interface{}` with expiration +times, it doesn't need to serialize or transmit its contents over the network. + +Any object can be stored, for a given duration or forever, and the cache can be +safely used by multiple goroutines. + +Although go-cache isn't meant to be used as a persistent datastore, the entire +cache can be saved to and loaded from a file (using `c.Items()` to retrieve the +items map to serialize, and `NewFrom()` to create a cache from a deserialized +one) to recover from downtime quickly. (See the docs for `NewFrom()` for caveats.) + +### Installation + +`go get github.com/patrickmn/go-cache` + +### Usage + +```go +import ( + "fmt" + "github.com/patrickmn/go-cache" + "time" +) + +func main() { + // Create a cache with a default expiration time of 5 minutes, and which + // purges expired items every 10 minutes + c := cache.New(5*time.Minute, 10*time.Minute) + + // Set the value of the key "foo" to "bar", with the default expiration time + c.Set("foo", "bar", cache.DefaultExpiration) + + // Set the value of the key "baz" to 42, with no expiration time + // (the item won't be removed until it is re-set, or removed using + // c.Delete("baz") + c.Set("baz", 42, cache.NoExpiration) + + // Get the string associated with the key "foo" from the cache + foo, found := c.Get("foo") + if found { + fmt.Println(foo) + } + + // Since Go is statically typed, and cache values can be anything, type + // assertion is needed when values are being passed to functions that don't + // take arbitrary types, (i.e. interface{}). The simplest way to do this for + // values which will only be used once--e.g. 
for passing to another + // function--is: + foo, found := c.Get("foo") + if found { + MyFunction(foo.(string)) + } + + // This gets tedious if the value is used several times in the same function. + // You might do either of the following instead: + if x, found := c.Get("foo"); found { + foo := x.(string) + // ... + } + // or + var foo string + if x, found := c.Get("foo"); found { + foo = x.(string) + } + // ... + // foo can then be passed around freely as a string + + // Want performance? Store pointers! + c.Set("foo", &MyStruct, cache.DefaultExpiration) + if x, found := c.Get("foo"); found { + foo := x.(*MyStruct) + // ... + } +} +``` + +### Reference + +`godoc` or [http://godoc.org/github.com/patrickmn/go-cache](http://godoc.org/github.com/patrickmn/go-cache) diff --git a/vendor/github.com/patrickmn/go-cache/cache.go b/vendor/github.com/patrickmn/go-cache/cache.go new file mode 100644 index 000000000..db88d2f2c --- /dev/null +++ b/vendor/github.com/patrickmn/go-cache/cache.go @@ -0,0 +1,1161 @@ +package cache + +import ( + "encoding/gob" + "fmt" + "io" + "os" + "runtime" + "sync" + "time" +) + +type Item struct { + Object interface{} + Expiration int64 +} + +// Returns true if the item has expired. +func (item Item) Expired() bool { + if item.Expiration == 0 { + return false + } + return time.Now().UnixNano() > item.Expiration +} + +const ( + // For use with functions that take an expiration time. + NoExpiration time.Duration = -1 + // For use with functions that take an expiration time. Equivalent to + // passing in the same expiration duration as was given to New() or + // NewFrom() when the cache was created (e.g. 5 minutes.) + DefaultExpiration time.Duration = 0 +) + +type Cache struct { + *cache + // If this is confusing, see the comment at the bottom of New() +} + +type cache struct { + defaultExpiration time.Duration + items map[string]Item + mu sync.RWMutex + onEvicted func(string, interface{}) + janitor *janitor +} + +// Add an item to the cache, replacing any existing item. If the duration is 0 +// (DefaultExpiration), the cache's default expiration time is used. If it is -1 +// (NoExpiration), the item never expires. +func (c *cache) Set(k string, x interface{}, d time.Duration) { + // "Inlining" of set + var e int64 + if d == DefaultExpiration { + d = c.defaultExpiration + } + if d > 0 { + e = time.Now().Add(d).UnixNano() + } + c.mu.Lock() + c.items[k] = Item{ + Object: x, + Expiration: e, + } + // TODO: Calls to mu.Unlock are currently not deferred because defer + // adds ~200 ns (as of go1.) + c.mu.Unlock() +} + +func (c *cache) set(k string, x interface{}, d time.Duration) { + var e int64 + if d == DefaultExpiration { + d = c.defaultExpiration + } + if d > 0 { + e = time.Now().Add(d).UnixNano() + } + c.items[k] = Item{ + Object: x, + Expiration: e, + } +} + +// Add an item to the cache, replacing any existing item, using the default +// expiration. +func (c *cache) SetDefault(k string, x interface{}) { + c.Set(k, x, DefaultExpiration) +} + +// Add an item to the cache only if an item doesn't already exist for the given +// key, or if the existing item has expired. Returns an error otherwise. +func (c *cache) Add(k string, x interface{}, d time.Duration) error { + c.mu.Lock() + _, found := c.get(k) + if found { + c.mu.Unlock() + return fmt.Errorf("Item %s already exists", k) + } + c.set(k, x, d) + c.mu.Unlock() + return nil +} + +// Set a new value for the cache key only if it already exists, and the existing +// item hasn't expired. Returns an error otherwise. 
+func (c *cache) Replace(k string, x interface{}, d time.Duration) error { + c.mu.Lock() + _, found := c.get(k) + if !found { + c.mu.Unlock() + return fmt.Errorf("Item %s doesn't exist", k) + } + c.set(k, x, d) + c.mu.Unlock() + return nil +} + +// Get an item from the cache. Returns the item or nil, and a bool indicating +// whether the key was found. +func (c *cache) Get(k string) (interface{}, bool) { + c.mu.RLock() + // "Inlining" of get and Expired + item, found := c.items[k] + if !found { + c.mu.RUnlock() + return nil, false + } + if item.Expiration > 0 { + if time.Now().UnixNano() > item.Expiration { + c.mu.RUnlock() + return nil, false + } + } + c.mu.RUnlock() + return item.Object, true +} + +// GetWithExpiration returns an item and its expiration time from the cache. +// It returns the item or nil, the expiration time if one is set (if the item +// never expires a zero value for time.Time is returned), and a bool indicating +// whether the key was found. +func (c *cache) GetWithExpiration(k string) (interface{}, time.Time, bool) { + c.mu.RLock() + // "Inlining" of get and Expired + item, found := c.items[k] + if !found { + c.mu.RUnlock() + return nil, time.Time{}, false + } + + if item.Expiration > 0 { + if time.Now().UnixNano() > item.Expiration { + c.mu.RUnlock() + return nil, time.Time{}, false + } + + // Return the item and the expiration time + c.mu.RUnlock() + return item.Object, time.Unix(0, item.Expiration), true + } + + // If expiration <= 0 (i.e. no expiration time set) then return the item + // and a zeroed time.Time + c.mu.RUnlock() + return item.Object, time.Time{}, true +} + +func (c *cache) get(k string) (interface{}, bool) { + item, found := c.items[k] + if !found { + return nil, false + } + // "Inlining" of Expired + if item.Expiration > 0 { + if time.Now().UnixNano() > item.Expiration { + return nil, false + } + } + return item.Object, true +} + +// Increment an item of type int, int8, int16, int32, int64, uintptr, uint, +// uint8, uint32, or uint64, float32 or float64 by n. Returns an error if the +// item's value is not an integer, if it was not found, or if it is not +// possible to increment it by n. To retrieve the incremented value, use one +// of the specialized methods, e.g. IncrementInt64. +func (c *cache) Increment(k string, n int64) error { + c.mu.Lock() + v, found := c.items[k] + if !found || v.Expired() { + c.mu.Unlock() + return fmt.Errorf("Item %s not found", k) + } + switch v.Object.(type) { + case int: + v.Object = v.Object.(int) + int(n) + case int8: + v.Object = v.Object.(int8) + int8(n) + case int16: + v.Object = v.Object.(int16) + int16(n) + case int32: + v.Object = v.Object.(int32) + int32(n) + case int64: + v.Object = v.Object.(int64) + n + case uint: + v.Object = v.Object.(uint) + uint(n) + case uintptr: + v.Object = v.Object.(uintptr) + uintptr(n) + case uint8: + v.Object = v.Object.(uint8) + uint8(n) + case uint16: + v.Object = v.Object.(uint16) + uint16(n) + case uint32: + v.Object = v.Object.(uint32) + uint32(n) + case uint64: + v.Object = v.Object.(uint64) + uint64(n) + case float32: + v.Object = v.Object.(float32) + float32(n) + case float64: + v.Object = v.Object.(float64) + float64(n) + default: + c.mu.Unlock() + return fmt.Errorf("The value for %s is not an integer", k) + } + c.items[k] = v + c.mu.Unlock() + return nil +} + +// Increment an item of type float32 or float64 by n. Returns an error if the +// item's value is not floating point, if it was not found, or if it is not +// possible to increment it by n. 
Pass a negative number to decrement the +// value. To retrieve the incremented value, use one of the specialized methods, +// e.g. IncrementFloat64. +func (c *cache) IncrementFloat(k string, n float64) error { + c.mu.Lock() + v, found := c.items[k] + if !found || v.Expired() { + c.mu.Unlock() + return fmt.Errorf("Item %s not found", k) + } + switch v.Object.(type) { + case float32: + v.Object = v.Object.(float32) + float32(n) + case float64: + v.Object = v.Object.(float64) + n + default: + c.mu.Unlock() + return fmt.Errorf("The value for %s does not have type float32 or float64", k) + } + c.items[k] = v + c.mu.Unlock() + return nil +} + +// Increment an item of type int by n. Returns an error if the item's value is +// not an int, or if it was not found. If there is no error, the incremented +// value is returned. +func (c *cache) IncrementInt(k string, n int) (int, error) { + c.mu.Lock() + v, found := c.items[k] + if !found || v.Expired() { + c.mu.Unlock() + return 0, fmt.Errorf("Item %s not found", k) + } + rv, ok := v.Object.(int) + if !ok { + c.mu.Unlock() + return 0, fmt.Errorf("The value for %s is not an int", k) + } + nv := rv + n + v.Object = nv + c.items[k] = v + c.mu.Unlock() + return nv, nil +} + +// Increment an item of type int8 by n. Returns an error if the item's value is +// not an int8, or if it was not found. If there is no error, the incremented +// value is returned. +func (c *cache) IncrementInt8(k string, n int8) (int8, error) { + c.mu.Lock() + v, found := c.items[k] + if !found || v.Expired() { + c.mu.Unlock() + return 0, fmt.Errorf("Item %s not found", k) + } + rv, ok := v.Object.(int8) + if !ok { + c.mu.Unlock() + return 0, fmt.Errorf("The value for %s is not an int8", k) + } + nv := rv + n + v.Object = nv + c.items[k] = v + c.mu.Unlock() + return nv, nil +} + +// Increment an item of type int16 by n. Returns an error if the item's value is +// not an int16, or if it was not found. If there is no error, the incremented +// value is returned. +func (c *cache) IncrementInt16(k string, n int16) (int16, error) { + c.mu.Lock() + v, found := c.items[k] + if !found || v.Expired() { + c.mu.Unlock() + return 0, fmt.Errorf("Item %s not found", k) + } + rv, ok := v.Object.(int16) + if !ok { + c.mu.Unlock() + return 0, fmt.Errorf("The value for %s is not an int16", k) + } + nv := rv + n + v.Object = nv + c.items[k] = v + c.mu.Unlock() + return nv, nil +} + +// Increment an item of type int32 by n. Returns an error if the item's value is +// not an int32, or if it was not found. If there is no error, the incremented +// value is returned. +func (c *cache) IncrementInt32(k string, n int32) (int32, error) { + c.mu.Lock() + v, found := c.items[k] + if !found || v.Expired() { + c.mu.Unlock() + return 0, fmt.Errorf("Item %s not found", k) + } + rv, ok := v.Object.(int32) + if !ok { + c.mu.Unlock() + return 0, fmt.Errorf("The value for %s is not an int32", k) + } + nv := rv + n + v.Object = nv + c.items[k] = v + c.mu.Unlock() + return nv, nil +} + +// Increment an item of type int64 by n. Returns an error if the item's value is +// not an int64, or if it was not found. If there is no error, the incremented +// value is returned. 
+func (c *cache) IncrementInt64(k string, n int64) (int64, error) { + c.mu.Lock() + v, found := c.items[k] + if !found || v.Expired() { + c.mu.Unlock() + return 0, fmt.Errorf("Item %s not found", k) + } + rv, ok := v.Object.(int64) + if !ok { + c.mu.Unlock() + return 0, fmt.Errorf("The value for %s is not an int64", k) + } + nv := rv + n + v.Object = nv + c.items[k] = v + c.mu.Unlock() + return nv, nil +} + +// Increment an item of type uint by n. Returns an error if the item's value is +// not an uint, or if it was not found. If there is no error, the incremented +// value is returned. +func (c *cache) IncrementUint(k string, n uint) (uint, error) { + c.mu.Lock() + v, found := c.items[k] + if !found || v.Expired() { + c.mu.Unlock() + return 0, fmt.Errorf("Item %s not found", k) + } + rv, ok := v.Object.(uint) + if !ok { + c.mu.Unlock() + return 0, fmt.Errorf("The value for %s is not an uint", k) + } + nv := rv + n + v.Object = nv + c.items[k] = v + c.mu.Unlock() + return nv, nil +} + +// Increment an item of type uintptr by n. Returns an error if the item's value +// is not an uintptr, or if it was not found. If there is no error, the +// incremented value is returned. +func (c *cache) IncrementUintptr(k string, n uintptr) (uintptr, error) { + c.mu.Lock() + v, found := c.items[k] + if !found || v.Expired() { + c.mu.Unlock() + return 0, fmt.Errorf("Item %s not found", k) + } + rv, ok := v.Object.(uintptr) + if !ok { + c.mu.Unlock() + return 0, fmt.Errorf("The value for %s is not an uintptr", k) + } + nv := rv + n + v.Object = nv + c.items[k] = v + c.mu.Unlock() + return nv, nil +} + +// Increment an item of type uint8 by n. Returns an error if the item's value +// is not an uint8, or if it was not found. If there is no error, the +// incremented value is returned. +func (c *cache) IncrementUint8(k string, n uint8) (uint8, error) { + c.mu.Lock() + v, found := c.items[k] + if !found || v.Expired() { + c.mu.Unlock() + return 0, fmt.Errorf("Item %s not found", k) + } + rv, ok := v.Object.(uint8) + if !ok { + c.mu.Unlock() + return 0, fmt.Errorf("The value for %s is not an uint8", k) + } + nv := rv + n + v.Object = nv + c.items[k] = v + c.mu.Unlock() + return nv, nil +} + +// Increment an item of type uint16 by n. Returns an error if the item's value +// is not an uint16, or if it was not found. If there is no error, the +// incremented value is returned. +func (c *cache) IncrementUint16(k string, n uint16) (uint16, error) { + c.mu.Lock() + v, found := c.items[k] + if !found || v.Expired() { + c.mu.Unlock() + return 0, fmt.Errorf("Item %s not found", k) + } + rv, ok := v.Object.(uint16) + if !ok { + c.mu.Unlock() + return 0, fmt.Errorf("The value for %s is not an uint16", k) + } + nv := rv + n + v.Object = nv + c.items[k] = v + c.mu.Unlock() + return nv, nil +} + +// Increment an item of type uint32 by n. Returns an error if the item's value +// is not an uint32, or if it was not found. If there is no error, the +// incremented value is returned. +func (c *cache) IncrementUint32(k string, n uint32) (uint32, error) { + c.mu.Lock() + v, found := c.items[k] + if !found || v.Expired() { + c.mu.Unlock() + return 0, fmt.Errorf("Item %s not found", k) + } + rv, ok := v.Object.(uint32) + if !ok { + c.mu.Unlock() + return 0, fmt.Errorf("The value for %s is not an uint32", k) + } + nv := rv + n + v.Object = nv + c.items[k] = v + c.mu.Unlock() + return nv, nil +} + +// Increment an item of type uint64 by n. Returns an error if the item's value +// is not an uint64, or if it was not found. 
If there is no error, the +// incremented value is returned. +func (c *cache) IncrementUint64(k string, n uint64) (uint64, error) { + c.mu.Lock() + v, found := c.items[k] + if !found || v.Expired() { + c.mu.Unlock() + return 0, fmt.Errorf("Item %s not found", k) + } + rv, ok := v.Object.(uint64) + if !ok { + c.mu.Unlock() + return 0, fmt.Errorf("The value for %s is not an uint64", k) + } + nv := rv + n + v.Object = nv + c.items[k] = v + c.mu.Unlock() + return nv, nil +} + +// Increment an item of type float32 by n. Returns an error if the item's value +// is not an float32, or if it was not found. If there is no error, the +// incremented value is returned. +func (c *cache) IncrementFloat32(k string, n float32) (float32, error) { + c.mu.Lock() + v, found := c.items[k] + if !found || v.Expired() { + c.mu.Unlock() + return 0, fmt.Errorf("Item %s not found", k) + } + rv, ok := v.Object.(float32) + if !ok { + c.mu.Unlock() + return 0, fmt.Errorf("The value for %s is not an float32", k) + } + nv := rv + n + v.Object = nv + c.items[k] = v + c.mu.Unlock() + return nv, nil +} + +// Increment an item of type float64 by n. Returns an error if the item's value +// is not an float64, or if it was not found. If there is no error, the +// incremented value is returned. +func (c *cache) IncrementFloat64(k string, n float64) (float64, error) { + c.mu.Lock() + v, found := c.items[k] + if !found || v.Expired() { + c.mu.Unlock() + return 0, fmt.Errorf("Item %s not found", k) + } + rv, ok := v.Object.(float64) + if !ok { + c.mu.Unlock() + return 0, fmt.Errorf("The value for %s is not an float64", k) + } + nv := rv + n + v.Object = nv + c.items[k] = v + c.mu.Unlock() + return nv, nil +} + +// Decrement an item of type int, int8, int16, int32, int64, uintptr, uint, +// uint8, uint32, or uint64, float32 or float64 by n. Returns an error if the +// item's value is not an integer, if it was not found, or if it is not +// possible to decrement it by n. To retrieve the decremented value, use one +// of the specialized methods, e.g. DecrementInt64. +func (c *cache) Decrement(k string, n int64) error { + // TODO: Implement Increment and Decrement more cleanly. + // (Cannot do Increment(k, n*-1) for uints.) + c.mu.Lock() + v, found := c.items[k] + if !found || v.Expired() { + c.mu.Unlock() + return fmt.Errorf("Item not found") + } + switch v.Object.(type) { + case int: + v.Object = v.Object.(int) - int(n) + case int8: + v.Object = v.Object.(int8) - int8(n) + case int16: + v.Object = v.Object.(int16) - int16(n) + case int32: + v.Object = v.Object.(int32) - int32(n) + case int64: + v.Object = v.Object.(int64) - n + case uint: + v.Object = v.Object.(uint) - uint(n) + case uintptr: + v.Object = v.Object.(uintptr) - uintptr(n) + case uint8: + v.Object = v.Object.(uint8) - uint8(n) + case uint16: + v.Object = v.Object.(uint16) - uint16(n) + case uint32: + v.Object = v.Object.(uint32) - uint32(n) + case uint64: + v.Object = v.Object.(uint64) - uint64(n) + case float32: + v.Object = v.Object.(float32) - float32(n) + case float64: + v.Object = v.Object.(float64) - float64(n) + default: + c.mu.Unlock() + return fmt.Errorf("The value for %s is not an integer", k) + } + c.items[k] = v + c.mu.Unlock() + return nil +} + +// Decrement an item of type float32 or float64 by n. Returns an error if the +// item's value is not floating point, if it was not found, or if it is not +// possible to decrement it by n. Pass a negative number to decrement the +// value. 
To retrieve the decremented value, use one of the specialized methods, +// e.g. DecrementFloat64. +func (c *cache) DecrementFloat(k string, n float64) error { + c.mu.Lock() + v, found := c.items[k] + if !found || v.Expired() { + c.mu.Unlock() + return fmt.Errorf("Item %s not found", k) + } + switch v.Object.(type) { + case float32: + v.Object = v.Object.(float32) - float32(n) + case float64: + v.Object = v.Object.(float64) - n + default: + c.mu.Unlock() + return fmt.Errorf("The value for %s does not have type float32 or float64", k) + } + c.items[k] = v + c.mu.Unlock() + return nil +} + +// Decrement an item of type int by n. Returns an error if the item's value is +// not an int, or if it was not found. If there is no error, the decremented +// value is returned. +func (c *cache) DecrementInt(k string, n int) (int, error) { + c.mu.Lock() + v, found := c.items[k] + if !found || v.Expired() { + c.mu.Unlock() + return 0, fmt.Errorf("Item %s not found", k) + } + rv, ok := v.Object.(int) + if !ok { + c.mu.Unlock() + return 0, fmt.Errorf("The value for %s is not an int", k) + } + nv := rv - n + v.Object = nv + c.items[k] = v + c.mu.Unlock() + return nv, nil +} + +// Decrement an item of type int8 by n. Returns an error if the item's value is +// not an int8, or if it was not found. If there is no error, the decremented +// value is returned. +func (c *cache) DecrementInt8(k string, n int8) (int8, error) { + c.mu.Lock() + v, found := c.items[k] + if !found || v.Expired() { + c.mu.Unlock() + return 0, fmt.Errorf("Item %s not found", k) + } + rv, ok := v.Object.(int8) + if !ok { + c.mu.Unlock() + return 0, fmt.Errorf("The value for %s is not an int8", k) + } + nv := rv - n + v.Object = nv + c.items[k] = v + c.mu.Unlock() + return nv, nil +} + +// Decrement an item of type int16 by n. Returns an error if the item's value is +// not an int16, or if it was not found. If there is no error, the decremented +// value is returned. +func (c *cache) DecrementInt16(k string, n int16) (int16, error) { + c.mu.Lock() + v, found := c.items[k] + if !found || v.Expired() { + c.mu.Unlock() + return 0, fmt.Errorf("Item %s not found", k) + } + rv, ok := v.Object.(int16) + if !ok { + c.mu.Unlock() + return 0, fmt.Errorf("The value for %s is not an int16", k) + } + nv := rv - n + v.Object = nv + c.items[k] = v + c.mu.Unlock() + return nv, nil +} + +// Decrement an item of type int32 by n. Returns an error if the item's value is +// not an int32, or if it was not found. If there is no error, the decremented +// value is returned. +func (c *cache) DecrementInt32(k string, n int32) (int32, error) { + c.mu.Lock() + v, found := c.items[k] + if !found || v.Expired() { + c.mu.Unlock() + return 0, fmt.Errorf("Item %s not found", k) + } + rv, ok := v.Object.(int32) + if !ok { + c.mu.Unlock() + return 0, fmt.Errorf("The value for %s is not an int32", k) + } + nv := rv - n + v.Object = nv + c.items[k] = v + c.mu.Unlock() + return nv, nil +} + +// Decrement an item of type int64 by n. Returns an error if the item's value is +// not an int64, or if it was not found. If there is no error, the decremented +// value is returned. 
+func (c *cache) DecrementInt64(k string, n int64) (int64, error) { + c.mu.Lock() + v, found := c.items[k] + if !found || v.Expired() { + c.mu.Unlock() + return 0, fmt.Errorf("Item %s not found", k) + } + rv, ok := v.Object.(int64) + if !ok { + c.mu.Unlock() + return 0, fmt.Errorf("The value for %s is not an int64", k) + } + nv := rv - n + v.Object = nv + c.items[k] = v + c.mu.Unlock() + return nv, nil +} + +// Decrement an item of type uint by n. Returns an error if the item's value is +// not an uint, or if it was not found. If there is no error, the decremented +// value is returned. +func (c *cache) DecrementUint(k string, n uint) (uint, error) { + c.mu.Lock() + v, found := c.items[k] + if !found || v.Expired() { + c.mu.Unlock() + return 0, fmt.Errorf("Item %s not found", k) + } + rv, ok := v.Object.(uint) + if !ok { + c.mu.Unlock() + return 0, fmt.Errorf("The value for %s is not an uint", k) + } + nv := rv - n + v.Object = nv + c.items[k] = v + c.mu.Unlock() + return nv, nil +} + +// Decrement an item of type uintptr by n. Returns an error if the item's value +// is not an uintptr, or if it was not found. If there is no error, the +// decremented value is returned. +func (c *cache) DecrementUintptr(k string, n uintptr) (uintptr, error) { + c.mu.Lock() + v, found := c.items[k] + if !found || v.Expired() { + c.mu.Unlock() + return 0, fmt.Errorf("Item %s not found", k) + } + rv, ok := v.Object.(uintptr) + if !ok { + c.mu.Unlock() + return 0, fmt.Errorf("The value for %s is not an uintptr", k) + } + nv := rv - n + v.Object = nv + c.items[k] = v + c.mu.Unlock() + return nv, nil +} + +// Decrement an item of type uint8 by n. Returns an error if the item's value is +// not an uint8, or if it was not found. If there is no error, the decremented +// value is returned. +func (c *cache) DecrementUint8(k string, n uint8) (uint8, error) { + c.mu.Lock() + v, found := c.items[k] + if !found || v.Expired() { + c.mu.Unlock() + return 0, fmt.Errorf("Item %s not found", k) + } + rv, ok := v.Object.(uint8) + if !ok { + c.mu.Unlock() + return 0, fmt.Errorf("The value for %s is not an uint8", k) + } + nv := rv - n + v.Object = nv + c.items[k] = v + c.mu.Unlock() + return nv, nil +} + +// Decrement an item of type uint16 by n. Returns an error if the item's value +// is not an uint16, or if it was not found. If there is no error, the +// decremented value is returned. +func (c *cache) DecrementUint16(k string, n uint16) (uint16, error) { + c.mu.Lock() + v, found := c.items[k] + if !found || v.Expired() { + c.mu.Unlock() + return 0, fmt.Errorf("Item %s not found", k) + } + rv, ok := v.Object.(uint16) + if !ok { + c.mu.Unlock() + return 0, fmt.Errorf("The value for %s is not an uint16", k) + } + nv := rv - n + v.Object = nv + c.items[k] = v + c.mu.Unlock() + return nv, nil +} + +// Decrement an item of type uint32 by n. Returns an error if the item's value +// is not an uint32, or if it was not found. If there is no error, the +// decremented value is returned. +func (c *cache) DecrementUint32(k string, n uint32) (uint32, error) { + c.mu.Lock() + v, found := c.items[k] + if !found || v.Expired() { + c.mu.Unlock() + return 0, fmt.Errorf("Item %s not found", k) + } + rv, ok := v.Object.(uint32) + if !ok { + c.mu.Unlock() + return 0, fmt.Errorf("The value for %s is not an uint32", k) + } + nv := rv - n + v.Object = nv + c.items[k] = v + c.mu.Unlock() + return nv, nil +} + +// Decrement an item of type uint64 by n. Returns an error if the item's value +// is not an uint64, or if it was not found. 
If there is no error, the +// decremented value is returned. +func (c *cache) DecrementUint64(k string, n uint64) (uint64, error) { + c.mu.Lock() + v, found := c.items[k] + if !found || v.Expired() { + c.mu.Unlock() + return 0, fmt.Errorf("Item %s not found", k) + } + rv, ok := v.Object.(uint64) + if !ok { + c.mu.Unlock() + return 0, fmt.Errorf("The value for %s is not an uint64", k) + } + nv := rv - n + v.Object = nv + c.items[k] = v + c.mu.Unlock() + return nv, nil +} + +// Decrement an item of type float32 by n. Returns an error if the item's value +// is not an float32, or if it was not found. If there is no error, the +// decremented value is returned. +func (c *cache) DecrementFloat32(k string, n float32) (float32, error) { + c.mu.Lock() + v, found := c.items[k] + if !found || v.Expired() { + c.mu.Unlock() + return 0, fmt.Errorf("Item %s not found", k) + } + rv, ok := v.Object.(float32) + if !ok { + c.mu.Unlock() + return 0, fmt.Errorf("The value for %s is not an float32", k) + } + nv := rv - n + v.Object = nv + c.items[k] = v + c.mu.Unlock() + return nv, nil +} + +// Decrement an item of type float64 by n. Returns an error if the item's value +// is not an float64, or if it was not found. If there is no error, the +// decremented value is returned. +func (c *cache) DecrementFloat64(k string, n float64) (float64, error) { + c.mu.Lock() + v, found := c.items[k] + if !found || v.Expired() { + c.mu.Unlock() + return 0, fmt.Errorf("Item %s not found", k) + } + rv, ok := v.Object.(float64) + if !ok { + c.mu.Unlock() + return 0, fmt.Errorf("The value for %s is not an float64", k) + } + nv := rv - n + v.Object = nv + c.items[k] = v + c.mu.Unlock() + return nv, nil +} + +// Delete an item from the cache. Does nothing if the key is not in the cache. +func (c *cache) Delete(k string) { + c.mu.Lock() + v, evicted := c.delete(k) + c.mu.Unlock() + if evicted { + c.onEvicted(k, v) + } +} + +func (c *cache) delete(k string) (interface{}, bool) { + if c.onEvicted != nil { + if v, found := c.items[k]; found { + delete(c.items, k) + return v.Object, true + } + } + delete(c.items, k) + return nil, false +} + +type keyAndValue struct { + key string + value interface{} +} + +// Delete all expired items from the cache. +func (c *cache) DeleteExpired() { + var evictedItems []keyAndValue + now := time.Now().UnixNano() + c.mu.Lock() + for k, v := range c.items { + // "Inlining" of expired + if v.Expiration > 0 && now > v.Expiration { + ov, evicted := c.delete(k) + if evicted { + evictedItems = append(evictedItems, keyAndValue{k, ov}) + } + } + } + c.mu.Unlock() + for _, v := range evictedItems { + c.onEvicted(v.key, v.value) + } +} + +// Sets an (optional) function that is called with the key and value when an +// item is evicted from the cache. (Including when it is deleted manually, but +// not when it is overwritten.) Set to nil to disable. +func (c *cache) OnEvicted(f func(string, interface{})) { + c.mu.Lock() + c.onEvicted = f + c.mu.Unlock() +} + +// Write the cache's items (using Gob) to an io.Writer. +// +// NOTE: This method is deprecated in favor of c.Items() and NewFrom() (see the +// documentation for NewFrom().) 
+
+// Write the cache's items (using Gob) to an io.Writer.
+//
+// NOTE: This method is deprecated in favor of c.Items() and NewFrom() (see the
+// documentation for NewFrom().)
+func (c *cache) Save(w io.Writer) (err error) {
+	enc := gob.NewEncoder(w)
+	defer func() {
+		if x := recover(); x != nil {
+			err = fmt.Errorf("Error registering item types with Gob library")
+		}
+	}()
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+	for _, v := range c.items {
+		gob.Register(v.Object)
+	}
+	err = enc.Encode(&c.items)
+	return
+}
+
+// Save the cache's items to the given filename, creating the file if it
+// doesn't exist, and overwriting it if it does.
+//
+// NOTE: This method is deprecated in favor of c.Items() and NewFrom() (see the
+// documentation for NewFrom().)
+func (c *cache) SaveFile(fname string) error {
+	fp, err := os.Create(fname)
+	if err != nil {
+		return err
+	}
+	err = c.Save(fp)
+	if err != nil {
+		fp.Close()
+		return err
+	}
+	return fp.Close()
+}
+
+// Add (Gob-serialized) cache items from an io.Reader, excluding any items with
+// keys that already exist (and haven't expired) in the current cache.
+//
+// NOTE: This method is deprecated in favor of c.Items() and NewFrom() (see the
+// documentation for NewFrom().)
+func (c *cache) Load(r io.Reader) error {
+	dec := gob.NewDecoder(r)
+	items := map[string]Item{}
+	err := dec.Decode(&items)
+	if err == nil {
+		c.mu.Lock()
+		defer c.mu.Unlock()
+		for k, v := range items {
+			ov, found := c.items[k]
+			if !found || ov.Expired() {
+				c.items[k] = v
+			}
+		}
+	}
+	return err
+}
+
+// Load and add cache items from the given filename, excluding any items with
+// keys that already exist in the current cache.
+//
+// NOTE: This method is deprecated in favor of c.Items() and NewFrom() (see the
+// documentation for NewFrom().)
+func (c *cache) LoadFile(fname string) error {
+	fp, err := os.Open(fname)
+	if err != nil {
+		return err
+	}
+	err = c.Load(fp)
+	if err != nil {
+		fp.Close()
+		return err
+	}
+	return fp.Close()
+}
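
Aside: even though Save/Load are deprecated in favor of Items()/NewFrom(), the round trip is still occasionally useful. A sketch (the file path is illustrative; value types must be gob-encodable):

    package main

    import (
    	"fmt"

    	"github.com/patrickmn/go-cache"
    )

    func main() {
    	c := cache.New(cache.NoExpiration, 0)
    	c.Set("greeting", "hello", cache.DefaultExpiration)

    	// Persist, then restore into a fresh cache.
    	if err := c.SaveFile("/tmp/cache.gob"); err != nil {
    		fmt.Println("save:", err)
    		return
    	}
    	c2 := cache.New(cache.NoExpiration, 0)
    	if err := c2.LoadFile("/tmp/cache.gob"); err != nil {
    		fmt.Println("load:", err)
    		return
    	}
    	v, _ := c2.Get("greeting")
    	fmt.Println(v) // hello
    }
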
+
+// Copies all unexpired items in the cache into a new map and returns it.
+func (c *cache) Items() map[string]Item {
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+	m := make(map[string]Item, len(c.items))
+	now := time.Now().UnixNano()
+	for k, v := range c.items {
+		// "Inlining" of Expired
+		if v.Expiration > 0 {
+			if now > v.Expiration {
+				continue
+			}
+		}
+		m[k] = v
+	}
+	return m
+}
+
+// Returns the number of items in the cache. This may include items that have
+// expired, but have not yet been cleaned up.
+func (c *cache) ItemCount() int {
+	c.mu.RLock()
+	n := len(c.items)
+	c.mu.RUnlock()
+	return n
+}
+
+// Delete all items from the cache.
+func (c *cache) Flush() {
+	c.mu.Lock()
+	c.items = map[string]Item{}
+	c.mu.Unlock()
+}
+
+type janitor struct {
+	Interval time.Duration
+	stop     chan bool
+}
+
+func (j *janitor) Run(c *cache) {
+	ticker := time.NewTicker(j.Interval)
+	for {
+		select {
+		case <-ticker.C:
+			c.DeleteExpired()
+		case <-j.stop:
+			ticker.Stop()
+			return
+		}
+	}
+}
+
+func stopJanitor(c *Cache) {
+	c.janitor.stop <- true
+}
+
+func runJanitor(c *cache, ci time.Duration) {
+	j := &janitor{
+		Interval: ci,
+		stop:     make(chan bool),
+	}
+	c.janitor = j
+	go j.Run(c)
+}
+
+func newCache(de time.Duration, m map[string]Item) *cache {
+	if de == 0 {
+		de = -1
+	}
+	c := &cache{
+		defaultExpiration: de,
+		items:             m,
+	}
+	return c
+}
+
+func newCacheWithJanitor(de time.Duration, ci time.Duration, m map[string]Item) *Cache {
+	c := newCache(de, m)
+	// This trick ensures that the janitor goroutine (which--granted it
+	// was enabled--is running DeleteExpired on c forever) does not keep
+	// the returned C object from being garbage collected. When it is
+	// garbage collected, the finalizer stops the janitor goroutine, after
+	// which c can be collected.
+	C := &Cache{c}
+	if ci > 0 {
+		runJanitor(c, ci)
+		runtime.SetFinalizer(C, stopJanitor)
+	}
+	return C
+}
+
+// Return a new cache with a given default expiration duration and cleanup
+// interval. If the expiration duration is less than one (or NoExpiration),
+// the items in the cache never expire (by default), and must be deleted
+// manually. If the cleanup interval is less than one, expired items are not
+// deleted from the cache before calling c.DeleteExpired().
+func New(defaultExpiration, cleanupInterval time.Duration) *Cache {
+	items := make(map[string]Item)
+	return newCacheWithJanitor(defaultExpiration, cleanupInterval, items)
+}
+
+// Return a new cache with a given default expiration duration and cleanup
+// interval. If the expiration duration is less than one (or NoExpiration),
+// the items in the cache never expire (by default), and must be deleted
+// manually. If the cleanup interval is less than one, expired items are not
+// deleted from the cache before calling c.DeleteExpired().
+//
+// NewFrom() also accepts an items map which will serve as the underlying map
+// for the cache. This is useful for starting from a deserialized cache
+// (serialized using e.g. gob.Encode() on c.Items()), or passing in e.g.
+// make(map[string]Item, 500) to improve startup performance when the cache
+// is expected to reach a certain minimum size.
+//
+// Only the cache's methods synchronize access to this map, so it is not
+// recommended to keep any references to the map around after creating a cache.
+// If need be, the map can be accessed at a later point using c.Items() (subject
+// to the same caveat.)
+//
+// Note regarding serialization: When using e.g. gob, make sure to
+// gob.Register() the individual types stored in the cache before encoding a
+// map retrieved with c.Items(), and to register those same types before
+// decoding a blob containing an items map.
+func NewFrom(defaultExpiration, cleanupInterval time.Duration, items map[string]Item) *Cache {
+	return newCacheWithJanitor(defaultExpiration, cleanupInterval, items)
+}
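
Aside: the Cache-wraps-cache indirection above is a general Go pattern: hand callers a thin wrapper, attach the finalizer to the wrapper, and let the background goroutine reference only the inner object. A generic sketch of the same idea (names are illustrative, not from this library):

    package main

    import (
    	"fmt"
    	"runtime"
    	"time"
    )

    type worker struct {
    	stop chan bool
    }

    // Handle is what callers hold; the goroutine only references worker.
    type Handle struct {
    	*worker
    }

    func newHandle() *Handle {
    	w := &worker{stop: make(chan bool)}
    	go func() {
    		for {
    			select {
    			case <-time.After(time.Second):
    				// periodic background work would go here
    			case <-w.stop:
    				return
    			}
    		}
    	}()
    	h := &Handle{w}
    	// When h becomes unreachable, stop the goroutine so w can be collected too.
    	runtime.SetFinalizer(h, func(h *Handle) { h.stop <- true })
    	return h
    }

    func main() {
    	h := newHandle()
    	_ = h
    	fmt.Println("handle created; the finalizer will stop the goroutine")
    }
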
+	if l >= 4 {
+		for i < l-4 {
+			d = (d * 33) ^ uint32(k[i])
+			d = (d * 33) ^ uint32(k[i+1])
+			d = (d * 33) ^ uint32(k[i+2])
+			d = (d * 33) ^ uint32(k[i+3])
+			i += 4
+		}
+	}
+	switch l - i {
+	case 1:
+	case 2:
+		d = (d * 33) ^ uint32(k[i])
+	case 3:
+		d = (d * 33) ^ uint32(k[i])
+		d = (d * 33) ^ uint32(k[i+1])
+	case 4:
+		d = (d * 33) ^ uint32(k[i])
+		d = (d * 33) ^ uint32(k[i+1])
+		d = (d * 33) ^ uint32(k[i+2])
+	}
+	return d ^ (d >> 16)
+}
+
+func (sc *shardedCache) bucket(k string) *cache {
+	return sc.cs[djb33(sc.seed, k)%sc.m]
+}
+
+func (sc *shardedCache) Set(k string, x interface{}, d time.Duration) {
+	sc.bucket(k).Set(k, x, d)
+}
+
+func (sc *shardedCache) Add(k string, x interface{}, d time.Duration) error {
+	return sc.bucket(k).Add(k, x, d)
+}
+
+func (sc *shardedCache) Replace(k string, x interface{}, d time.Duration) error {
+	return sc.bucket(k).Replace(k, x, d)
+}
+
+func (sc *shardedCache) Get(k string) (interface{}, bool) {
+	return sc.bucket(k).Get(k)
+}
+
+func (sc *shardedCache) Increment(k string, n int64) error {
+	return sc.bucket(k).Increment(k, n)
+}
+
+func (sc *shardedCache) IncrementFloat(k string, n float64) error {
+	return sc.bucket(k).IncrementFloat(k, n)
+}
+
+func (sc *shardedCache) Decrement(k string, n int64) error {
+	return sc.bucket(k).Decrement(k, n)
+}
+
+func (sc *shardedCache) Delete(k string) {
+	sc.bucket(k).Delete(k)
+}
+
+func (sc *shardedCache) DeleteExpired() {
+	for _, v := range sc.cs {
+		v.DeleteExpired()
+	}
+}
+
+// Returns the items in the cache. This may include items that have expired,
+// but have not yet been cleaned up. If this is significant, the Expiration
+// fields of the items should be checked. Note that explicit synchronization
+// is needed to use a cache and its corresponding Items() return values at
+// the same time, as the maps are shared.
+func (sc *shardedCache) Items() []map[string]Item {
+	res := make([]map[string]Item, len(sc.cs))
+	for i, v := range sc.cs {
+		res[i] = v.Items()
+	}
+	return res
+}
+
+func (sc *shardedCache) Flush() {
+	for _, v := range sc.cs {
+		v.Flush()
+	}
+}
+
+type shardedJanitor struct {
+	Interval time.Duration
+	stop     chan bool
+}
+
+func (j *shardedJanitor) Run(sc *shardedCache) {
+	j.stop = make(chan bool)
+	tick := time.Tick(j.Interval)
+	for {
+		select {
+		case <-tick:
+			sc.DeleteExpired()
+		case <-j.stop:
+			return
+		}
+	}
+}
+
+func stopShardedJanitor(sc *unexportedShardedCache) {
+	sc.janitor.stop <- true
+}
+
+func runShardedJanitor(sc *shardedCache, ci time.Duration) {
+	j := &shardedJanitor{
+		Interval: ci,
+	}
+	sc.janitor = j
+	go j.Run(sc)
+}
+
+func newShardedCache(n int, de time.Duration) *shardedCache {
+	max := big.NewInt(0).SetUint64(uint64(math.MaxUint32))
+	rnd, err := rand.Int(rand.Reader, max)
+	var seed uint32
+	if err != nil {
+		os.Stderr.Write([]byte("WARNING: go-cache's newShardedCache failed to read from the system CSPRNG (/dev/urandom or equivalent.) Your system's security may be compromised. Continuing with an insecure seed.\n"))
+		seed = insecurerand.Uint32()
+	} else {
+		seed = uint32(rnd.Uint64())
+	}
+	sc := &shardedCache{
+		seed: seed,
+		m:    uint32(n),
+		cs:   make([]*cache, n),
+	}
+	for i := 0; i < n; i++ {
+		c := &cache{
+			defaultExpiration: de,
+			items:             map[string]Item{},
+		}
+		sc.cs[i] = c
+	}
+	return sc
+}
+
+func unexportedNewSharded(defaultExpiration, cleanupInterval time.Duration, shards int) *unexportedShardedCache {
+	if defaultExpiration == 0 {
+		defaultExpiration = -1
+	}
+	sc := newShardedCache(shards, defaultExpiration)
+	SC := &unexportedShardedCache{sc}
+	if cleanupInterval > 0 {
+		runShardedJanitor(sc, cleanupInterval)
+		runtime.SetFinalizer(SC, stopShardedJanitor)
+	}
+	return SC
+}
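
Aside: the sharded variant only ever takes a per-bucket lock: a key is hashed with the seeded djb33 function and reduced modulo the shard count, so writers on different buckets never contend. A stripped-down sketch of that routing (a simple FNV-style loop stands in for djb33; names are illustrative):

    package main

    import (
    	"fmt"
    	"sync"
    )

    type shard struct {
    	mu    sync.Mutex
    	items map[string]string
    }

    type shardedMap struct {
    	shards []*shard
    }

    func newShardedMap(n int) *shardedMap {
    	s := &shardedMap{shards: make([]*shard, n)}
    	for i := range s.shards {
    		s.shards[i] = &shard{items: map[string]string{}}
    	}
    	return s
    }

    // bucket picks a shard from the key's hash, like djb33(seed, k) % m above.
    func (s *shardedMap) bucket(k string) *shard {
    	var h uint32 = 2166136261
    	for i := 0; i < len(k); i++ {
    		h = (h * 16777619) ^ uint32(k[i])
    	}
    	return s.shards[h%uint32(len(s.shards))]
    }

    func (s *shardedMap) Set(k, v string) {
    	b := s.bucket(k)
    	b.mu.Lock() // lock one shard, not the whole map
    	b.items[k] = v
    	b.mu.Unlock()
    }

    func main() {
    	m := newShardedMap(8)
    	m.Set("pod-uid", "mypod@default")
    	fmt.Println("value:", m.bucket("pod-uid").items["pod-uid"])
    }
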
diff --git a/vendor/modules.txt b/vendor/modules.txt
index 5ae9d703f..c6c08113f 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -176,6 +176,9 @@ github.com/onsi/gomega/matchers/support/goraph/edge
 github.com/onsi/gomega/matchers/support/goraph/node
 github.com/onsi/gomega/matchers/support/goraph/util
 github.com/onsi/gomega/types
+# github.com/patrickmn/go-cache v2.1.0+incompatible
+## explicit
+github.com/patrickmn/go-cache
 # github.com/pmezard/go-difflib v1.0.0
 github.com/pmezard/go-difflib/difflib
 # github.com/prometheus/client_golang v0.9.4

From 72659bacf563358c2bac9e86b931d0926562acab Mon Sep 17 00:00:00 2001
From: "shichun.fsc"
Date: Mon, 29 Aug 2022 14:50:11 +0800
Subject: [PATCH 09/14] sync dockerfile with docker buildx multibuild

---
 Makefile                | 13 +++++---
 builder/Dockerfile.dind | 81 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 89 insertions(+), 5 deletions(-)
 create mode 100644 builder/Dockerfile.dind

diff --git a/Makefile b/Makefile
index 73aaa5a98..8109b1b5a 100644
--- a/Makefile
+++ b/Makefile
@@ -23,7 +23,8 @@ all: build
 
 # PLATFORMS is the set of OS_ARCH that NPD can build against.
 LINUX_PLATFORMS?=linux_amd64 linux_arm64
-PLATFORMS=$(LINUX_PLATFORMS) windows_amd64
+PLATFORMS=$(LINUX_PLATFORMS)
+#PLATFORMS=$(LINUX_PLATFORMS) windows_amd64
 
 # VERSION is the version of the binary.
 VERSION?=$(shell if [ -d .git ]; then echo `git describe --tags --dirty`; else echo "UNKNOWN"; fi)
@@ -32,7 +33,7 @@ VERSION?=$(shell if [ -d .git ]; then echo `git describe --tags --dirty`; else e
 TAG?=$(VERSION)
 
 # REGISTRY is the container registry to push into.
-REGISTRY?=gcr.io/k8s-staging-npd
+REGISTRY?=registry.cn-hangzhou.aliyuncs.com
 
 # UPLOAD_PATH is the cloud storage path to upload release tar.
 UPLOAD_PATH?=gs://kubernetes-release
@@ -58,7 +59,7 @@ NPD_NAME_VERSION?=node-problem-detector-$(VERSION)
 TARBALL=$(NPD_NAME_VERSION).tar.gz
 
 # IMAGE is the image name of the node problem detector container image.
-IMAGE:=$(REGISTRY)/node-problem-detector:$(TAG)
+IMAGE:=$(REGISTRY)/acs/node-problem-detector:$(TAG)
 
 # ENABLE_JOURNALD enables build journald support or not. Building journald
 # support needs libsystemd-dev or libsystemd-journal-dev.
@@ -149,9 +150,11 @@ output/windows_amd64/test/bin/%.exe: $(PKG_SOURCES)
 	   -tags "$(WINDOWS_BUILD_TAGS)" \
 	   ./test/e2e/$(subst -,,$*)
 
+# CC=x86_64-linux-gnu-gcc requires "yum install gcc-x86_64-linux-gnu.x86_64"; the CentOS builder uses x86_64-redhat-linux-gcc instead.
+
 output/linux_amd64/bin/%: $(PKG_SOURCES)
 	GOOS=linux GOARCH=amd64 CGO_ENABLED=$(CGO_ENABLED) GO111MODULE=on \
-	  CC=x86_64-linux-gnu-gcc go build \
+	  CC=x86_64-redhat-linux-gcc go build \
 	   -mod vendor \
 	   -o $@ \
 	   -ldflags '-X $(PKG)/pkg/version.version=$(VERSION)' \
@@ -161,7 +164,7 @@ output/linux_amd64/bin/%: $(PKG_SOURCES)
 
 output/linux_amd64/test/bin/%: $(PKG_SOURCES)
 	GOOS=linux GOARCH=amd64 CGO_ENABLED=$(CGO_ENABLED) GO111MODULE=on \
-	  CC=x86_64-linux-gnu-gcc go build \
+	  CC=x86_64-redhat-linux-gcc go build \
 	   -mod vendor \
 	   -o $@ \
 	   -tags "$(LINUX_BUILD_TAGS)" \
diff --git a/builder/Dockerfile.dind b/builder/Dockerfile.dind
new file mode 100644
index 000000000..c2de09d73
--- /dev/null
+++ b/builder/Dockerfile.dind
@@ -0,0 +1,81 @@
+ARG GO_VERSION=1.17
+FROM --platform=$TARGETPLATFORM golang:${GO_VERSION} as gobase
+ENV GOBIN /usr/local/go/bin
+
+FROM --platform=$TARGETPLATFORM centos:centos8 AS builder
+ARG TARGETPLATFORM
+ARG BUILDPLATFORM
+ARG BUILDARCH
+ARG TARGETARCH
+
+RUN echo "I am running on $BUILDPLATFORM $BUILDARCH, building for $TARGETPLATFORM $TARGETARCH"
+
+####### go env #######
+
+ENV GOLANG_VERSION ${GO_VERSION}
+ENV GOBIN /usr/local/go/bin
+ENV GOROOT /usr/local/go
+ENV PATH /usr/bin:/usr/local/bin:/usr/local/go/bin:/sbin:/bin
+COPY --from=gobase /usr/local/go /usr/local/go
+
+####### end go env #######
+
+RUN sed -i -e "s|mirrorlist=|#mirrorlist=|g" /etc/yum.repos.d/CentOS-*
+RUN sed -i -e "s|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g" /etc/yum.repos.d/CentOS-*
+RUN yum -y install yum-utils device-mapper-persistent-data lvm2 iptables openssl net-tools binutils wget git unzip \
+    && yum-config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo \
+    && : "skipped: yum install docker -b current -y" \
+    && yum install docker-ce-19.03.15 -y \
+    && yum clean all && rm -rf /var/cache/yum \
+    && useradd -M -s /bin/false dockremap \
+    && echo "dockremap:524288:65536" >> /etc/subuid \
+    && echo "dockremap:524288:65536" >> /etc/subgid
+
+RUN yum -y update; yum clean all; \
+    yum -y install systemd; yum clean all; \
+    yum -y install libsystemd-dev; yum clean all;\
+    yum -y install systemd-devel; yum clean all;\
+    yum -y install gcc-x86_64-linux-gnu.x86_64; yum clean all;\
+    yum -y install docker-ce-19.03.15; yum clean all;\
+    yum -y install make automake gcc gcc-c++ kernel-devel; yum clean all;
+
+
+
+#######################################################
+
+ENV GO111MODULE=on
+ENV GOPROXY=https://goproxy.cn,direct
+
+ADD . /src/github.com/AliyunContainerService/node-problem-detector
+ENV GOPATH /:/src/github.com/AliyunContainerService/node-problem-detector/vendor
+WORKDIR /src/github.com/AliyunContainerService/node-problem-detector
+
+RUN # VERSION="{{.VERSION}}"
+RUN # IMAGES="{{.VERSION}}"
+RUN # REGIONS="{{.REGIONS}}"
+RUN # LOG_LEVEL="{{.LOG_LEVEL}}"
+# RUN ls /src/github.com/AliyunContainerService/node-problem-detector/
+# RUN cd /src/github.com/AliyunContainerService/node-problem-detector/
+
+RUN make build-binaries LINUX_PLATFORMS=linux_$TARGETARCH ENABLE_JOURNALD=1
+RUN make build-tar LINUX_PLATFORMS=linux_$TARGETARCH ENABLE_JOURNALD=1
+
+#######################################################
+
+
+FROM --platform=$TARGETPLATFORM centos:centos8
+MAINTAINER KeyOfSpectator
+
+RUN systemctl --version
+
+# RUN clean-install util-linux libsystemd0 systemd bash lsof curl
+
+# Avoid symlink of /etc/localtime.
+RUN test -h /etc/localtime && rm -f /etc/localtime && cp /usr/share/zoneinfo/UTC /etc/localtime || true
+
+COPY --from=builder /src/github.com/AliyunContainerService/node-problem-detector/bin/node-problem-detector /node-problem-detector
+COPY --from=builder /src/github.com/AliyunContainerService/node-problem-detector/bin/health-checker ${LOGCOUNTER} /home/kubernetes/bin/
+
+COPY --from=builder /src/github.com/AliyunContainerService/node-problem-detector/config /config
+RUN chmod +x /config/plugin/*.sh
+ENTRYPOINT ["/node-problem-detector", "--config.system-log-monitor=/config/kernel-monitor.json"]

From f1160d6a42e2203c26e79d3e50407d342da5bf40 Mon Sep 17 00:00:00 2001
From: "shichun.fsc"
Date: Wed, 8 Feb 2023 18:03:34 +0800
Subject: [PATCH 10/14] refine list-cache pod list logic when PodOOMKilling

---
 pkg/systemlogmonitor/log_monitor.go | 177 ++++++++++++++--------------
 pkg/util/helpers.go                 |   4 +-
 2 files changed, 93 insertions(+), 88 deletions(-)
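
Before the diff: the core of this patch replaces the per-OOM pod List with a local uuid -> "name@namespace" cache that is refreshed only on a miss. A condensed sketch of the lookup flow it implements (the go-cache import is real; the pod data is stubbed, and listPods stands in for the client-go call):

    package main

    import (
    	"fmt"
    	"strings"
    	"time"

    	"github.com/patrickmn/go-cache"
    )

    // listPods stands in for k8sClient.CoreV1().Pods("").List(...) filtered by node.
    func listPods() map[string]string {
    	return map[string]string{"5e8d9c1a-uid": "web-0@default"}
    }

    func lookup(c *cache.Cache, uuid string) (pod, ns string, ok bool) {
    	if v, hit := c.Get(uuid); hit {
    		parts := strings.Split(v.(string), "@")
    		if len(parts) == 2 {
    			return parts[0], parts[1], true
    		}
    	}
    	// miss: relist this node's pods and refill the cache
    	for uid, val := range listPods() {
    		c.Set(uid, val, 30*time.Minute)
    	}
    	if v, hit := c.Get(uuid); hit {
    		parts := strings.Split(v.(string), "@")
    		if len(parts) == 2 {
    			return parts[0], parts[1], true
    		}
    	}
    	return "", "", false
    }

    func main() {
    	c := cache.New(30*time.Minute, time.Hour)
    	if pod, ns, ok := lookup(c, "5e8d9c1a-uid"); ok {
    		fmt.Printf("pod was OOM killed. pod:%s namespace:%s\n", pod, ns)
    	}
    }
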
diff --git a/pkg/systemlogmonitor/log_monitor.go b/pkg/systemlogmonitor/log_monitor.go
index 9c85a6296..ebd24d79c 100644
--- a/pkg/systemlogmonitor/log_monitor.go
+++ b/pkg/systemlogmonitor/log_monitor.go
@@ -18,6 +18,7 @@ package systemlogmonitor
 
 import (
 	"encoding/json"
+	"errors"
 	"fmt"
+	"github.com/patrickmn/go-cache"
 	"io/ioutil"
@@ -57,8 +58,9 @@ var (
 	nodeName string
 
 	// cache setting
-	cacheExpireDuration = time.Minute * 30  // cache expire duration is 30min
-	cacheCleanupInterval = time.Minute * 60 // cache cleanup interval is 60min
+	cacheExpireDurationMinutesEachPod int64 = 30
+	cacheExpireDuration  = time.Minute * 30 // cache default expire duration = 30min
+	cacheCleanupInterval = time.Minute * 60 // cache default cleanup interval = 60min
 )
 
 func init() {
@@ -83,9 +85,10 @@ type logMonitor struct {
 	output chan *types.Status
 	tomb   *tomb.Tomb
 
-	// cache-key: pod uid
-	// cache-value: concatenated string of pod_name@pod_namespace
-	// thread-safe, no locking needed for reads and writes
+	// cache-key: pod uuid
+	// cache-value format: pod_name@pod_namespace
+	// thread-safe
+	// ~10k pods take roughly 10MB of memory
 	cache *cache.Cache
 }
@@ -213,21 +216,13 @@ func (l *logMonitor) parseLog(log *logtypes.Log) {
 
 func (l *logMonitor) generateStatus(logs []*logtypes.Log, rule systemlogtypes.Rule) *types.Status {
 	// We use the timestamp of the first log line as the timestamp of the status.
 	timestamp := logs[0].Timestamp
-	message := generateMessage(logs)
+	logContent := generateMessage(logs)
+	message := logContent // default event message set to original log content
 	if rule.Reason == OOMREASON && k8sClient != nil {
-		uuid := string(uuidRegx.Find([]byte(message)))
-
+		uuid := string(uuidRegx.Find([]byte(logContent)))
 		uuid = strings.ReplaceAll(uuid, "_", "-")
-
-		// check cache
-		if val, ok := l.cache.Get(uuid); ok {
-			strs := strings.Split(val.(string), "@")
-			message = fmt.Sprintf("pod was OOM killed. node:%s pod:%s namespace:%s uuid:%s",
-				nodeName, strs[0], strs[1], uuid)
-			l.cache.Delete(uuid)
-		} else {
-			message = l.cachePod(uuid)
-		}
+		// generate the event message from the cached pod info.
+		message = l.generateEventMessage(uuid, message)
 	}
 
 	var events []types.Event
@@ -292,6 +287,68 @@ func (l *logMonitor) generateStatus(logs []*logtypes.Log, rule systemlogtypes.Ru
 	}
 }
 
+func (l *logMonitor) generateEventMessage(uuid string, logMessage string) string {
+	// check cache
+	if cacheVal, ok := l.cache.Get(uuid); ok {
+		// 1. pod cache hit
+		podName, namespace := parseCache(uuid, cacheVal.(string))
+		if podName != "" {
+			return generatePodOOMEventMessage(podName, uuid, namespace, nodeName)
+		} else {
+			// 1.1 cache entry is dirty; try to re-list and re-cache
+			err := l.listPodAndCache()
+			if err != nil {
+				glog.Errorf("pod oom found, list and cache pod list error. pod uuid: %v, error: %v, cache value: %v", uuid, err, cacheVal)
+			}
+			if cacheVal, ok := l.cache.Get(uuid); ok {
+				podName, namespace := parseCache(uuid, cacheVal.(string))
+				if podName != "" {
+					return generatePodOOMEventMessage(podName, uuid, namespace, nodeName)
+				} else {
+					glog.Errorf("pod oom found, but pod parse cache error. pod uuid: %v, cache value: %v", uuid, cacheVal)
+				}
+			} else {
+				glog.Errorf("pod oom found, but pod get cache error. pod uuid: %v, cache value: %v", uuid, cacheVal)
+			}
+		}
+	} else {
+		// 2. pod cache not hit. try list and cache.
+		err := l.listPodAndCache()
+		if err != nil {
+			glog.Errorf("pod oom found, list and cache pod list error. pod uuid: %v, error: %v, cache value: %v", uuid, err, cacheVal)
+		}
+		if cacheVal, ok := l.cache.Get(uuid); ok {
+			podName, namespace := parseCache(uuid, cacheVal.(string))
+			if podName != "" {
+				return generatePodOOMEventMessage(podName, uuid, namespace, nodeName)
+			} else {
+				glog.Errorf("pod oom found, but pod parse cache error. pod uuid: %v, cache value: %v", uuid, cacheVal)
+			}
+		} else {
+			glog.Errorf("pod oom found, but pod get cache error. pod uuid: %v, cache value: %v", uuid, cacheVal)
+		}
+	}
+	// if failed to generate event message, return original event message.
+	return logMessage
+}
+
+func parseCache(uuid string, cacheValue string) (podName string, namespace string) {
+	// cache-key: pod uuid
+	// cache-value format: pod_name@pod_namespace
+	s := strings.Split(cacheValue, "@")
+	if len(s) == 2 {
+		return s[0], s[1]
+	} else {
+		glog.Errorf("pod oom found, but pod cache error. pod uuid: %v, cache value: %v", uuid, cacheValue)
+	}
+	return "", ""
+}
+
+func generatePodOOMEventMessage(podName string, podUUID string, namespace string, nodeName string) string {
+	return fmt.Sprintf("pod was OOM killed. node:%s pod:%s namespace:%s uuid:%s",
+		nodeName, podName, namespace, podUUID)
+}
+
 // initializeStatus initializes the internal condition and also reports it to the node problem detector.
 func (l *logMonitor) initializeStatus() {
 	// Initialize the default node conditions
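
Aside: the hit and miss branches of generateEventMessage above duplicate the same get-parse-fallback sequence. One possible tightening (an editorial sketch against the identifiers defined in this file, not the author's code) folds both into a bounded retry:

    // generateEventMessageCompact is a hypothetical equivalent of the function
    // above: try the cache, relist once on a miss or dirty entry, then give up.
    func (l *logMonitor) generateEventMessageCompact(uuid, logMessage string) string {
    	for attempt := 0; attempt < 2; attempt++ {
    		if cacheVal, ok := l.cache.Get(uuid); ok {
    			if podName, namespace := parseCache(uuid, cacheVal.(string)); podName != "" {
    				return generatePodOOMEventMessage(podName, uuid, namespace, nodeName)
    			}
    		}
    		if attempt == 0 {
    			if err := l.listPodAndCache(); err != nil {
    				glog.Errorf("pod oom found, list and cache pod list error. pod uuid: %v, error: %v", uuid, err)
    			}
    		}
    	}
    	return logMessage // fall back to the raw log line
    }
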
@@ -304,19 +361,18 @@ func (l *logMonitor) initializeStatus() {
 	}
 }
 
-// Each node runs at most about 60 pods, right?
-// Is a paginated list strategy really necessary?
-func (l *logMonitor) cachePod(uuid string) string {
-	msgChan := make(chan string)
-	defer close(msgChan)
+// listPodAndCache lists the pods on this node and caches them, keyed by pod UID.
+func (l *logMonitor) listPodAndCache() error {
+	doneChan := make(chan bool)
+	defer close(doneChan)
 
 	pl, err := k8sClient.CoreV1().Pods("").List(metav1.ListOptions{
 		ResourceVersion: "0",
 		FieldSelector:   fmt.Sprintf("spec.nodeName=%s", nodeName),
 	})
 	if err != nil {
-		glog.Error("Error in listing pods: %v", err.Error())
-		return ""
+		glog.Errorf("Error in listing pods, error: %v", err.Error())
+		return err
 	}
 
 	// update cache
 	go func(pods []v1.Pod) {
 		defer util.Recovery()
 		for _, pod := range pods {
 			if _, ok := l.cache.Get(string(pod.UID)); ok {
-				// todo
-				// For stable pods (ones unlikely to OOM), should we refresh the cache or use a lazy-update strategy?
+				// pod already in cache.
 			} else {
-				l.cache.Set(string(pod.UID), fmt.Sprintf("%s@%s", pod.Name, pod.Namespace), cache.DefaultExpiration+util.RandomCacheDuration())
-			}
-		}
-	}(pl.Items)
-
-	go func(pods []v1.Pod) {
-		defer util.Recovery()
-		for _, pod := range pods {
-			if string(pod.UID) == uuid {
-				l.cache.Delete(uuid)
-				msgChan <- fmt.Sprintf("pod was OOM killed. node:%s pod:%s namespace:%s uuid:%s",
-					pod.Spec.NodeName, pod.Name, pod.Namespace, uuid)
-				return
+				l.cache.Set(string(pod.UID), fmt.Sprintf("%s@%s", pod.Name, pod.Namespace), cache.DefaultExpiration+util.RandomDurationMinute(cacheExpireDurationMinutesEachPod))
 			}
+			doneChan <- true
 		}
-		msgChan <- ""
 	}(pl.Items)
 
 	select {
-	case msg := <-msgChan:
-		return msg
-	case <-time.After(time.Millisecond * 500):
-		return ""
+	case isDone := <-doneChan:
+		if isDone {
+			return nil
+		} else {
+			return errors.New("list pod and cache error")
+		}
+	case <-time.After(time.Millisecond * 1000):
+		return errors.New("list pod and cache timeout")
 	}
 }
 
-func (l *logMonitor) cachePodWithPagination(uuid string) (message string) {
-	var nextToken = ""
-	var step int64 = 50
-
-	// pagination
-	for {
-		pods, err := k8sClient.CoreV1().Pods("").List(metav1.ListOptions{
-			ResourceVersion: "0",
-			FieldSelector:   fmt.Sprintf("spec.nodeName=%s", nodeName),
-			Limit:           step,
-			Continue:        nextToken,
-		})
-		if err != nil {
-			glog.Error("Error in listing pods: %v", err.Error())
-			return
-		}
-
-		for _, pod := range pods.Items {
-			if string(pod.UID) == uuid {
-				message = fmt.Sprintf("pod was OOM killed. node:%s pod:%s namespace:%s uuid:%s",
-					pod.Spec.NodeName, pod.Name, pod.Namespace, uuid)
-				continue
-			}
-
-			// cache pod
-			if _, ok := l.cache.Get(string(pod.UID)); ok {
-
-			} else {
-				l.cache.Set(string(pod.UID), fmt.Sprintf("%s@%s", pod.Name, pod.Namespace), cache.DefaultExpiration+util.RandomCacheDuration())
-			}
-		}
-
-		if pods.Continue == "" {
-			break
-		}
-		nextToken = pods.Continue
-	}
-
-	return
-}
 
 func initialConditions(defaults []types.Condition) []types.Condition {
diff --git a/pkg/util/helpers.go b/pkg/util/helpers.go
index fe78ee466..35ffcaebd 100644
--- a/pkg/util/helpers.go
+++ b/pkg/util/helpers.go
@@ -85,7 +85,7 @@ func Recovery() error {
 	return nil
 }
 
-func RandomCacheDuration() time.Duration {
+func RandomDurationMinute(maxMinutes int64) time.Duration {
 	rand.Seed(time.Now().Unix())
-	return time.Duration(rand.Int63n(10) * int64(time.Minute))
+	return time.Duration(rand.Int63n(maxMinutes) * int64(time.Minute))
 }

From 2aa1285e683dee84c9a3d7aca6a6fa7394423937 Mon Sep 17 00:00:00 2001
From: "shichun.fsc"
Date: Mon, 13 Feb 2023 15:14:34 +0800
Subject: [PATCH 11/14] fix panic recover message format

---
 pkg/util/helpers.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/util/helpers.go b/pkg/util/helpers.go
index 35ffcaebd..795d7e700 100644
--- a/pkg/util/helpers.go
+++ b/pkg/util/helpers.go
@@ -79,7 +79,7 @@ func Recovery() error {
 		}
 		stack := make([]byte, 2048)
 		length := runtime.Stack(stack, true)
-		glog.Error("[%s] %s %s\n", "PANIC RECOVER", e, stack[:length])
+		glog.Errorf("[%s] %s %s\n", "PANIC RECOVER", e, stack[:length])
 		return e
 	}
 	return nil
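
One caveat in listPodAndCache worth flagging before the next patch: doneChan is unbuffered and closed via defer, so if the timeout fires first, the worker goroutine's later send lands on a closed channel and panics (caught only by util.Recovery). A buffered channel sidesteps this; a self-contained sketch of the pattern (illustrative names, not the patch's code):

    package main

    import (
    	"errors"
    	"fmt"
    	"time"
    )

    func fillCache() error {
    	// Buffer of 1: the worker can always complete its send and exit,
    	// even when the caller has already timed out and moved on.
    	done := make(chan bool, 1)

    	go func() {
    		time.Sleep(50 * time.Millisecond) // stands in for listing and caching pods
    		done <- true
    	}()

    	select {
    	case <-done:
    		return nil
    	case <-time.After(1 * time.Second):
    		return errors.New("list pod and cache timeout")
    	}
    }

    func main() {
    	fmt.Println(fillCache()) // <nil>
    }
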
From 82744780b05ef525fca3725000fb433df4827ca3 Mon Sep 17 00:00:00 2001
From: "shichun.fsc"
Date: Mon, 13 Feb 2023 15:52:45 +0800
Subject: [PATCH 12/14] fix cache bug

---
 pkg/systemlogmonitor/log_monitor.go | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/pkg/systemlogmonitor/log_monitor.go b/pkg/systemlogmonitor/log_monitor.go
index 01b2140cb..77cf0e5dc 100644
--- a/pkg/systemlogmonitor/log_monitor.go
+++ b/pkg/systemlogmonitor/log_monitor.go
@@ -329,7 +329,7 @@ func (l *logMonitor) generateEventMessage(uuid string, logMessage string) string
 				glog.Errorf("pod oom found, but pod parse cache error. pod uuid: %v, cache value: %v", uuid, cacheVal)
 			}
 		} else {
-			glog.Errorf("pod oom found, but pod get cache error. pod uuid: %v, cache value: %v", uuid, cacheVal)
+			glog.Errorf("pod oom found, but pod get cache error. pod uuid: %v, cache value: %v, cache length: %v, cache items: %v", uuid, cacheVal, l.cache.ItemCount(), l.cache.Items())
 		}
 	}
 	// if failed to generate event message, return original event message.
@@ -369,11 +369,13 @@ func (l *logMonitor) initializeStatus() {
 func (l *logMonitor) listPodAndCache() error {
 	doneChan := make(chan bool)
 	defer close(doneChan)
-
+	statisticStartTime := time.Now().UnixNano()
 	pl, err := k8sClient.CoreV1().Pods("").List(metav1.ListOptions{
 		ResourceVersion: "0",
 		FieldSelector:   fmt.Sprintf("spec.nodeName=%s", nodeName),
 	})
+	statisticEndListPodTime := time.Now().UnixNano()
+	glog.Infof("listPod spend time: %v seconds, startTime: %v nanoTimestamp, endTime: %v nanoTimestamp", (statisticEndListPodTime-statisticStartTime)/1e9, statisticStartTime, statisticEndListPodTime)
 	if err != nil {
 		glog.Errorf("Error in listing pods, error: %v", err.Error())
 		return err
@@ -388,17 +390,21 @@ func (l *logMonitor) listPodAndCache() error {
 			} else {
 				l.cache.Set(string(pod.UID), fmt.Sprintf("%s@%s", pod.Name, pod.Namespace), cache.DefaultExpiration+util.RandomDurationMinute(cacheExpireDurationMinutesEachPod))
 			}
-			doneChan <- true
 		}
+		doneChan <- true
 	}(pl.Items)
 
 	select {
 	case isDone := <-doneChan:
 		if isDone {
+			statisticEndCachePodTime := time.Now().UnixNano()
+			glog.V(8).Infof("pod cache content, cache length: %v, cache items: %v", l.cache.ItemCount(), l.cache.Items())
+			glog.Infof("listPodAndCache spend time: %v seconds, startTime: %v nanoTimestamp, endTime: %v nanoTimestamp", (statisticEndCachePodTime-statisticStartTime)/1e9, statisticStartTime, statisticEndCachePodTime)
 			return nil
 		} else {
 			return errors.New("list pod and cache error")
 		}
-	case <-time.After(time.Millisecond * 1000):
+	case <-time.After(time.Second * 5):
+		glog.Errorf("listPodAndCache timeout. startTime: %v nanoTimestamp", statisticStartTime)
 		return errors.New("list pod and cache timeout")
 	}
 }
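
Aside: the UnixNano arithmetic above works, but dividing by 1e9 (and later 1e6 in PATCH 14) is easy to get wrong; time.Since yields a Duration that formats itself. A sketch of the same instrumentation (an editorial suggestion, not part of the series):

    package main

    import (
    	"fmt"
    	"time"
    )

    func main() {
    	start := time.Now()
    	time.Sleep(120 * time.Millisecond) // stands in for the pod List call

    	elapsed := time.Since(start)
    	// A Duration prints with its own unit, e.g. "120.3ms"; no 1e6/1e9 divisions.
    	fmt.Printf("listPod spend time: %v (%d ms)\n", elapsed, elapsed.Milliseconds())
    }
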
From 53831d727425f740b5e488f834a74ce8c8089465 Mon Sep 17 00:00:00 2001
From: "shichun.fsc"
Date: Mon, 13 Feb 2023 18:20:26 +0800
Subject: [PATCH 13/14] add log

---
 pkg/systemlogmonitor/log_monitor.go | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pkg/systemlogmonitor/log_monitor.go b/pkg/systemlogmonitor/log_monitor.go
index 77cf0e5dc..7dd10ffbf 100644
--- a/pkg/systemlogmonitor/log_monitor.go
+++ b/pkg/systemlogmonitor/log_monitor.go
@@ -306,6 +306,7 @@ func (l *logMonitor) generateEventMessage(uuid string, logMessage string) string
 		}
 		if cacheVal, ok := l.cache.Get(uuid); ok {
 			podName, namespace := parseCache(uuid, cacheVal.(string))
+			glog.V(9).Infof("pod oom hit pod list cache. podName: %v, namespace: %v", podName, namespace)
 			if podName != "" {
 				return generatePodOOMEventMessage(podName, uuid, namespace, nodeName)
 			} else {

From 502cf3068983d3a4c8a541eb78479a5c9cdac296 Mon Sep 17 00:00:00 2001
From: "shichun.fsc"
Date: Mon, 13 Feb 2023 18:24:25 +0800
Subject: [PATCH 14/14] refine log

---
 pkg/systemlogmonitor/log_monitor.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pkg/systemlogmonitor/log_monitor.go b/pkg/systemlogmonitor/log_monitor.go
index 7dd10ffbf..3869df18b 100644
--- a/pkg/systemlogmonitor/log_monitor.go
+++ b/pkg/systemlogmonitor/log_monitor.go
@@ -376,7 +376,7 @@ func (l *logMonitor) listPodAndCache() error {
 		FieldSelector:   fmt.Sprintf("spec.nodeName=%s", nodeName),
 	})
 	statisticEndListPodTime := time.Now().UnixNano()
-	glog.Infof("listPod spend time: %v seconds, startTime: %v nanoTimestamp, endTime: %v nanoTimestamp", (statisticEndListPodTime-statisticStartTime)/1e9, statisticStartTime, statisticEndListPodTime)
+	glog.Infof("listPod spend time: %v ms, startTime: %v nanoTimestamp, endTime: %v nanoTimestamp", (statisticEndListPodTime-statisticStartTime)/1e6, statisticStartTime, statisticEndListPodTime)
 	if err != nil {
 		glog.Errorf("Error in listing pods, error: %v", err.Error())
 		return err
@@ -399,7 +399,7 @@ func (l *logMonitor) listPodAndCache() error {
 	case isDone := <-doneChan:
 		if isDone {
 			statisticEndCachePodTime := time.Now().UnixNano()
 			glog.V(8).Infof("pod cache content, cache length: %v, cache items: %v", l.cache.ItemCount(), l.cache.Items())
-			glog.Infof("listPodAndCache spend time: %v seconds, startTime: %v nanoTimestamp, endTime: %v nanoTimestamp", (statisticEndCachePodTime-statisticStartTime)/1e9, statisticStartTime, statisticEndCachePodTime)
+			glog.Infof("listPodAndCache spend time: %v ms, startTime: %v nanoTimestamp, endTime: %v nanoTimestamp", (statisticEndCachePodTime-statisticStartTime)/1e6, statisticStartTime, statisticEndCachePodTime)
 			return nil
 		} else {
 			return errors.New("list pod and cache error")
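
A final note on the V-leveled logging added in PATCHes 12 through 14: glog evaluates Infof arguments before the verbosity check, so glog.V(8).Infof(..., l.cache.Items()) copies the whole cache map even when -v is low. Guarding with an explicit if avoids that cost; a fragment-style sketch (assumes the glog import and logMonitor fields used throughout this file):

    // Hypothetical guard around the expensive cache dump from PATCH 12.
    if glog.V(8) {
    	// Items() copies every unexpired entry; only pay for it when v>=8 is active.
    	glog.Infof("pod cache content, cache length: %v, cache items: %v",
    		l.cache.ItemCount(), l.cache.Items())
    }
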