From 7228c1cfa5f40bdf0b5bda66d49decf1ee9e413e Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Tue, 30 Oct 2018 14:33:43 -0400 Subject: [PATCH 01/56] Update vendored libraries: - Add github.com/promtheus/tsdb/wal - Update github.com/prometheus/client_golang for WrapRegistererWith function. Signed-off-by: Tom Wilkie --- Gopkg.lock | 19 +- .../github.com/nightlyone/lockfile/.gitignore | 27 - .../nightlyone/lockfile/.gitmodules | 3 - .../nightlyone/lockfile/.travis.yml | 14 - vendor/github.com/nightlyone/lockfile/LICENSE | 19 - .../github.com/nightlyone/lockfile/README.md | 52 -- .../nightlyone/lockfile/appveyor.yml | 12 - .../nightlyone/lockfile/lockfile.go | 211 ----- .../nightlyone/lockfile/lockfile_unix.go | 20 - .../nightlyone/lockfile/lockfile_windows.go | 30 - .../client_golang/api/prometheus/v1/api.go | 33 +- .../client_golang/prometheus/collector.go | 36 +- .../client_golang/prometheus/desc.go | 8 +- .../client_golang/prometheus/fnv.go | 13 + .../client_golang/prometheus/go_collector.go | 13 + .../client_golang/prometheus/histogram.go | 155 +++- .../client_golang/prometheus/http.go | 136 ++- .../prometheus/internal/metric.go | 85 ++ .../client_golang/prometheus/labels.go | 13 + .../client_golang/prometheus/metric.go | 56 +- .../prometheus/process_collector.go | 115 ++- .../client_golang/prometheus/promhttp/http.go | 104 +-- .../client_golang/prometheus/registry.go | 120 +-- .../client_golang/prometheus/summary.go | 17 +- .../client_golang/prometheus/value.go | 12 +- .../client_golang/prometheus/wrap.go | 179 ++++ vendor/github.com/prometheus/tsdb/.gitignore | 1 + vendor/github.com/prometheus/tsdb/.travis.yml | 17 +- .../github.com/prometheus/tsdb/MAINTAINERS.md | 4 + vendor/github.com/prometheus/tsdb/Makefile | 34 + .../prometheus/tsdb/Makefile.common | 132 +++ vendor/github.com/prometheus/tsdb/README.md | 13 +- vendor/github.com/prometheus/tsdb/block.go | 96 +- .../github.com/prometheus/tsdb/checkpoint.go | 269 ++++++ .../prometheus/tsdb/chunkenc/xor.go | 4 
+- .../prometheus/tsdb/chunks/chunks.go | 16 +- vendor/github.com/prometheus/tsdb/compact.go | 53 +- vendor/github.com/prometheus/tsdb/db.go | 436 +++++---- .../prometheus/tsdb/encoding_helpers.go | 2 +- .../prometheus/tsdb/fileutil/fileutil.go | 112 +++ .../prometheus/tsdb/fileutil/flock.go | 41 + .../prometheus/tsdb/fileutil/flock_plan9.go | 32 + .../prometheus/tsdb/fileutil/flock_solaris.go | 59 ++ .../prometheus/tsdb/fileutil/flock_unix.go | 54 ++ .../prometheus/tsdb/fileutil/flock_windows.go | 36 + .../prometheus/tsdb/fileutil/sync_linux.go | 2 +- vendor/github.com/prometheus/tsdb/head.go | 535 ++++++++--- .../github.com/prometheus/tsdb/index/index.go | 68 +- vendor/github.com/prometheus/tsdb/querier.go | 30 +- vendor/github.com/prometheus/tsdb/record.go | 213 +++++ vendor/github.com/prometheus/tsdb/repair.go | 36 +- .../github.com/prometheus/tsdb/tombstones.go | 64 +- vendor/github.com/prometheus/tsdb/wal.go | 155 +++- vendor/github.com/prometheus/tsdb/wal/wal.go | 856 ++++++++++++++++++ 54 files changed, 3689 insertions(+), 1183 deletions(-) delete mode 100644 vendor/github.com/nightlyone/lockfile/.gitignore delete mode 100644 vendor/github.com/nightlyone/lockfile/.gitmodules delete mode 100644 vendor/github.com/nightlyone/lockfile/.travis.yml delete mode 100644 vendor/github.com/nightlyone/lockfile/LICENSE delete mode 100644 vendor/github.com/nightlyone/lockfile/README.md delete mode 100644 vendor/github.com/nightlyone/lockfile/appveyor.yml delete mode 100644 vendor/github.com/nightlyone/lockfile/lockfile.go delete mode 100644 vendor/github.com/nightlyone/lockfile/lockfile_unix.go delete mode 100644 vendor/github.com/nightlyone/lockfile/lockfile_windows.go create mode 100644 vendor/github.com/prometheus/client_golang/prometheus/internal/metric.go create mode 100644 vendor/github.com/prometheus/client_golang/prometheus/wrap.go create mode 100644 vendor/github.com/prometheus/tsdb/.gitignore create mode 100644 vendor/github.com/prometheus/tsdb/MAINTAINERS.md 
create mode 100644 vendor/github.com/prometheus/tsdb/Makefile create mode 100644 vendor/github.com/prometheus/tsdb/Makefile.common create mode 100644 vendor/github.com/prometheus/tsdb/checkpoint.go create mode 100644 vendor/github.com/prometheus/tsdb/fileutil/flock.go create mode 100644 vendor/github.com/prometheus/tsdb/fileutil/flock_plan9.go create mode 100644 vendor/github.com/prometheus/tsdb/fileutil/flock_solaris.go create mode 100644 vendor/github.com/prometheus/tsdb/fileutil/flock_unix.go create mode 100644 vendor/github.com/prometheus/tsdb/fileutil/flock_windows.go create mode 100644 vendor/github.com/prometheus/tsdb/record.go create mode 100644 vendor/github.com/prometheus/tsdb/wal/wal.go diff --git a/Gopkg.lock b/Gopkg.lock index f35732327e8..24ac5e7ded4 100644 --- a/Gopkg.lock +++ b/Gopkg.lock @@ -613,14 +613,6 @@ pruneopts = "UT" revision = "164c5fae744b141cd3e4a182b40d66fd4655c822" -[[projects]] - branch = "master" - digest = "1:16886567e49201f2bb97fc738dfe8097494764135a83b533fc020fcefe37d8fe" - name = "github.com/nightlyone/lockfile" - packages = ["."] - pruneopts = "UT" - revision = "6a197d5ea61168f2ac821de2b7f011b250904900" - [[projects]] digest = "1:d5f4c5b22e15b338051c4e26387e980b60d8e9eed13938adf1c415913ac64dbd" name = "github.com/oklog/oklog" @@ -723,17 +715,18 @@ [[projects]] branch = "master" - digest = "1:511d8b5b4cef21fdc5c660e5d9c46a2d6d039809d9600ff1518b033e300522b9" + digest = "1:38fc24d62aaef064aa5277a6c9560c8520d94633e636804dc13ec3a7da24a4c6" name = "github.com/prometheus/client_golang" packages = [ "api", "api/prometheus/v1", "prometheus", + "prometheus/internal", "prometheus/promauto", "prometheus/promhttp", ] pruneopts = "UT" - revision = "bcbbc08eb2ddff3af83bbf11e7ec13b4fd730b6e" + revision = "f30f428035633da15d00d3dfefb0128c5e569ef4" [[projects]] branch = "master" @@ -818,7 +811,7 @@ [[projects]] branch = "master" - digest = "1:8a81e6f6374afec45e2a8686f8c25e9551fa0fd50187ae0de767e54b1ae6e8cb" + digest = 
"1:2cdda56c55a79af892e94b2f2a9f63558a8dbb62f1c9125263ea0e45cfde981a" name = "github.com/prometheus/tsdb" packages = [ ".", @@ -827,9 +820,10 @@ "fileutil", "index", "labels", + "wal", ] pruneopts = "UT" - revision = "195bc0d286b077f5633eb4bda76743620588b2fa" + revision = "23a5f09085e5ee9423ea570f9c0bba4a02eae854" [[projects]] branch = "master" @@ -1412,6 +1406,7 @@ "github.com/prometheus/prometheus/web/api/v1", "github.com/prometheus/tsdb", "github.com/prometheus/tsdb/fileutil", + "github.com/prometheus/tsdb/wal", "github.com/segmentio/fasthash/fnv1a", "github.com/stretchr/testify/assert", "github.com/stretchr/testify/require", diff --git a/vendor/github.com/nightlyone/lockfile/.gitignore b/vendor/github.com/nightlyone/lockfile/.gitignore deleted file mode 100644 index 5a05665dee6..00000000000 --- a/vendor/github.com/nightlyone/lockfile/.gitignore +++ /dev/null @@ -1,27 +0,0 @@ -# Compiled Object files, Static and Dynamic libs (Shared Objects) -*.o -*.a -*.so - -# Folders -_obj -_test - -# popular temporaries -.err -.out -.diff - -# Architecture specific extensions/prefixes -*.[568vq] -[568vq].out - -*.cgo1.go -*.cgo2.c -_cgo_defun.c -_cgo_gotypes.go -_cgo_export.* - -_testmain.go - -*.exe diff --git a/vendor/github.com/nightlyone/lockfile/.gitmodules b/vendor/github.com/nightlyone/lockfile/.gitmodules deleted file mode 100644 index 6faa9e3469f..00000000000 --- a/vendor/github.com/nightlyone/lockfile/.gitmodules +++ /dev/null @@ -1,3 +0,0 @@ -[submodule "git-hooks"] - path = git-hooks - url = https://github.com/nightlyone/git-hooks diff --git a/vendor/github.com/nightlyone/lockfile/.travis.yml b/vendor/github.com/nightlyone/lockfile/.travis.yml deleted file mode 100644 index 76e5962bf1f..00000000000 --- a/vendor/github.com/nightlyone/lockfile/.travis.yml +++ /dev/null @@ -1,14 +0,0 @@ -language: go -go: - - 1.4.3 - - 1.6.2 - - tip - -# Only test commits to production branch and all pull requests -branches: - only: - - master - -matrix: - allow_failures: - - go: tip 
diff --git a/vendor/github.com/nightlyone/lockfile/LICENSE b/vendor/github.com/nightlyone/lockfile/LICENSE deleted file mode 100644 index eb5b8046858..00000000000 --- a/vendor/github.com/nightlyone/lockfile/LICENSE +++ /dev/null @@ -1,19 +0,0 @@ -Copyright (c) 2012 Ingo Oeser - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. diff --git a/vendor/github.com/nightlyone/lockfile/README.md b/vendor/github.com/nightlyone/lockfile/README.md deleted file mode 100644 index c35235cdaee..00000000000 --- a/vendor/github.com/nightlyone/lockfile/README.md +++ /dev/null @@ -1,52 +0,0 @@ -lockfile -========= -Handle locking via pid files. 
- -[![Build Status Unix][1]][2] -[![Build status Windows][3]][4] - -[1]: https://secure.travis-ci.org/nightlyone/lockfile.png -[2]: https://travis-ci.org/nightlyone/lockfile -[3]: https://ci.appveyor.com/api/projects/status/7mojkmauj81uvp8u/branch/master?svg=true -[4]: https://ci.appveyor.com/project/nightlyone/lockfile/branch/master - - - -install -------- -Install [Go 1][5], either [from source][6] or [with a prepackaged binary][7]. -For Windows suport, Go 1.4 or newer is required. - -Then run - - go get github.com/nightlyone/lockfile - -[5]: http://golang.org -[6]: http://golang.org/doc/install/source -[7]: http://golang.org/doc/install - -LICENSE -------- -MIT - -documentation -------------- -[package documentation at godoc.org](http://godoc.org/github.com/nightlyone/lockfile) - -install -------------------- - go get github.com/nightlyone/lockfile - - -contributing -============ - -Contributions are welcome. Please open an issue or send me a pull request for a dedicated branch. -Make sure the git commit hooks show it works. - -git commit hooks ------------------------ -enable commit hooks via - - cd .git ; rm -rf hooks; ln -s ../git-hooks hooks ; cd .. - diff --git a/vendor/github.com/nightlyone/lockfile/appveyor.yml b/vendor/github.com/nightlyone/lockfile/appveyor.yml deleted file mode 100644 index cf72a58b137..00000000000 --- a/vendor/github.com/nightlyone/lockfile/appveyor.yml +++ /dev/null @@ -1,12 +0,0 @@ -clone_folder: c:\gopath\src\github.com\nightlyone\lockfile - -environment: - GOPATH: c:\gopath - -install: - - go version - - go env - - go get -v -t ./... - -build_script: - - go test -v ./... diff --git a/vendor/github.com/nightlyone/lockfile/lockfile.go b/vendor/github.com/nightlyone/lockfile/lockfile.go deleted file mode 100644 index af2d84a5565..00000000000 --- a/vendor/github.com/nightlyone/lockfile/lockfile.go +++ /dev/null @@ -1,211 +0,0 @@ -// Package lockfile handles pid file based locking. 
-// While a sync.Mutex helps against concurrency issues within a single process, -// this package is designed to help against concurrency issues between cooperating processes -// or serializing multiple invocations of the same process. You can also combine sync.Mutex -// with Lockfile in order to serialize an action between different goroutines in a single program -// and also multiple invocations of this program. -package lockfile - -import ( - "errors" - "fmt" - "io" - "io/ioutil" - "os" - "path/filepath" -) - -// Lockfile is a pid file which can be locked -type Lockfile string - -// TemporaryError is a type of error where a retry after a random amount of sleep should help to mitigate it. -type TemporaryError string - -func (t TemporaryError) Error() string { return string(t) } - -// Temporary returns always true. -// It exists, so you can detect it via -// if te, ok := err.(interface{ Temporary() bool }); ok { -// fmt.Println("I am a temporay error situation, so wait and retry") -// } -func (t TemporaryError) Temporary() bool { return true } - -// Various errors returned by this package -var ( - ErrBusy = TemporaryError("Locked by other process") // If you get this, retry after a short sleep might help - ErrNotExist = TemporaryError("Lockfile created, but doesn't exist") // If you get this, retry after a short sleep might help - ErrNeedAbsPath = errors.New("Lockfiles must be given as absolute path names") - ErrInvalidPid = errors.New("Lockfile contains invalid pid for system") - ErrDeadOwner = errors.New("Lockfile contains pid of process not existent on this system anymore") - ErrRogueDeletion = errors.New("Lockfile owned by me has been removed unexpectedly") -) - -// New describes a new filename located at the given absolute path. -func New(path string) (Lockfile, error) { - if !filepath.IsAbs(path) { - return Lockfile(""), ErrNeedAbsPath - } - return Lockfile(path), nil -} - -// GetOwner returns who owns the lockfile. 
-func (l Lockfile) GetOwner() (*os.Process, error) { - name := string(l) - - // Ok, see, if we have a stale lockfile here - content, err := ioutil.ReadFile(name) - if err != nil { - return nil, err - } - - // try hard for pids. If no pid, the lockfile is junk anyway and we delete it. - pid, err := scanPidLine(content) - if err != nil { - return nil, err - } - running, err := isRunning(pid) - if err != nil { - return nil, err - } - - if running { - proc, err := os.FindProcess(pid) - if err != nil { - return nil, err - } - return proc, nil - } - return nil, ErrDeadOwner - -} - -// TryLock tries to own the lock. -// It Returns nil, if successful and and error describing the reason, it didn't work out. -// Please note, that existing lockfiles containing pids of dead processes -// and lockfiles containing no pid at all are simply deleted. -func (l Lockfile) TryLock() error { - name := string(l) - - // This has been checked by New already. If we trigger here, - // the caller didn't use New and re-implemented it's functionality badly. - // So panic, that he might find this easily during testing. - if !filepath.IsAbs(name) { - panic(ErrNeedAbsPath) - } - - tmplock, err := ioutil.TempFile(filepath.Dir(name), filepath.Base(name)+".") - if err != nil { - return err - } - - cleanup := func() { - _ = tmplock.Close() - _ = os.Remove(tmplock.Name()) - } - defer cleanup() - - if err := writePidLine(tmplock, os.Getpid()); err != nil { - return err - } - - // EEXIST and similiar error codes, caught by os.IsExist, are intentionally ignored, - // as it means that someone was faster creating this link - // and ignoring this kind of error is part of the algorithm. - // The we will probably fail the pid owner check later, if this process is still alive. - // We cannot ignore ALL errors, since failure to support hard links, disk full - // as well as many other errors can happen to a filesystem operation - // and we really want to abort on those. 
- if err := os.Link(tmplock.Name(), name); err != nil { - if !os.IsExist(err) { - return err - } - } - - fiTmp, err := os.Lstat(tmplock.Name()) - if err != nil { - return err - } - fiLock, err := os.Lstat(name) - if err != nil { - // tell user that a retry would be a good idea - if os.IsNotExist(err) { - return ErrNotExist - } - return err - } - - // Success - if os.SameFile(fiTmp, fiLock) { - return nil - } - - proc, err := l.GetOwner() - switch err { - default: - // Other errors -> defensively fail and let caller handle this - return err - case nil: - if proc.Pid != os.Getpid() { - return ErrBusy - } - case ErrDeadOwner, ErrInvalidPid: - // cases we can fix below - } - - // clean stale/invalid lockfile - err = os.Remove(name) - if err != nil { - // If it doesn't exist, then it doesn't matter who removed it. - if !os.IsNotExist(err) { - return err - } - } - - // now that the stale lockfile is gone, let's recurse - return l.TryLock() -} - -// Unlock a lock again, if we owned it. Returns any error that happend during release of lock. -func (l Lockfile) Unlock() error { - proc, err := l.GetOwner() - switch err { - case ErrInvalidPid, ErrDeadOwner: - return ErrRogueDeletion - case nil: - if proc.Pid == os.Getpid() { - // we really own it, so let's remove it. - return os.Remove(string(l)) - } - // Not owned by me, so don't delete it. - return ErrRogueDeletion - default: - // This is an application error or system error. - // So give a better error for logging here. 
- if os.IsNotExist(err) { - return ErrRogueDeletion - } - // Other errors -> defensively fail and let caller handle this - return err - } -} - -func writePidLine(w io.Writer, pid int) error { - _, err := io.WriteString(w, fmt.Sprintf("%d\n", pid)) - return err -} - -func scanPidLine(content []byte) (int, error) { - if len(content) == 0 { - return 0, ErrInvalidPid - } - - var pid int - if _, err := fmt.Sscanln(string(content), &pid); err != nil { - return 0, ErrInvalidPid - } - - if pid <= 0 { - return 0, ErrInvalidPid - } - return pid, nil -} diff --git a/vendor/github.com/nightlyone/lockfile/lockfile_unix.go b/vendor/github.com/nightlyone/lockfile/lockfile_unix.go deleted file mode 100644 index 742b041fb62..00000000000 --- a/vendor/github.com/nightlyone/lockfile/lockfile_unix.go +++ /dev/null @@ -1,20 +0,0 @@ -// +build darwin dragonfly freebsd linux nacl netbsd openbsd solaris - -package lockfile - -import ( - "os" - "syscall" -) - -func isRunning(pid int) (bool, error) { - proc, err := os.FindProcess(pid) - if err != nil { - return false, err - } - - if err := proc.Signal(syscall.Signal(0)); err != nil { - return false, nil - } - return true, nil -} diff --git a/vendor/github.com/nightlyone/lockfile/lockfile_windows.go b/vendor/github.com/nightlyone/lockfile/lockfile_windows.go deleted file mode 100644 index 482bd91d7bf..00000000000 --- a/vendor/github.com/nightlyone/lockfile/lockfile_windows.go +++ /dev/null @@ -1,30 +0,0 @@ -package lockfile - -import ( - "syscall" -) - -//For some reason these consts don't exist in syscall. 
-const ( - error_invalid_parameter = 87 - code_still_active = 259 -) - -func isRunning(pid int) (bool, error) { - procHnd, err := syscall.OpenProcess(syscall.PROCESS_QUERY_INFORMATION, true, uint32(pid)) - if err != nil { - if scerr, ok := err.(syscall.Errno); ok { - if uintptr(scerr) == error_invalid_parameter { - return false, nil - } - } - } - - var code uint32 - err = syscall.GetExitCodeProcess(procHnd, &code) - if err != nil { - return false, err - } - - return code == code_still_active, nil -} diff --git a/vendor/github.com/prometheus/client_golang/api/prometheus/v1/api.go b/vendor/github.com/prometheus/client_golang/api/prometheus/v1/api.go index 192ba49c0d3..255f3ba1388 100644 --- a/vendor/github.com/prometheus/client_golang/api/prometheus/v1/api.go +++ b/vendor/github.com/prometheus/client_golang/api/prometheus/v1/api.go @@ -60,6 +60,8 @@ const ( ErrCanceled = "canceled" ErrExec = "execution" ErrBadResponse = "bad_response" + ErrServer = "server_error" + ErrClient = "client_error" // Possible values for HealthStatus. HealthGood HealthStatus = "up" @@ -69,8 +71,9 @@ const ( // Error is an error returned by the API. 
type Error struct { - Type ErrorType - Msg string + Type ErrorType + Msg string + Detail string } func (e *Error) Error() string { @@ -460,6 +463,16 @@ func apiError(code int) bool { return code == statusAPIError || code == http.StatusBadRequest } +func errorTypeAndMsgFor(resp *http.Response) (ErrorType, string) { + switch resp.StatusCode / 100 { + case 4: + return ErrClient, fmt.Sprintf("client error: %d", resp.StatusCode) + case 5: + return ErrServer, fmt.Sprintf("server error: %d", resp.StatusCode) + } + return ErrBadResponse, fmt.Sprintf("bad response code %d", resp.StatusCode) +} + func (c apiClient) Do(ctx context.Context, req *http.Request) (*http.Response, []byte, error) { resp, body, err := c.Client.Do(ctx, req) if err != nil { @@ -469,18 +482,22 @@ func (c apiClient) Do(ctx context.Context, req *http.Request) (*http.Response, [ code := resp.StatusCode if code/100 != 2 && !apiError(code) { + errorType, errorMsg := errorTypeAndMsgFor(resp) return resp, body, &Error{ - Type: ErrBadResponse, - Msg: fmt.Sprintf("bad response code %d", resp.StatusCode), + Type: errorType, + Msg: errorMsg, + Detail: string(body), } } var result apiResponse - if err = json.Unmarshal(body, &result); err != nil { - return resp, body, &Error{ - Type: ErrBadResponse, - Msg: err.Error(), + if http.StatusNoContent != code { + if err = json.Unmarshal(body, &result); err != nil { + return resp, body, &Error{ + Type: ErrBadResponse, + Msg: err.Error(), + } } } diff --git a/vendor/github.com/prometheus/client_golang/prometheus/collector.go b/vendor/github.com/prometheus/client_golang/prometheus/collector.go index 3c9bae24b9e..c0d70b2faf1 100644 --- a/vendor/github.com/prometheus/client_golang/prometheus/collector.go +++ b/vendor/github.com/prometheus/client_golang/prometheus/collector.go @@ -40,7 +40,8 @@ type Collector interface { // Collector may yield any Metric it sees fit in its Collect method. 
// // This method idempotently sends the same descriptors throughout the - // lifetime of the Collector. + // lifetime of the Collector. It may be called concurrently and + // therefore must be implemented in a concurrency safe way. // // If a Collector encounters an error while executing this method, it // must send an invalid descriptor (created with NewInvalidDesc) to @@ -61,6 +62,39 @@ type Collector interface { Collect(chan<- Metric) } +// DescribeByCollect is a helper to implement the Describe method of a custom +// Collector. It collects the metrics from the provided Collector and sends +// their descriptors to the provided channel. +// +// If a Collector collects the same metrics throughout its lifetime, its +// Describe method can simply be implemented as: +// +// func (c customCollector) Describe(ch chan<- *Desc) { +// DescribeByCollect(c, ch) +// } +// +// However, this will not work if the metrics collected change dynamically over +// the lifetime of the Collector in a way that their combined set of descriptors +// changes as well. The shortcut implementation will then violate the contract +// of the Describe method. If a Collector sometimes collects no metrics at all +// (for example vectors like CounterVec, GaugeVec, etc., which only collect +// metrics after a metric with a fully specified label set has been accessed), +// it might even get registered as an unchecked Collecter (cf. the Register +// method of the Registerer interface). Hence, only use this shortcut +// implementation of Describe if you are certain to fulfill the contract. +// +// The Collector example demonstrates a use of DescribeByCollect. +func DescribeByCollect(c Collector, descs chan<- *Desc) { + metrics := make(chan Metric) + go func() { + c.Collect(metrics) + close(metrics) + }() + for m := range metrics { + descs <- m.Desc() + } +} + // selfCollector implements Collector for a single Metric so that the Metric // collects itself. 
Add it as an anonymous field to a struct that implements // Metric, and call init with the Metric itself as an argument. diff --git a/vendor/github.com/prometheus/client_golang/prometheus/desc.go b/vendor/github.com/prometheus/client_golang/prometheus/desc.go index 4a755b0fa51..7b8827ffbca 100644 --- a/vendor/github.com/prometheus/client_golang/prometheus/desc.go +++ b/vendor/github.com/prometheus/client_golang/prometheus/desc.go @@ -67,7 +67,7 @@ type Desc struct { // NewDesc allocates and initializes a new Desc. Errors are recorded in the Desc // and will be reported on registration time. variableLabels and constLabels can -// be nil if no such labels should be set. fqName and help must not be empty. +// be nil if no such labels should be set. fqName must not be empty. // // variableLabels only contain the label names. Their label values are variable // and therefore not part of the Desc. (They are managed within the Metric.) @@ -80,10 +80,6 @@ func NewDesc(fqName, help string, variableLabels []string, constLabels Labels) * help: help, variableLabels: variableLabels, } - if help == "" { - d.err = errors.New("empty help string") - return d - } if !model.IsValidMetricName(model.LabelValue(fqName)) { d.err = fmt.Errorf("%q is not a valid metric name", fqName) return d @@ -156,7 +152,7 @@ func NewDesc(fqName, help string, variableLabels []string, constLabels Labels) * Value: proto.String(v), }) } - sort.Sort(LabelPairSorter(d.constLabelPairs)) + sort.Sort(labelPairSorter(d.constLabelPairs)) return d } diff --git a/vendor/github.com/prometheus/client_golang/prometheus/fnv.go b/vendor/github.com/prometheus/client_golang/prometheus/fnv.go index e3b67df8ac0..3d383a735c3 100644 --- a/vendor/github.com/prometheus/client_golang/prometheus/fnv.go +++ b/vendor/github.com/prometheus/client_golang/prometheus/fnv.go @@ -1,3 +1,16 @@ +// Copyright 2018 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except 
in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package prometheus // Inline and byte-free variant of hash/fnv's fnv64a. diff --git a/vendor/github.com/prometheus/client_golang/prometheus/go_collector.go b/vendor/github.com/prometheus/client_golang/prometheus/go_collector.go index 0440bd12065..ba3b9333edd 100644 --- a/vendor/github.com/prometheus/client_golang/prometheus/go_collector.go +++ b/vendor/github.com/prometheus/client_golang/prometheus/go_collector.go @@ -1,3 +1,16 @@ +// Copyright 2018 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ package prometheus import ( diff --git a/vendor/github.com/prometheus/client_golang/prometheus/histogram.go b/vendor/github.com/prometheus/client_golang/prometheus/histogram.go index 331783a75ce..4d7fa976e47 100644 --- a/vendor/github.com/prometheus/client_golang/prometheus/histogram.go +++ b/vendor/github.com/prometheus/client_golang/prometheus/histogram.go @@ -16,7 +16,9 @@ package prometheus import ( "fmt" "math" + "runtime" "sort" + "sync" "sync/atomic" "github.com/golang/protobuf/proto" @@ -108,8 +110,9 @@ func ExponentialBuckets(start, factor float64, count int) []float64 { } // HistogramOpts bundles the options for creating a Histogram metric. It is -// mandatory to set Name and Help to a non-empty string. All other fields are -// optional and can safely be left at their zero value. +// mandatory to set Name to a non-empty string. All other fields are optional +// and can safely be left at their zero value, although it is strongly +// encouraged to set a Help string. type HistogramOpts struct { // Namespace, Subsystem, and Name are components of the fully-qualified // name of the Histogram (created by joining these components with @@ -120,7 +123,7 @@ type HistogramOpts struct { Subsystem string Name string - // Help provides information about this Histogram. Mandatory! + // Help provides information about this Histogram. // // Metrics with the same fully-qualified name must have the same Help // string. @@ -184,6 +187,7 @@ func newHistogram(desc *Desc, opts HistogramOpts, labelValues ...string) Histogr desc: desc, upperBounds: opts.Buckets, labelPairs: makeLabelPairs(desc, labelValues), + counts: [2]*histogramCounts{&histogramCounts{}, &histogramCounts{}}, } for i, upperBound := range h.upperBounds { if i < len(h.upperBounds)-1 { @@ -200,28 +204,53 @@ func newHistogram(desc *Desc, opts HistogramOpts, labelValues ...string) Histogr } } } - // Finally we know the final length of h.upperBounds and can make counts. 
- h.counts = make([]uint64, len(h.upperBounds)) + // Finally we know the final length of h.upperBounds and can make counts + // for both states: + h.counts[0].buckets = make([]uint64, len(h.upperBounds)) + h.counts[1].buckets = make([]uint64, len(h.upperBounds)) h.init(h) // Init self-collection. return h } -type histogram struct { +type histogramCounts struct { // sumBits contains the bits of the float64 representing the sum of all // observations. sumBits and count have to go first in the struct to // guarantee alignment for atomic operations. // http://golang.org/pkg/sync/atomic/#pkg-note-BUG sumBits uint64 count uint64 + buckets []uint64 +} - selfCollector - // Note that there is no mutex required. +type histogram struct { + // countAndHotIdx is a complicated one. For lock-free yet atomic + // observations, we need to save the total count of observations again, + // combined with the index of the currently-hot counts struct, so that + // we can perform the operation on both values atomically. The least + // significant bit defines the hot counts struct. The remaining 63 bits + // represent the total count of observations. This happens under the + // assumption that the 63bit count will never overflow. Rationale: An + // observations takes about 30ns. Let's assume it could happen in + // 10ns. Overflowing the counter will then take at least (2^63)*10ns, + // which is about 3000 years. + // + // This has to be first in the struct for 64bit alignment. See + // http://golang.org/pkg/sync/atomic/#pkg-note-BUG + countAndHotIdx uint64 - desc *Desc + selfCollector + desc *Desc + writeMtx sync.Mutex // Only used in the Write method. upperBounds []float64 - counts []uint64 + + // Two counts, one is "hot" for lock-free observations, the other is + // "cold" for writing out a dto.Metric. It has to be an array of + // pointers to guarantee 64bit alignment of the histogramCounts, see + // http://golang.org/pkg/sync/atomic/#pkg-note-BUG. 
+ counts [2]*histogramCounts + hotIdx int // Index of currently-hot counts. Only used within Write. labelPairs []*dto.LabelPair } @@ -241,36 +270,113 @@ func (h *histogram) Observe(v float64) { // 100 buckets: 78.1 ns/op linear - binary 54.9 ns/op // 300 buckets: 154 ns/op linear - binary 61.6 ns/op i := sort.SearchFloat64s(h.upperBounds, v) - if i < len(h.counts) { - atomic.AddUint64(&h.counts[i], 1) + + // We increment h.countAndHotIdx by 2 so that the counter in the upper + // 63 bits gets incremented by 1. At the same time, we get the new value + // back, which we can use to find the currently-hot counts. + n := atomic.AddUint64(&h.countAndHotIdx, 2) + hotCounts := h.counts[n%2] + + if i < len(h.upperBounds) { + atomic.AddUint64(&hotCounts.buckets[i], 1) } - atomic.AddUint64(&h.count, 1) for { - oldBits := atomic.LoadUint64(&h.sumBits) + oldBits := atomic.LoadUint64(&hotCounts.sumBits) newBits := math.Float64bits(math.Float64frombits(oldBits) + v) - if atomic.CompareAndSwapUint64(&h.sumBits, oldBits, newBits) { + if atomic.CompareAndSwapUint64(&hotCounts.sumBits, oldBits, newBits) { break } } + // Increment count last as we take it as a signal that the observation + // is complete. + atomic.AddUint64(&hotCounts.count, 1) } func (h *histogram) Write(out *dto.Metric) error { - his := &dto.Histogram{} - buckets := make([]*dto.Bucket, len(h.upperBounds)) + var ( + his = &dto.Histogram{} + buckets = make([]*dto.Bucket, len(h.upperBounds)) + hotCounts, coldCounts *histogramCounts + count uint64 + ) - his.SampleSum = proto.Float64(math.Float64frombits(atomic.LoadUint64(&h.sumBits))) - his.SampleCount = proto.Uint64(atomic.LoadUint64(&h.count)) - var count uint64 + // For simplicity, we mutex the rest of this method. It is not in the + // hot path, i.e. Observe is called much more often than Write. The + // complication of making Write lock-free isn't worth it. 
+ h.writeMtx.Lock() + defer h.writeMtx.Unlock() + + // This is a bit arcane, which is why the following spells out this if + // clause in English: + // + // If the currently-hot counts struct is #0, we atomically increment + // h.countAndHotIdx by 1 so that from now on Observe will use the counts + // struct #1. Furthermore, the atomic increment gives us the new value, + // which, in its most significant 63 bits, tells us the count of + // observations done so far up to and including currently ongoing + // observations still using the counts struct just changed from hot to + // cold. To have a normal uint64 for the count, we bitshift by 1 and + // save the result in count. We also set h.hotIdx to 1 for the next + // Write call, and we will refer to counts #1 as hotCounts and to counts + // #0 as coldCounts. + // + // If the currently-hot counts struct is #1, we do the corresponding + // things the other way round. We have to _decrement_ h.countAndHotIdx + // (which is a bit arcane in itself, as we have to express -1 with an + // unsigned int...). + if h.hotIdx == 0 { + count = atomic.AddUint64(&h.countAndHotIdx, 1) >> 1 + h.hotIdx = 1 + hotCounts = h.counts[1] + coldCounts = h.counts[0] + } else { + count = atomic.AddUint64(&h.countAndHotIdx, ^uint64(0)) >> 1 // Decrement. + h.hotIdx = 0 + hotCounts = h.counts[0] + coldCounts = h.counts[1] + } + + // Now we have to wait for the now-declared-cold counts to actually cool + // down, i.e. wait for all observations still using it to finish. That's + // the case once the count in the cold counts struct is the same as the + // one atomically retrieved from the upper 63bits of h.countAndHotIdx. + for { + if count == atomic.LoadUint64(&coldCounts.count) { + break + } + runtime.Gosched() // Let observations get work done. 
+ } + + his.SampleCount = proto.Uint64(count) + his.SampleSum = proto.Float64(math.Float64frombits(atomic.LoadUint64(&coldCounts.sumBits))) + var cumCount uint64 for i, upperBound := range h.upperBounds { - count += atomic.LoadUint64(&h.counts[i]) + cumCount += atomic.LoadUint64(&coldCounts.buckets[i]) buckets[i] = &dto.Bucket{ - CumulativeCount: proto.Uint64(count), + CumulativeCount: proto.Uint64(cumCount), UpperBound: proto.Float64(upperBound), } } + his.Bucket = buckets out.Histogram = his out.Label = h.labelPairs + + // Finally add all the cold counts to the new hot counts and reset the cold counts. + atomic.AddUint64(&hotCounts.count, count) + atomic.StoreUint64(&coldCounts.count, 0) + for { + oldBits := atomic.LoadUint64(&hotCounts.sumBits) + newBits := math.Float64bits(math.Float64frombits(oldBits) + his.GetSampleSum()) + if atomic.CompareAndSwapUint64(&hotCounts.sumBits, oldBits, newBits) { + atomic.StoreUint64(&coldCounts.sumBits, 0) + break + } + } + for i := range h.upperBounds { + atomic.AddUint64(&hotCounts.buckets[i], atomic.LoadUint64(&coldCounts.buckets[i])) + atomic.StoreUint64(&coldCounts.buckets[i], 0) + } return nil } @@ -454,7 +560,7 @@ func (h *constHistogram) Write(out *dto.Metric) error { // bucket. // // NewConstHistogram returns an error if the length of labelValues is not -// consistent with the variable labels in Desc. +// consistent with the variable labels in Desc or if Desc is invalid. 
func NewConstHistogram( desc *Desc, count uint64, @@ -462,6 +568,9 @@ func NewConstHistogram( buckets map[float64]uint64, labelValues ...string, ) (Metric, error) { + if desc.err != nil { + return nil, desc.err + } if err := validateLabelValues(labelValues, len(desc.variableLabels)); err != nil { return nil, err } diff --git a/vendor/github.com/prometheus/client_golang/prometheus/http.go b/vendor/github.com/prometheus/client_golang/prometheus/http.go index 4d08154b230..9f0875bfc81 100644 --- a/vendor/github.com/prometheus/client_golang/prometheus/http.go +++ b/vendor/github.com/prometheus/client_golang/prometheus/http.go @@ -15,9 +15,7 @@ package prometheus import ( "bufio" - "bytes" "compress/gzip" - "fmt" "io" "net" "net/http" @@ -41,19 +39,10 @@ const ( acceptEncodingHeader = "Accept-Encoding" ) -var bufPool sync.Pool - -func getBuf() *bytes.Buffer { - buf := bufPool.Get() - if buf == nil { - return &bytes.Buffer{} - } - return buf.(*bytes.Buffer) -} - -func giveBuf(buf *bytes.Buffer) { - buf.Reset() - bufPool.Put(buf) +var gzipPool = sync.Pool{ + New: func() interface{} { + return gzip.NewWriter(nil) + }, } // Handler returns an HTTP handler for the DefaultGatherer. It is @@ -61,68 +50,50 @@ func giveBuf(buf *bytes.Buffer) { // name). // // Deprecated: Please note the issues described in the doc comment of -// InstrumentHandler. You might want to consider using -// promhttp.InstrumentedHandler instead. +// InstrumentHandler. You might want to consider using promhttp.Handler instead. func Handler() http.Handler { return InstrumentHandler("prometheus", UninstrumentedHandler()) } // UninstrumentedHandler returns an HTTP handler for the DefaultGatherer. // -// Deprecated: Use promhttp.Handler instead. See there for further documentation. +// Deprecated: Use promhttp.HandlerFor(DefaultGatherer, promhttp.HandlerOpts{}) +// instead. See there for further documentation. 
func UninstrumentedHandler() http.Handler { - return http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) { + return http.HandlerFunc(func(rsp http.ResponseWriter, req *http.Request) { mfs, err := DefaultGatherer.Gather() if err != nil { - http.Error(w, "An error has occurred during metrics collection:\n\n"+err.Error(), http.StatusInternalServerError) + httpError(rsp, err) return } contentType := expfmt.Negotiate(req.Header) - buf := getBuf() - defer giveBuf(buf) - writer, encoding := decorateWriter(req, buf) - enc := expfmt.NewEncoder(writer, contentType) - var lastErr error + header := rsp.Header() + header.Set(contentTypeHeader, string(contentType)) + + w := io.Writer(rsp) + if gzipAccepted(req.Header) { + header.Set(contentEncodingHeader, "gzip") + gz := gzipPool.Get().(*gzip.Writer) + defer gzipPool.Put(gz) + + gz.Reset(w) + defer gz.Close() + + w = gz + } + + enc := expfmt.NewEncoder(w, contentType) + for _, mf := range mfs { if err := enc.Encode(mf); err != nil { - lastErr = err - http.Error(w, "An error has occurred during metrics encoding:\n\n"+err.Error(), http.StatusInternalServerError) + httpError(rsp, err) return } } - if closer, ok := writer.(io.Closer); ok { - closer.Close() - } - if lastErr != nil && buf.Len() == 0 { - http.Error(w, "No metrics encoded, last error:\n\n"+lastErr.Error(), http.StatusInternalServerError) - return - } - header := w.Header() - header.Set(contentTypeHeader, string(contentType)) - header.Set(contentLengthHeader, fmt.Sprint(buf.Len())) - if encoding != "" { - header.Set(contentEncodingHeader, encoding) - } - w.Write(buf.Bytes()) }) } -// decorateWriter wraps a writer to handle gzip compression if requested. It -// returns the decorated writer and the appropriate "Content-Encoding" header -// (which is empty if no compression is enabled). 
-func decorateWriter(request *http.Request, writer io.Writer) (io.Writer, string) { - header := request.Header.Get(acceptEncodingHeader) - parts := strings.Split(header, ",") - for _, part := range parts { - part = strings.TrimSpace(part) - if part == "gzip" || strings.HasPrefix(part, "gzip;") { - return gzip.NewWriter(writer), "gzip" - } - } - return writer, "" -} - var instLabels = []string{"method", "code"} type nower interface { @@ -149,21 +120,14 @@ var now nower = nowFunc(func() time.Time { // (label name "method") and HTTP status code (label name "code"). // // Deprecated: InstrumentHandler has several issues. Use the tooling provided in -// package promhttp instead. The issues are the following: -// -// - It uses Summaries rather than Histograms. Summaries are not useful if -// aggregation across multiple instances is required. -// -// - It uses microseconds as unit, which is deprecated and should be replaced by -// seconds. -// -// - The size of the request is calculated in a separate goroutine. Since this -// calculator requires access to the request header, it creates a race with -// any writes to the header performed during request handling. -// httputil.ReverseProxy is a prominent example for a handler -// performing such writes. -// -// - It has additional issues with HTTP/2, cf. +// package promhttp instead. The issues are the following: (1) It uses Summaries +// rather than Histograms. Summaries are not useful if aggregation across +// multiple instances is required. (2) It uses microseconds as unit, which is +// deprecated and should be replaced by seconds. (3) The size of the request is +// calculated in a separate goroutine. Since this calculator requires access to +// the request header, it creates a race with any writes to the header performed +// during request handling. httputil.ReverseProxy is a prominent example for a +// handler performing such writes. (4) It has additional issues with HTTP/2, cf. 
// https://github.com/prometheus/client_golang/issues/272. func InstrumentHandler(handlerName string, handler http.Handler) http.HandlerFunc { return InstrumentHandlerFunc(handlerName, handler.ServeHTTP) @@ -307,7 +271,7 @@ func InstrumentHandlerFuncWithOpts(opts SummaryOpts, handlerFunc func(http.Respo } func computeApproximateRequestSize(r *http.Request) <-chan int { - // Get URL length in current go routine for avoiding a race condition. + // Get URL length in current goroutine for avoiding a race condition. // HandlerFunc that runs in parallel may modify the URL. s := 0 if r.URL != nil { @@ -510,3 +474,31 @@ func sanitizeCode(s int) string { return strconv.Itoa(s) } } + +// gzipAccepted returns whether the client will accept gzip-encoded content. +func gzipAccepted(header http.Header) bool { + a := header.Get(acceptEncodingHeader) + parts := strings.Split(a, ",") + for _, part := range parts { + part = strings.TrimSpace(part) + if part == "gzip" || strings.HasPrefix(part, "gzip;") { + return true + } + } + return false +} + +// httpError removes any content-encoding header and then calls http.Error with +// the provided error and http.StatusInternalServerErrer. Error contents is +// supposed to be uncompressed plain text. However, same as with a plain +// http.Error, any header settings will be void if the header has already been +// sent. The error message will still be written to the writer, but it will +// probably be of limited use. 
+func httpError(rsp http.ResponseWriter, err error) { + rsp.Header().Del(contentEncodingHeader) + http.Error( + rsp, + "An error has occurred while serving metrics:\n\n"+err.Error(), + http.StatusInternalServerError, + ) +} diff --git a/vendor/github.com/prometheus/client_golang/prometheus/internal/metric.go b/vendor/github.com/prometheus/client_golang/prometheus/internal/metric.go new file mode 100644 index 00000000000..351c26e1aed --- /dev/null +++ b/vendor/github.com/prometheus/client_golang/prometheus/internal/metric.go @@ -0,0 +1,85 @@ +// Copyright 2018 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package internal + +import ( + "sort" + + dto "github.com/prometheus/client_model/go" +) + +// metricSorter is a sortable slice of *dto.Metric. +type metricSorter []*dto.Metric + +func (s metricSorter) Len() int { + return len(s) +} + +func (s metricSorter) Swap(i, j int) { + s[i], s[j] = s[j], s[i] +} + +func (s metricSorter) Less(i, j int) bool { + if len(s[i].Label) != len(s[j].Label) { + // This should not happen. The metrics are + // inconsistent. However, we have to deal with the fact, as + // people might use custom collectors or metric family injection + // to create inconsistent metrics. So let's simply compare the + // number of labels in this case. That will still yield + // reproducible sorting. 
+ return len(s[i].Label) < len(s[j].Label) + } + for n, lp := range s[i].Label { + vi := lp.GetValue() + vj := s[j].Label[n].GetValue() + if vi != vj { + return vi < vj + } + } + + // We should never arrive here. Multiple metrics with the same + // label set in the same scrape will lead to undefined ingestion + // behavior. However, as above, we have to provide stable sorting + // here, even for inconsistent metrics. So sort equal metrics + // by their timestamp, with missing timestamps (implying "now") + // coming last. + if s[i].TimestampMs == nil { + return false + } + if s[j].TimestampMs == nil { + return true + } + return s[i].GetTimestampMs() < s[j].GetTimestampMs() +} + +// NormalizeMetricFamilies returns a MetricFamily slice with empty +// MetricFamilies pruned and the remaining MetricFamilies sorted by name within +// the slice, with the contained Metrics sorted within each MetricFamily. +func NormalizeMetricFamilies(metricFamiliesByName map[string]*dto.MetricFamily) []*dto.MetricFamily { + for _, mf := range metricFamiliesByName { + sort.Sort(metricSorter(mf.Metric)) + } + names := make([]string, 0, len(metricFamiliesByName)) + for name, mf := range metricFamiliesByName { + if len(mf.Metric) > 0 { + names = append(names, name) + } + } + sort.Strings(names) + result := make([]*dto.MetricFamily, 0, len(names)) + for _, name := range names { + result = append(result, metricFamiliesByName[name]) + } + return result +} diff --git a/vendor/github.com/prometheus/client_golang/prometheus/labels.go b/vendor/github.com/prometheus/client_golang/prometheus/labels.go index 2502e37348f..e68f132ecef 100644 --- a/vendor/github.com/prometheus/client_golang/prometheus/labels.go +++ b/vendor/github.com/prometheus/client_golang/prometheus/labels.go @@ -1,3 +1,16 @@ +// Copyright 2018 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package prometheus import ( diff --git a/vendor/github.com/prometheus/client_golang/prometheus/metric.go b/vendor/github.com/prometheus/client_golang/prometheus/metric.go index 76035bca547..55e6d86d596 100644 --- a/vendor/github.com/prometheus/client_golang/prometheus/metric.go +++ b/vendor/github.com/prometheus/client_golang/prometheus/metric.go @@ -15,6 +15,9 @@ package prometheus import ( "strings" + "time" + + "github.com/golang/protobuf/proto" dto "github.com/prometheus/client_model/go" ) @@ -43,9 +46,8 @@ type Metric interface { // While populating dto.Metric, it is the responsibility of the // implementation to ensure validity of the Metric protobuf (like valid // UTF-8 strings or syntactically valid metric and label names). It is - // recommended to sort labels lexicographically. (Implementers may find - // LabelPairSorter useful for that.) Callers of Write should still make - // sure of sorting if they depend on it. + // recommended to sort labels lexicographically. Callers of Write should + // still make sure of sorting if they depend on it. Write(*dto.Metric) error // TODO(beorn7): The original rationale of passing in a pre-allocated // dto.Metric protobuf to save allocations has disappeared. The @@ -57,8 +59,9 @@ type Metric interface { // implementation XXX has its own XXXOpts type, but in most cases, it is just be // an alias of this type (which might change when the requirement arises.) // -// It is mandatory to set Name and Help to a non-empty string. All other fields -// are optional and can safely be left at their zero value. 
+// It is mandatory to set Name to a non-empty string. All other fields are +// optional and can safely be left at their zero value, although it is strongly +// encouraged to set a Help string. type Opts struct { // Namespace, Subsystem, and Name are components of the fully-qualified // name of the Metric (created by joining these components with @@ -69,7 +72,7 @@ type Opts struct { Subsystem string Name string - // Help provides information about this metric. Mandatory! + // Help provides information about this metric. // // Metrics with the same fully-qualified name must have the same Help // string. @@ -110,20 +113,19 @@ func BuildFQName(namespace, subsystem, name string) string { return name } -// LabelPairSorter implements sort.Interface. It is used to sort a slice of -// dto.LabelPair pointers. This is useful for implementing the Write method of -// custom metrics. -type LabelPairSorter []*dto.LabelPair +// labelPairSorter implements sort.Interface. It is used to sort a slice of +// dto.LabelPair pointers. +type labelPairSorter []*dto.LabelPair -func (s LabelPairSorter) Len() int { +func (s labelPairSorter) Len() int { return len(s) } -func (s LabelPairSorter) Swap(i, j int) { +func (s labelPairSorter) Swap(i, j int) { s[i], s[j] = s[j], s[i] } -func (s LabelPairSorter) Less(i, j int) bool { +func (s labelPairSorter) Less(i, j int) bool { return s[i].GetName() < s[j].GetName() } @@ -142,3 +144,31 @@ func NewInvalidMetric(desc *Desc, err error) Metric { func (m *invalidMetric) Desc() *Desc { return m.desc } func (m *invalidMetric) Write(*dto.Metric) error { return m.err } + +type timestampedMetric struct { + Metric + t time.Time +} + +func (m timestampedMetric) Write(pb *dto.Metric) error { + e := m.Metric.Write(pb) + pb.TimestampMs = proto.Int64(m.t.Unix()*1000 + int64(m.t.Nanosecond()/1000000)) + return e +} + +// NewMetricWithTimestamp returns a new Metric wrapping the provided Metric in a +// way that it has an explicit timestamp set to the provided Time. 
This is only +// useful in rare cases as the timestamp of a Prometheus metric should usually +// be set by the Prometheus server during scraping. Exceptions include mirroring +// metrics with given timestamps from other metric +// sources. +// +// NewMetricWithTimestamp works best with MustNewConstMetric, +// MustNewConstHistogram, and MustNewConstSummary, see example. +// +// Currently, the exposition formats used by Prometheus are limited to +// millisecond resolution. Thus, the provided time will be rounded down to the +// next full millisecond value. +func NewMetricWithTimestamp(t time.Time, m Metric) Metric { + return timestampedMetric{Metric: m, t: t} +} diff --git a/vendor/github.com/prometheus/client_golang/prometheus/process_collector.go b/vendor/github.com/prometheus/client_golang/prometheus/process_collector.go index b80adc6e3d5..55176d58ce6 100644 --- a/vendor/github.com/prometheus/client_golang/prometheus/process_collector.go +++ b/vendor/github.com/prometheus/client_golang/prometheus/process_collector.go @@ -13,45 +13,74 @@ package prometheus -import "github.com/prometheus/procfs" +import ( + "errors" + "os" + + "github.com/prometheus/procfs" +) type processCollector struct { collectFn func(chan<- Metric) pidFn func() (int, error) + reportErrors bool cpuTotal *Desc openFDs, maxFDs *Desc - vsize, rss *Desc + vsize, maxVsize *Desc + rss *Desc startTime *Desc } +// ProcessCollectorOpts defines the behavior of a process metrics collector +// created with NewProcessCollector. +type ProcessCollectorOpts struct { + // PidFn returns the PID of the process the collector collects metrics + // for. It is called upon each collection. By default, the PID of the + // current process is used, as determined on construction time by + // calling os.Getpid(). + PidFn func() (int, error) + // If non-empty, each of the collected metrics is prefixed by the + // provided string and an underscore ("_"). 
+ Namespace string + // If true, any error encountered during collection is reported as an + // invalid metric (see NewInvalidMetric). Otherwise, errors are ignored + // and the collected metrics will be incomplete. (Possibly, no metrics + // will be collected at all.) While that's usually not desired, it is + // appropriate for the common "mix-in" of process metrics, where process + // metrics are nice to have, but failing to collect them should not + // disrupt the collection of the remaining metrics. + ReportErrors bool +} + // NewProcessCollector returns a collector which exports the current state of // process metrics including CPU, memory and file descriptor usage as well as -// the process start time for the given process ID under the given namespace. +// the process start time. The detailed behavior is defined by the provided +// ProcessCollectorOpts. The zero value of ProcessCollectorOpts creates a +// collector for the current process with an empty namespace string and no error +// reporting. // // Currently, the collector depends on a Linux-style proc filesystem and // therefore only exports metrics for Linux. -func NewProcessCollector(pid int, namespace string) Collector { - return NewProcessCollectorPIDFn( - func() (int, error) { return pid, nil }, - namespace, - ) -} - -// NewProcessCollectorPIDFn works like NewProcessCollector but the process ID is -// determined on each collect anew by calling the given pidFn function. 
-func NewProcessCollectorPIDFn( - pidFn func() (int, error), - namespace string, -) Collector { +// +// Note: An older version of this function had the following signature: +// +// NewProcessCollector(pid int, namespace string) Collector +// +// Most commonly, it was called as +// +// NewProcessCollector(os.Getpid(), "") +// +// The following call of the current version is equivalent to the above: +// +// NewProcessCollector(ProcessCollectorOpts{}) +func NewProcessCollector(opts ProcessCollectorOpts) Collector { ns := "" - if len(namespace) > 0 { - ns = namespace + "_" + if len(opts.Namespace) > 0 { + ns = opts.Namespace + "_" } - c := processCollector{ - pidFn: pidFn, - collectFn: func(chan<- Metric) {}, - + c := &processCollector{ + reportErrors: opts.ReportErrors, cpuTotal: NewDesc( ns+"process_cpu_seconds_total", "Total user and system CPU time spent in seconds.", @@ -72,6 +101,11 @@ func NewProcessCollectorPIDFn( "Virtual memory size in bytes.", nil, nil, ), + maxVsize: NewDesc( + ns+"process_virtual_memory_max_bytes", + "Maximum amount of virtual memory available in bytes.", + nil, nil, + ), rss: NewDesc( ns+"process_resident_memory_bytes", "Resident memory size in bytes.", @@ -84,12 +118,23 @@ func NewProcessCollectorPIDFn( ), } + if opts.PidFn == nil { + pid := os.Getpid() + c.pidFn = func() (int, error) { return pid, nil } + } else { + c.pidFn = opts.PidFn + } + // Set up process metric collection if supported by the runtime. if _, err := procfs.NewStat(); err == nil { c.collectFn = c.processCollect + } else { + c.collectFn = func(ch chan<- Metric) { + c.reportError(ch, nil, errors.New("process metrics not supported on this platform")) + } } - return &c + return c } // Describe returns all descriptions of the collector. 
@@ -98,6 +143,7 @@ func (c *processCollector) Describe(ch chan<- *Desc) { ch <- c.openFDs ch <- c.maxFDs ch <- c.vsize + ch <- c.maxVsize ch <- c.rss ch <- c.startTime } @@ -107,16 +153,16 @@ func (c *processCollector) Collect(ch chan<- Metric) { c.collectFn(ch) } -// TODO(ts): Bring back error reporting by reverting 7faf9e7 as soon as the -// client allows users to configure the error behavior. func (c *processCollector) processCollect(ch chan<- Metric) { pid, err := c.pidFn() if err != nil { + c.reportError(ch, nil, err) return } p, err := procfs.NewProc(pid) if err != nil { + c.reportError(ch, nil, err) return } @@ -126,14 +172,33 @@ func (c *processCollector) processCollect(ch chan<- Metric) { ch <- MustNewConstMetric(c.rss, GaugeValue, float64(stat.ResidentMemory())) if startTime, err := stat.StartTime(); err == nil { ch <- MustNewConstMetric(c.startTime, GaugeValue, startTime) + } else { + c.reportError(ch, c.startTime, err) } + } else { + c.reportError(ch, nil, err) } if fds, err := p.FileDescriptorsLen(); err == nil { ch <- MustNewConstMetric(c.openFDs, GaugeValue, float64(fds)) + } else { + c.reportError(ch, c.openFDs, err) } if limits, err := p.NewLimits(); err == nil { ch <- MustNewConstMetric(c.maxFDs, GaugeValue, float64(limits.OpenFiles)) + ch <- MustNewConstMetric(c.maxVsize, GaugeValue, float64(limits.AddressSpace)) + } else { + c.reportError(ch, nil, err) + } +} + +func (c *processCollector) reportError(ch chan<- Metric, desc *Desc, err error) { + if !c.reportErrors { + return + } + if desc == nil { + desc = NewInvalidDesc(err) } + ch <- NewInvalidMetric(desc, err) } diff --git a/vendor/github.com/prometheus/client_golang/prometheus/promhttp/http.go b/vendor/github.com/prometheus/client_golang/prometheus/promhttp/http.go index 01357374feb..668eb6b3c93 100644 --- a/vendor/github.com/prometheus/client_golang/prometheus/promhttp/http.go +++ b/vendor/github.com/prometheus/client_golang/prometheus/promhttp/http.go @@ -32,7 +32,6 @@ package promhttp 
import ( - "bytes" "compress/gzip" "fmt" "io" @@ -53,19 +52,10 @@ const ( acceptEncodingHeader = "Accept-Encoding" ) -var bufPool sync.Pool - -func getBuf() *bytes.Buffer { - buf := bufPool.Get() - if buf == nil { - return &bytes.Buffer{} - } - return buf.(*bytes.Buffer) -} - -func giveBuf(buf *bytes.Buffer) { - buf.Reset() - bufPool.Put(buf) +var gzipPool = sync.Pool{ + New: func() interface{} { + return gzip.NewWriter(nil) + }, } // Handler returns an http.Handler for the prometheus.DefaultGatherer, using @@ -100,19 +90,18 @@ func HandlerFor(reg prometheus.Gatherer, opts HandlerOpts) http.Handler { inFlightSem = make(chan struct{}, opts.MaxRequestsInFlight) } - h := http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) { + h := http.HandlerFunc(func(rsp http.ResponseWriter, req *http.Request) { if inFlightSem != nil { select { case inFlightSem <- struct{}{}: // All good, carry on. defer func() { <-inFlightSem }() default: - http.Error(w, fmt.Sprintf( + http.Error(rsp, fmt.Sprintf( "Limit of concurrent requests reached (%d), try again later.", opts.MaxRequestsInFlight, ), http.StatusServiceUnavailable) return } } - mfs, err := reg.Gather() if err != nil { if opts.ErrorLog != nil { @@ -123,26 +112,40 @@ func HandlerFor(reg prometheus.Gatherer, opts HandlerOpts) http.Handler { panic(err) case ContinueOnError: if len(mfs) == 0 { - http.Error(w, "No metrics gathered, last error:\n\n"+err.Error(), http.StatusInternalServerError) + // Still report the error if no metrics have been gathered. 
+ httpError(rsp, err) return } case HTTPErrorOnError: - http.Error(w, "An error has occurred during metrics gathering:\n\n"+err.Error(), http.StatusInternalServerError) + httpError(rsp, err) return } } contentType := expfmt.Negotiate(req.Header) - buf := getBuf() - defer giveBuf(buf) - writer, encoding := decorateWriter(req, buf, opts.DisableCompression) - enc := expfmt.NewEncoder(writer, contentType) + header := rsp.Header() + header.Set(contentTypeHeader, string(contentType)) + + w := io.Writer(rsp) + if !opts.DisableCompression && gzipAccepted(req.Header) { + header.Set(contentEncodingHeader, "gzip") + gz := gzipPool.Get().(*gzip.Writer) + defer gzipPool.Put(gz) + + gz.Reset(w) + defer gz.Close() + + w = gz + } + + enc := expfmt.NewEncoder(w, contentType) + var lastErr error for _, mf := range mfs { if err := enc.Encode(mf); err != nil { lastErr = err if opts.ErrorLog != nil { - opts.ErrorLog.Println("error encoding metric family:", err) + opts.ErrorLog.Println("error encoding and sending metric family:", err) } switch opts.ErrorHandling { case PanicOnError: @@ -150,28 +153,15 @@ func HandlerFor(reg prometheus.Gatherer, opts HandlerOpts) http.Handler { case ContinueOnError: // Handled later. 
case HTTPErrorOnError: - http.Error(w, "An error has occurred during metrics encoding:\n\n"+err.Error(), http.StatusInternalServerError) + httpError(rsp, err) return } } } - if closer, ok := writer.(io.Closer); ok { - closer.Close() - } - if lastErr != nil && buf.Len() == 0 { - http.Error(w, "No metrics encoded, last error:\n\n"+lastErr.Error(), http.StatusInternalServerError) - return - } - header := w.Header() - header.Set(contentTypeHeader, string(contentType)) - header.Set(contentLengthHeader, fmt.Sprint(buf.Len())) - if encoding != "" { - header.Set(contentEncodingHeader, encoding) - } - if _, err := w.Write(buf.Bytes()); err != nil && opts.ErrorLog != nil { - opts.ErrorLog.Println("error while sending encoded metrics:", err) + + if lastErr != nil { + httpError(rsp, lastErr) } - // TODO(beorn7): Consider streaming serving of metrics. }) if opts.Timeout <= 0 { @@ -292,20 +282,30 @@ type HandlerOpts struct { Timeout time.Duration } -// decorateWriter wraps a writer to handle gzip compression if requested. It -// returns the decorated writer and the appropriate "Content-Encoding" header -// (which is empty if no compression is enabled). -func decorateWriter(request *http.Request, writer io.Writer, compressionDisabled bool) (io.Writer, string) { - if compressionDisabled { - return writer, "" - } - header := request.Header.Get(acceptEncodingHeader) - parts := strings.Split(header, ",") +// gzipAccepted returns whether the client will accept gzip-encoded content. +func gzipAccepted(header http.Header) bool { + a := header.Get(acceptEncodingHeader) + parts := strings.Split(a, ",") for _, part := range parts { part = strings.TrimSpace(part) if part == "gzip" || strings.HasPrefix(part, "gzip;") { - return gzip.NewWriter(writer), "gzip" + return true } } - return writer, "" + return false +} + +// httpError removes any content-encoding header and then calls http.Error with +// the provided error and http.StatusInternalServerErrer. 
Error contents is +// supposed to be uncompressed plain text. However, same as with a plain +// http.Error, any header settings will be void if the header has already been +// sent. The error message will still be written to the writer, but it will +// probably be of limited use. +func httpError(rsp http.ResponseWriter, err error) { + rsp.Header().Del(contentEncodingHeader) + http.Error( + rsp, + "An error has occurred while serving metrics:\n\n"+err.Error(), + http.StatusInternalServerError, + ) } diff --git a/vendor/github.com/prometheus/client_golang/prometheus/registry.go b/vendor/github.com/prometheus/client_golang/prometheus/registry.go index 896838f1cc0..e422ef3834c 100644 --- a/vendor/github.com/prometheus/client_golang/prometheus/registry.go +++ b/vendor/github.com/prometheus/client_golang/prometheus/registry.go @@ -16,7 +16,6 @@ package prometheus import ( "bytes" "fmt" - "os" "runtime" "sort" "strings" @@ -26,6 +25,8 @@ import ( "github.com/golang/protobuf/proto" dto "github.com/prometheus/client_model/go" + + "github.com/prometheus/client_golang/prometheus/internal" ) const ( @@ -52,7 +53,7 @@ var ( ) func init() { - MustRegister(NewProcessCollector(os.Getpid(), "")) + MustRegister(NewProcessCollector(ProcessCollectorOpts{})) MustRegister(NewGoCollector()) } @@ -106,9 +107,6 @@ type Registerer interface { // Collector, and for providing a Collector that will not cause // inconsistent metrics on collection. (This would lead to scrape // errors.) - // - // It is in general not safe to register the same Collector multiple - // times concurrently. Register(Collector) error // MustRegister works like Register but registers any number of // Collectors and panics upon the first registration that causes an @@ -272,7 +270,12 @@ func (r *Registry) Register(c Collector) error { close(descChan) }() r.mtx.Lock() - defer r.mtx.Unlock() + defer func() { + // Drain channel in case of premature return to not leak a goroutine. 
+ for range descChan { + } + r.mtx.Unlock() + }() // Conduct various tests... for desc := range descChan { @@ -527,7 +530,7 @@ func (r *Registry) Gather() ([]*dto.MetricFamily, error) { break } } - return normalizeMetricFamilies(metricFamiliesByName), errs.MaybeUnwrap() + return internal.NormalizeMetricFamilies(metricFamiliesByName), errs.MaybeUnwrap() } // processMetric is an internal helper method only used by the Gather method. @@ -538,6 +541,11 @@ func processMetric( registeredDescIDs map[uint64]struct{}, ) error { desc := metric.Desc() + // Wrapped metrics collected by an unchecked Collector can have an + // invalid Desc. + if desc.err != nil { + return desc.err + } dtoMetric := &dto.Metric{} if err := metric.Write(dtoMetric); err != nil { return fmt.Errorf("error collecting metric %v: %s", desc, err) @@ -707,72 +715,7 @@ func (gs Gatherers) Gather() ([]*dto.MetricFamily, error) { } } } - return normalizeMetricFamilies(metricFamiliesByName), errs.MaybeUnwrap() -} - -// metricSorter is a sortable slice of *dto.Metric. -type metricSorter []*dto.Metric - -func (s metricSorter) Len() int { - return len(s) -} - -func (s metricSorter) Swap(i, j int) { - s[i], s[j] = s[j], s[i] -} - -func (s metricSorter) Less(i, j int) bool { - if len(s[i].Label) != len(s[j].Label) { - // This should not happen. The metrics are - // inconsistent. However, we have to deal with the fact, as - // people might use custom collectors or metric family injection - // to create inconsistent metrics. So let's simply compare the - // number of labels in this case. That will still yield - // reproducible sorting. - return len(s[i].Label) < len(s[j].Label) - } - for n, lp := range s[i].Label { - vi := lp.GetValue() - vj := s[j].Label[n].GetValue() - if vi != vj { - return vi < vj - } - } - - // We should never arrive here. Multiple metrics with the same - // label set in the same scrape will lead to undefined ingestion - // behavior. 
However, as above, we have to provide stable sorting - // here, even for inconsistent metrics. So sort equal metrics - // by their timestamp, with missing timestamps (implying "now") - // coming last. - if s[i].TimestampMs == nil { - return false - } - if s[j].TimestampMs == nil { - return true - } - return s[i].GetTimestampMs() < s[j].GetTimestampMs() -} - -// normalizeMetricFamilies returns a MetricFamily slice with empty -// MetricFamilies pruned and the remaining MetricFamilies sorted by name within -// the slice, with the contained Metrics sorted within each MetricFamily. -func normalizeMetricFamilies(metricFamiliesByName map[string]*dto.MetricFamily) []*dto.MetricFamily { - for _, mf := range metricFamiliesByName { - sort.Sort(metricSorter(mf.Metric)) - } - names := make([]string, 0, len(metricFamiliesByName)) - for name, mf := range metricFamiliesByName { - if len(mf.Metric) > 0 { - names = append(names, name) - } - } - sort.Strings(names) - result := make([]*dto.MetricFamily, 0, len(names)) - for _, name := range names { - result = append(result, metricFamiliesByName[name]) - } - return result + return internal.NormalizeMetricFamilies(metricFamiliesByName), errs.MaybeUnwrap() } // checkSuffixCollisions checks for collisions with the “magic” suffixes the @@ -844,6 +787,8 @@ func checkMetricConsistency( dtoMetric *dto.Metric, metricHashes map[uint64]struct{}, ) error { + name := metricFamily.GetName() + // Type consistency with metric family. 
if metricFamily.GetType() == dto.MetricType_GAUGE && dtoMetric.Gauge == nil || metricFamily.GetType() == dto.MetricType_COUNTER && dtoMetric.Counter == nil || @@ -852,37 +797,46 @@ func checkMetricConsistency( metricFamily.GetType() == dto.MetricType_UNTYPED && dtoMetric.Untyped == nil { return fmt.Errorf( "collected metric %q { %s} is not a %s", - metricFamily.GetName(), dtoMetric, metricFamily.GetType(), + name, dtoMetric, metricFamily.GetType(), ) } + previousLabelName := "" for _, labelPair := range dtoMetric.GetLabel() { - if !checkLabelName(labelPair.GetName()) { + labelName := labelPair.GetName() + if labelName == previousLabelName { + return fmt.Errorf( + "collected metric %q { %s} has two or more labels with the same name: %s", + name, dtoMetric, labelName, + ) + } + if !checkLabelName(labelName) { return fmt.Errorf( "collected metric %q { %s} has a label with an invalid name: %s", - metricFamily.GetName(), dtoMetric, labelPair.GetName(), + name, dtoMetric, labelName, ) } - if dtoMetric.Summary != nil && labelPair.GetName() == quantileLabel { + if dtoMetric.Summary != nil && labelName == quantileLabel { return fmt.Errorf( "collected metric %q { %s} must not have an explicit %q label", - metricFamily.GetName(), dtoMetric, quantileLabel, + name, dtoMetric, quantileLabel, ) } if !utf8.ValidString(labelPair.GetValue()) { return fmt.Errorf( "collected metric %q { %s} has a label named %q whose value is not utf8: %#v", - metricFamily.GetName(), dtoMetric, labelPair.GetName(), labelPair.GetValue()) + name, dtoMetric, labelName, labelPair.GetValue()) } + previousLabelName = labelName } // Is the metric unique (i.e. no other metric with the same name and the same labels)? h := hashNew() - h = hashAdd(h, metricFamily.GetName()) + h = hashAdd(h, name) h = hashAddByte(h, separatorByte) // Make sure label pairs are sorted. We depend on it for the consistency // check. 
- sort.Sort(LabelPairSorter(dtoMetric.Label)) + sort.Sort(labelPairSorter(dtoMetric.Label)) for _, lp := range dtoMetric.Label { h = hashAdd(h, lp.GetName()) h = hashAddByte(h, separatorByte) @@ -892,7 +846,7 @@ func checkMetricConsistency( if _, exists := metricHashes[h]; exists { return fmt.Errorf( "collected metric %q { %s} was collected before with the same name and label values", - metricFamily.GetName(), dtoMetric, + name, dtoMetric, ) } metricHashes[h] = struct{}{} @@ -926,7 +880,7 @@ func checkDescConsistency( metricFamily.GetName(), dtoMetric, desc, ) } - sort.Sort(LabelPairSorter(lpsFromDesc)) + sort.Sort(labelPairSorter(lpsFromDesc)) for i, lpFromDesc := range lpsFromDesc { lpFromMetric := dtoMetric.Label[i] if lpFromDesc.GetName() != lpFromMetric.GetName() || diff --git a/vendor/github.com/prometheus/client_golang/prometheus/summary.go b/vendor/github.com/prometheus/client_golang/prometheus/summary.go index 83b403c5964..f7e92d82945 100644 --- a/vendor/github.com/prometheus/client_golang/prometheus/summary.go +++ b/vendor/github.com/prometheus/client_golang/prometheus/summary.go @@ -37,7 +37,7 @@ const quantileLabel = "quantile" // A typical use-case is the observation of request latencies. By default, a // Summary provides the median, the 90th and the 99th percentile of the latency // as rank estimations. However, the default behavior will change in the -// upcoming v0.10 of the library. There will be no rank estiamtions at all by +// upcoming v0.10 of the library. There will be no rank estimations at all by // default. For a sane transition, it is recommended to set the desired rank // estimations explicitly. // @@ -81,10 +81,10 @@ const ( ) // SummaryOpts bundles the options for creating a Summary metric. It is -// mandatory to set Name and Help to a non-empty string. 
While all other fields -// are optional and can safely be left at their zero value, it is recommended to -// explicitly set the Objectives field to the desired value as the default value -// will change in the upcoming v0.10 of the library. +// mandatory to set Name to a non-empty string. While all other fields are +// optional and can safely be left at their zero value, it is recommended to set +// a help string and to explicitly set the Objectives field to the desired value +// as the default value will change in the upcoming v0.10 of the library. type SummaryOpts struct { // Namespace, Subsystem, and Name are components of the fully-qualified // name of the Summary (created by joining these components with @@ -95,7 +95,7 @@ type SummaryOpts struct { Subsystem string Name string - // Help provides information about this Summary. Mandatory! + // Help provides information about this Summary. // // Metrics with the same fully-qualified name must have the same Help // string. @@ -586,7 +586,7 @@ func (s *constSummary) Write(out *dto.Metric) error { // map[float64]float64{0.5: 0.23, 0.99: 0.56} // // NewConstSummary returns an error if the length of labelValues is not -// consistent with the variable labels in Desc. +// consistent with the variable labels in Desc or if Desc is invalid. 
func NewConstSummary( desc *Desc, count uint64, @@ -594,6 +594,9 @@ func NewConstSummary( quantiles map[float64]float64, labelValues ...string, ) (Metric, error) { + if desc.err != nil { + return nil, desc.err + } if err := validateLabelValues(labelValues, len(desc.variableLabels)); err != nil { return nil, err } diff --git a/vendor/github.com/prometheus/client_golang/prometheus/value.go b/vendor/github.com/prometheus/client_golang/prometheus/value.go index 9fb7eab0619..eb248f10874 100644 --- a/vendor/github.com/prometheus/client_golang/prometheus/value.go +++ b/vendor/github.com/prometheus/client_golang/prometheus/value.go @@ -17,9 +17,9 @@ import ( "fmt" "sort" - dto "github.com/prometheus/client_model/go" - "github.com/golang/protobuf/proto" + + dto "github.com/prometheus/client_model/go" ) // ValueType is an enumeration of metric types that represent a simple value. @@ -77,8 +77,12 @@ func (v *valueFunc) Write(out *dto.Metric) error { // operations. However, when implementing custom Collectors, it is useful as a // throw-away metric that is generated on the fly to send it to Prometheus in // the Collect method. NewConstMetric returns an error if the length of -// labelValues is not consistent with the variable labels in Desc. +// labelValues is not consistent with the variable labels in Desc or if Desc is +// invalid. func NewConstMetric(desc *Desc, valueType ValueType, value float64, labelValues ...string) (Metric, error) { + if desc.err != nil { + return nil, desc.err + } if err := validateLabelValues(labelValues, len(desc.variableLabels)); err != nil { return nil, err } @@ -153,6 +157,6 @@ func makeLabelPairs(desc *Desc, labelValues []string) []*dto.LabelPair { }) } labelPairs = append(labelPairs, desc.constLabelPairs...) 
- sort.Sort(LabelPairSorter(labelPairs)) + sort.Sort(labelPairSorter(labelPairs)) return labelPairs } diff --git a/vendor/github.com/prometheus/client_golang/prometheus/wrap.go b/vendor/github.com/prometheus/client_golang/prometheus/wrap.go new file mode 100644 index 00000000000..49159bf3eb0 --- /dev/null +++ b/vendor/github.com/prometheus/client_golang/prometheus/wrap.go @@ -0,0 +1,179 @@ +// Copyright 2018 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package prometheus + +import ( + "fmt" + "sort" + + "github.com/golang/protobuf/proto" + + dto "github.com/prometheus/client_model/go" +) + +// WrapRegistererWith returns a Registerer wrapping the provided +// Registerer. Collectors registered with the returned Registerer will be +// registered with the wrapped Registerer in a modified way. The modified +// Collector adds the provided Labels to all Metrics it collects (as +// ConstLabels). The Metrics collected by the unmodified Collector must not +// duplicate any of those labels. +// +// WrapRegistererWith provides a way to add fixed labels to a subset of +// Collectors. It should not be used to add fixed labels to all metrics exposed. +// +// The Collector example demonstrates a use of WrapRegistererWith. 
+func WrapRegistererWith(labels Labels, reg Registerer) Registerer { + return &wrappingRegisterer{ + wrappedRegisterer: reg, + labels: labels, + } +} + +// WrapRegistererWithPrefix returns a Registerer wrapping the provided +// Registerer. Collectors registered with the returned Registerer will be +// registered with the wrapped Registerer in a modified way. The modified +// Collector adds the provided prefix to the name of all Metrics it collects. +// +// WrapRegistererWithPrefix is useful to have one place to prefix all metrics of +// a sub-system. To make this work, register metrics of the sub-system with the +// wrapping Registerer returned by WrapRegistererWithPrefix. It is rarely useful +// to use the same prefix for all metrics exposed. In particular, do not prefix +// metric names that are standardized across applications, as that would break +// horizontal monitoring, for example the metrics provided by the Go collector +// (see NewGoCollector) and the process collector (see NewProcessCollector). (In +// fact, those metrics are already prefixed with “go_” or “process_”, +// respectively.) 
+func WrapRegistererWithPrefix(prefix string, reg Registerer) Registerer { + return &wrappingRegisterer{ + wrappedRegisterer: reg, + prefix: prefix, + } +} + +type wrappingRegisterer struct { + wrappedRegisterer Registerer + prefix string + labels Labels +} + +func (r *wrappingRegisterer) Register(c Collector) error { + return r.wrappedRegisterer.Register(&wrappingCollector{ + wrappedCollector: c, + prefix: r.prefix, + labels: r.labels, + }) +} + +func (r *wrappingRegisterer) MustRegister(cs ...Collector) { + for _, c := range cs { + if err := r.Register(c); err != nil { + panic(err) + } + } +} + +func (r *wrappingRegisterer) Unregister(c Collector) bool { + return r.wrappedRegisterer.Unregister(&wrappingCollector{ + wrappedCollector: c, + prefix: r.prefix, + labels: r.labels, + }) +} + +type wrappingCollector struct { + wrappedCollector Collector + prefix string + labels Labels +} + +func (c *wrappingCollector) Collect(ch chan<- Metric) { + wrappedCh := make(chan Metric) + go func() { + c.wrappedCollector.Collect(wrappedCh) + close(wrappedCh) + }() + for m := range wrappedCh { + ch <- &wrappingMetric{ + wrappedMetric: m, + prefix: c.prefix, + labels: c.labels, + } + } +} + +func (c *wrappingCollector) Describe(ch chan<- *Desc) { + wrappedCh := make(chan *Desc) + go func() { + c.wrappedCollector.Describe(wrappedCh) + close(wrappedCh) + }() + for desc := range wrappedCh { + ch <- wrapDesc(desc, c.prefix, c.labels) + } +} + +type wrappingMetric struct { + wrappedMetric Metric + prefix string + labels Labels +} + +func (m *wrappingMetric) Desc() *Desc { + return wrapDesc(m.wrappedMetric.Desc(), m.prefix, m.labels) +} + +func (m *wrappingMetric) Write(out *dto.Metric) error { + if err := m.wrappedMetric.Write(out); err != nil { + return err + } + if len(m.labels) == 0 { + // No wrapping labels. 
+ return nil + } + for ln, lv := range m.labels { + out.Label = append(out.Label, &dto.LabelPair{ + Name: proto.String(ln), + Value: proto.String(lv), + }) + } + sort.Sort(labelPairSorter(out.Label)) + return nil +} + +func wrapDesc(desc *Desc, prefix string, labels Labels) *Desc { + constLabels := Labels{} + for _, lp := range desc.constLabelPairs { + constLabels[*lp.Name] = *lp.Value + } + for ln, lv := range labels { + if _, alreadyUsed := constLabels[ln]; alreadyUsed { + return &Desc{ + fqName: desc.fqName, + help: desc.help, + variableLabels: desc.variableLabels, + constLabelPairs: desc.constLabelPairs, + err: fmt.Errorf("attempted wrapping with already existing label name %q", ln), + } + } + constLabels[ln] = lv + } + // NewDesc will do remaining validations. + newDesc := NewDesc(prefix+desc.fqName, desc.help, desc.variableLabels, constLabels) + // Propagate errors if there was any. This will override any errer + // created by NewDesc above, i.e. earlier errors get precedence. + if desc.err != nil { + newDesc.err = desc.err + } + return newDesc +} diff --git a/vendor/github.com/prometheus/tsdb/.gitignore b/vendor/github.com/prometheus/tsdb/.gitignore new file mode 100644 index 00000000000..5530d1f6fa9 --- /dev/null +++ b/vendor/github.com/prometheus/tsdb/.gitignore @@ -0,0 +1 @@ +benchout/ diff --git a/vendor/github.com/prometheus/tsdb/.travis.yml b/vendor/github.com/prometheus/tsdb/.travis.yml index d62e488fce0..f659ab70ad3 100644 --- a/vendor/github.com/prometheus/tsdb/.travis.yml +++ b/vendor/github.com/prometheus/tsdb/.travis.yml @@ -1,13 +1,18 @@ -sudo: false - +# sudo is enabled because it provides more memory which was needed to run go test -race +sudo: required +dist: trusty language: go go: -- 1.9.x + - 1.9.x + - 1.10.x go_import_path: github.com/prometheus/tsdb -script: -- go test -timeout 5m ./... - +install: + - go get -v -t ./... 
+script: + # `check_license` target is omitted due to some missing license headers + # `staticcheck` target is omitted due to linting errors + - make style unused test diff --git a/vendor/github.com/prometheus/tsdb/MAINTAINERS.md b/vendor/github.com/prometheus/tsdb/MAINTAINERS.md new file mode 100644 index 00000000000..dcb57a80dfd --- /dev/null +++ b/vendor/github.com/prometheus/tsdb/MAINTAINERS.md @@ -0,0 +1,4 @@ +Maintainers of this repository: + +* Krasi Georgiev @krasi-georgiev +* Goutham Veeramachaneni @gouthamve \ No newline at end of file diff --git a/vendor/github.com/prometheus/tsdb/Makefile b/vendor/github.com/prometheus/tsdb/Makefile new file mode 100644 index 00000000000..eff204cd2db --- /dev/null +++ b/vendor/github.com/prometheus/tsdb/Makefile @@ -0,0 +1,34 @@ +# Copyright 2018 The Prometheus Authors +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +TSDB_PROJECT_DIR = "." 
+TSDB_CLI_DIR="$(TSDB_PROJECT_DIR)/cmd/tsdb" +TSDB_BIN = "$(TSDB_CLI_DIR)/tsdb" +TSDB_BENCHMARK_NUM_METRICS ?= 1000 +TSDB_BENCHMARK_DATASET ?= "$(TSDB_PROJECT_DIR)/testdata/20kseries.json" +TSDB_BENCHMARK_OUTPUT_DIR ?= "$(TSDB_CLI_DIR)/benchout" + +STATICCHECK_IGNORE = +include Makefile.common + +build: + @$(GO) build -o $(TSDB_BIN) $(TSDB_CLI_DIR) + +bench: build + @echo ">> running benchmark, writing result to $(TSDB_BENCHMARK_OUTPUT_DIR)" + @$(TSDB_BIN) bench write --metrics=$(TSDB_BENCHMARK_NUM_METRICS) --out=$(TSDB_BENCHMARK_OUTPUT_DIR) $(TSDB_BENCHMARK_DATASET) + @$(GO) tool pprof -svg $(TSDB_BIN) $(TSDB_BENCHMARK_OUTPUT_DIR)/cpu.prof > $(TSDB_BENCHMARK_OUTPUT_DIR)/cpuprof.svg + @$(GO) tool pprof --inuse_space -svg $(TSDB_BIN) $(TSDB_BENCHMARK_OUTPUT_DIR)/mem.prof > $(TSDB_BENCHMARK_OUTPUT_DIR)/memprof.inuse.svg + @$(GO) tool pprof --alloc_space -svg $(TSDB_BIN) $(TSDB_BENCHMARK_OUTPUT_DIR)/mem.prof > $(TSDB_BENCHMARK_OUTPUT_DIR)/memprof.alloc.svg + @$(GO) tool pprof -svg $(TSDB_BIN) $(TSDB_BENCHMARK_OUTPUT_DIR)/block.prof > $(TSDB_BENCHMARK_OUTPUT_DIR)/blockprof.svg + @$(GO) tool pprof -svg $(TSDB_BIN) $(TSDB_BENCHMARK_OUTPUT_DIR)/mutex.prof > $(TSDB_BENCHMARK_OUTPUT_DIR)/mutexprof.svg diff --git a/vendor/github.com/prometheus/tsdb/Makefile.common b/vendor/github.com/prometheus/tsdb/Makefile.common new file mode 100644 index 00000000000..c9d832373be --- /dev/null +++ b/vendor/github.com/prometheus/tsdb/Makefile.common @@ -0,0 +1,132 @@ +# Copyright 2018 The Prometheus Authors +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +# A common Makefile that includes rules to be reused in different prometheus projects. +# !!! Open PRs only against the prometheus/prometheus/Makefile.common repository! + +# Example usage : +# Create the main Makefile in the root project directory. +# include Makefile.common +# customTarget: +# @echo ">> Running customTarget" +# + +# Ensure GOBIN is not set during build so that promu is installed to the correct path +unexport GOBIN + +GO ?= go +GOFMT ?= $(GO)fmt +FIRST_GOPATH := $(firstword $(subst :, ,$(shell $(GO) env GOPATH))) +PROMU := $(FIRST_GOPATH)/bin/promu +STATICCHECK := $(FIRST_GOPATH)/bin/staticcheck +GOVENDOR := $(FIRST_GOPATH)/bin/govendor +pkgs = ./... + +PREFIX ?= $(shell pwd) +BIN_DIR ?= $(shell pwd) +DOCKER_IMAGE_TAG ?= $(subst /,-,$(shell git rev-parse --abbrev-ref HEAD)) +DOCKER_REPO ?= prom + +.PHONY: all +all: style staticcheck unused build test + +# This rule is used to forward a target like "build" to "common-build". This +# allows a new "build" target to be defined in a Makefile which includes this +# one and override "common-build" without override warnings. +%: common-% ; + +.PHONY: common-style +common-style: + @echo ">> checking code style" + @fmtRes=$$($(GOFMT) -d $$(find . -path ./vendor -prune -o -name '*.go' -print)); \ + if [ -n "$${fmtRes}" ]; then \ + echo "gofmt checking failed!"; echo "$${fmtRes}"; echo; \ + echo "Please ensure you are using $$($(GO) version) for formatting code."; \ + exit 1; \ + fi + +.PHONY: common-check_license +common-check_license: + @echo ">> checking license header" + @licRes=$$(for file in $$(find . -type f -iname '*.go' ! 
-path './vendor/*') ; do \ + awk 'NR<=3' $$file | grep -Eq "(Copyright|generated|GENERATED)" || echo $$file; \ + done); \ + if [ -n "$${licRes}" ]; then \ + echo "license header checking failed:"; echo "$${licRes}"; \ + exit 1; \ + fi + +.PHONY: common-test-short +common-test-short: + @echo ">> running short tests" + $(GO) test -short $(pkgs) + +.PHONY: common-test +common-test: + @echo ">> running all tests" + $(GO) test -race $(pkgs) + +.PHONY: common-format +common-format: + @echo ">> formatting code" + $(GO) fmt $(pkgs) + +.PHONY: common-vet +common-vet: + @echo ">> vetting code" + $(GO) vet $(pkgs) + +.PHONY: common-staticcheck +common-staticcheck: $(STATICCHECK) + @echo ">> running staticcheck" + $(STATICCHECK) -ignore "$(STATICCHECK_IGNORE)" $(pkgs) + +.PHONY: common-unused +common-unused: $(GOVENDOR) + @echo ">> running check for unused packages" + @$(GOVENDOR) list +unused | grep . && exit 1 || echo 'No unused packages' + +.PHONY: common-build +common-build: promu + @echo ">> building binaries" + $(PROMU) build --prefix $(PREFIX) + +.PHONY: common-tarball +common-tarball: promu + @echo ">> building release tarball" + $(PROMU) tarball --prefix $(PREFIX) $(BIN_DIR) + +.PHONY: common-docker +common-docker: + docker build -t "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_TAG)" . 
+ +.PHONY: common-docker-publish +common-docker-publish: + docker push "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME)" + +.PHONY: common-docker-tag-latest +common-docker-tag-latest: + docker tag "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_TAG)" "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME):latest" + +.PHONY: promu +promu: + GOOS= GOARCH= $(GO) get -u github.com/prometheus/promu + +.PHONY: $(STATICCHECK) +$(STATICCHECK): + GOOS= GOARCH= $(GO) get -u honnef.co/go/tools/cmd/staticcheck + +.PHONY: $(GOVENDOR) +$(GOVENDOR): + GOOS= GOARCH= $(GO) get -u github.com/kardianos/govendor diff --git a/vendor/github.com/prometheus/tsdb/README.md b/vendor/github.com/prometheus/tsdb/README.md index 51e6e68b6e5..4393d511a55 100644 --- a/vendor/github.com/prometheus/tsdb/README.md +++ b/vendor/github.com/prometheus/tsdb/README.md @@ -1,5 +1,14 @@ -# TSDB +# TSDB [![Build Status](https://travis-ci.org/prometheus/tsdb.svg?branch=master)](https://travis-ci.org/prometheus/tsdb) -This repository contains the new Prometheus storage layer that will be used in its 2.0 release. +[![GoDoc](https://godoc.org/github.com/prometheus/tsdb?status.svg)](https://godoc.org/github.com/prometheus/tsdb) +[![Go Report Card](https://goreportcard.com/badge/github.com/prometheus/tsdb)](https://goreportcard.com/report/github.com/prometheus/tsdb) + +This repository contains the Prometheus storage layer that is used in its 2.x releases. A writeup of its design can be found [here](https://fabxc.org/blog/2017-04-10-writing-a-tsdb/). + +Based on the Gorilla TSDB [white papers](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf). + +Video: [Storing 16 Bytes at Scale](https://youtu.be/b_pEevMAC3I) from [PromCon 2017](https://promcon.io/2017-munich/). + +See also the [format documentation](docs/format/README.md). 
diff --git a/vendor/github.com/prometheus/tsdb/block.go b/vendor/github.com/prometheus/tsdb/block.go index 7dc3af9d802..981c69eb44a 100644 --- a/vendor/github.com/prometheus/tsdb/block.go +++ b/vendor/github.com/prometheus/tsdb/block.go @@ -15,6 +15,7 @@ package tsdb import ( + "encoding/binary" "encoding/json" "io/ioutil" "os" @@ -164,6 +165,13 @@ type BlockStats struct { NumTombstones uint64 `json:"numTombstones,omitempty"` } +// BlockDesc describes a block by ULID and time range. +type BlockDesc struct { + ULID ulid.ULID `json:"ulid"` + MinTime int64 `json:"minTime"` + MaxTime int64 `json:"maxTime"` +} + // BlockMetaCompaction holds information about compactions a block went through. type BlockMetaCompaction struct { // Maximum number of compaction cycles any source block has @@ -171,19 +179,16 @@ type BlockMetaCompaction struct { Level int `json:"level"` // ULIDs of all source head blocks that went into the block. Sources []ulid.ULID `json:"sources,omitempty"` + // Short descriptions of the direct blocks that were used to create + // this block. + Parents []BlockDesc `json:"parents,omitempty"` Failed bool `json:"failed,omitempty"` } -const ( - flagNone = 0 - flagStd = 1 -) - const indexFilename = "index" const metaFilename = "meta.json" func chunkDir(dir string) string { return filepath.Join(dir, "chunks") } -func walDir(dir string) string { return filepath.Join(dir, "wal") } func readMetaFile(dir string) (*BlockMeta, error) { b, err := ioutil.ReadFile(filepath.Join(dir, metaFilename)) @@ -238,6 +243,10 @@ type Block struct { dir string meta BlockMeta + // Symbol Table Size in bytes. + // We maintain this variable to avoid recalculation everytime. + symbolTableSize uint64 + chunkr ChunkReader indexr IndexReader tombstones TombstoneReader @@ -265,12 +274,23 @@ func OpenBlock(dir string, pool chunkenc.Pool) (*Block, error) { return nil, err } + // Calculating symbol table size. 
+ tmp := make([]byte, 8) + symTblSize := uint64(0) + for _, v := range ir.SymbolTable() { + // Size of varint length of the symbol. + symTblSize += uint64(binary.PutUvarint(tmp, uint64(len(v)))) + // Size of the symbol. + symTblSize += uint64(len(v)) + } + pb := &Block{ - dir: dir, - meta: *meta, - chunkr: cr, - indexr: ir, - tombstones: tr, + dir: dir, + meta: *meta, + chunkr: cr, + indexr: ir, + tombstones: tr, + symbolTableSize: symTblSize, } return pb, nil } @@ -340,6 +360,11 @@ func (pb *Block) Tombstones() (TombstoneReader, error) { return blockTombstoneReader{TombstoneReader: pb.tombstones, b: pb}, nil } +// GetSymbolTableSize returns the Symbol Table Size in the index of this block. +func (pb *Block) GetSymbolTableSize() uint64 { + return pb.symbolTableSize +} + func (pb *Block) setCompactionFailed() error { pb.meta.Compaction.Failed = true return writeMetaFile(pb.dir, &pb.meta) @@ -424,7 +449,7 @@ func (pb *Block) Delete(mint, maxt int64, ms ...labels.Matcher) error { ir := pb.indexr // Choose only valid postings which have chunks in the time-range. - stones := memTombstones{} + stones := NewMemTombstones() var lset labels.Labels var chks []chunks.Meta @@ -437,10 +462,10 @@ Outer: } for _, chk := range chks { - if intervalOverlap(mint, maxt, chk.MinTime, chk.MaxTime) { - // Delete only until the current vlaues and not beyond. + if chk.OverlapsClosedInterval(mint, maxt) { + // Delete only until the current values and not beyond. 
tmin, tmax := clampInterval(mint, maxt, chks[0].MinTime, chks[len(chks)-1].MaxTime) - stones[p.At()] = Intervals{{tmin, tmax}} + stones.addInterval(p.At(), Interval{tmin, tmax}) continue Outer } } @@ -452,8 +477,7 @@ Outer: err = pb.tombstones.Iter(func(id uint64, ivs Intervals) error { for _, iv := range ivs { - stones.add(id, iv) - pb.meta.Stats.NumTombstones++ + stones.addInterval(id, iv) } return nil }) @@ -461,6 +485,7 @@ Outer: return err } pb.tombstones = stones + pb.meta.Stats.NumTombstones = pb.tombstones.Total() if err := writeTombstoneFile(pb.dir, pb.tombstones); err != nil { return err @@ -468,28 +493,28 @@ Outer: return writeMetaFile(pb.dir, &pb.meta) } -// CleanTombstones will rewrite the block if there any tombstones to remove them -// and returns if there was a re-write. -func (pb *Block) CleanTombstones(dest string, c Compactor) (bool, error) { +// CleanTombstones will remove the tombstones and rewrite the block (only if there are any tombstones). +// If there was a rewrite, then it returns the ULID of the new block written, else nil. +func (pb *Block) CleanTombstones(dest string, c Compactor) (*ulid.ULID, error) { numStones := 0 - pb.tombstones.Iter(func(id uint64, ivs Intervals) error { - for _ = range ivs { - numStones++ - } - + if err := pb.tombstones.Iter(func(id uint64, ivs Intervals) error { + numStones += len(ivs) return nil - }) - + }); err != nil { + // This should never happen, as the iteration function only returns nil. + panic(err) + } if numStones == 0 { - return false, nil + return nil, nil } - if _, err := c.Write(dest, pb, pb.meta.MinTime, pb.meta.MaxTime); err != nil { - return false, err + meta := pb.Meta() + uid, err := c.Write(dest, pb, pb.meta.MinTime, pb.meta.MaxTime, &meta) + if err != nil { + return nil, err } - - return true, nil + return &uid, nil } // Snapshot creates snapshot of the block into dir. 
@@ -532,6 +557,13 @@ func (pb *Block) Snapshot(dir string) error { return nil } +// Returns true if the block overlaps [mint, maxt]. +func (pb *Block) OverlapsClosedInterval(mint, maxt int64) bool { + // The block itself is a half-open interval + // [pb.meta.MinTime, pb.meta.MaxTime). + return pb.meta.MinTime <= maxt && mint < pb.meta.MaxTime +} + func clampInterval(a, b, mint, maxt int64) (int64, int64) { if a < mint { a = mint diff --git a/vendor/github.com/prometheus/tsdb/checkpoint.go b/vendor/github.com/prometheus/tsdb/checkpoint.go new file mode 100644 index 00000000000..aa8170520c0 --- /dev/null +++ b/vendor/github.com/prometheus/tsdb/checkpoint.go @@ -0,0 +1,269 @@ +// Copyright 2018 The Prometheus Authors + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tsdb + +import ( + "fmt" + "io" + "io/ioutil" + "os" + "path/filepath" + "strconv" + "strings" + + "github.com/pkg/errors" + "github.com/prometheus/tsdb/fileutil" + "github.com/prometheus/tsdb/wal" +) + +// CheckpointStats returns stats about a created checkpoint. +type CheckpointStats struct { + DroppedSeries int + DroppedSamples int + DroppedTombstones int + TotalSeries int // Processed series including dropped ones. + TotalSamples int // Processed samples inlcuding dropped ones. + TotalTombstones int // Processed tombstones including dropped ones. +} + +// LastCheckpoint returns the directory name and index of the most recent checkpoint. 
+// If dir does not contain any checkpoints, ErrNotFound is returned. +func LastCheckpoint(dir string) (string, int, error) { + files, err := ioutil.ReadDir(dir) + if err != nil { + return "", 0, err + } + // Traverse list backwards since there may be multiple checkpoints left. + for i := len(files) - 1; i >= 0; i-- { + fi := files[i] + + if !strings.HasPrefix(fi.Name(), checkpointPrefix) { + continue + } + if !fi.IsDir() { + return "", 0, errors.Errorf("checkpoint %s is not a directory", fi.Name()) + } + idx, err := strconv.Atoi(fi.Name()[len(checkpointPrefix):]) + if err != nil { + continue + } + return fi.Name(), idx, nil + } + return "", 0, ErrNotFound +} + +// DeleteCheckpoints deletes all checkpoints in a directory below a given index. +func DeleteCheckpoints(dir string, maxIndex int) error { + var errs MultiError + + files, err := ioutil.ReadDir(dir) + if err != nil { + return err + } + for _, fi := range files { + if !strings.HasPrefix(fi.Name(), checkpointPrefix) { + continue + } + index, err := strconv.Atoi(fi.Name()[len(checkpointPrefix):]) + if err != nil || index >= maxIndex { + continue + } + if err := os.RemoveAll(filepath.Join(dir, fi.Name())); err != nil { + errs.Add(err) + } + } + return errs.Err() +} + +const checkpointPrefix = "checkpoint." + +// Checkpoint creates a compacted checkpoint of segments in range [first, last] in the given WAL. +// It includes the most recent checkpoint if it exists. +// All series not satisfying keep and samples below mint are dropped. +// +// The checkpoint is stored in a directory named checkpoint.N in the same +// segmented format as the original WAL itself. +// This makes it easy to read it through the WAL package and concatenate +// it with the original WAL. 
+func Checkpoint(w *wal.WAL, from, to int, keep func(id uint64) bool, mint int64) (*CheckpointStats, error) { + stats := &CheckpointStats{} + + var sr io.Reader + // We close everything explicitly because Windows needs files to be + // closed before being deleted. But we also have defer so that we close + // files if there is an error somewhere. + var closers []io.Closer + { + dir, idx, err := LastCheckpoint(w.Dir()) + if err != nil && err != ErrNotFound { + return nil, errors.Wrap(err, "find last checkpoint") + } + last := idx + 1 + if err == nil { + if from > last { + return nil, fmt.Errorf("unexpected gap to last checkpoint. expected:%v, requested:%v", last, from) + } + // Ignore WAL files below the checkpoint. They shouldn't exist to begin with. + from = last + + r, err := wal.NewSegmentsReader(filepath.Join(w.Dir(), dir)) + if err != nil { + return nil, errors.Wrap(err, "open last checkpoint") + } + defer r.Close() + closers = append(closers, r) + sr = r + } + + segsr, err := wal.NewSegmentsRangeReader(w.Dir(), from, to) + if err != nil { + return nil, errors.Wrap(err, "create segment reader") + } + defer segsr.Close() + closers = append(closers, segsr) + + if sr != nil { + sr = io.MultiReader(sr, segsr) + } else { + sr = segsr + } + } + + cpdir := filepath.Join(w.Dir(), fmt.Sprintf("checkpoint.%06d", to)) + cpdirtmp := cpdir + ".tmp" + + if err := os.MkdirAll(cpdirtmp, 0777); err != nil { + return nil, errors.Wrap(err, "create checkpoint dir") + } + cp, err := wal.New(nil, nil, cpdirtmp) + if err != nil { + return nil, errors.Wrap(err, "open checkpoint") + } + + r := wal.NewReader(sr) + + var ( + series []RefSeries + samples []RefSample + tstones []Stone + dec RecordDecoder + enc RecordEncoder + buf []byte + recs [][]byte + ) + for r.Next() { + series, samples, tstones = series[:0], samples[:0], tstones[:0] + + // We don't reset the buffer since we batch up multiple records + // before writing them to the checkpoint. 
+ // Remember where the record for this iteration starts. + start := len(buf) + rec := r.Record() + + switch dec.Type(rec) { + case RecordSeries: + series, err = dec.Series(rec, series) + if err != nil { + return nil, errors.Wrap(err, "decode series") + } + // Drop irrelevant series in place. + repl := series[:0] + for _, s := range series { + if keep(s.Ref) { + repl = append(repl, s) + } + } + if len(repl) > 0 { + buf = enc.Series(repl, buf) + } + stats.TotalSeries += len(series) + stats.DroppedSeries += len(series) - len(repl) + + case RecordSamples: + samples, err = dec.Samples(rec, samples) + if err != nil { + return nil, errors.Wrap(err, "decode samples") + } + // Drop irrelevant samples in place. + repl := samples[:0] + for _, s := range samples { + if s.T >= mint { + repl = append(repl, s) + } + } + if len(repl) > 0 { + buf = enc.Samples(repl, buf) + } + stats.TotalSamples += len(samples) + stats.DroppedSamples += len(samples) - len(repl) + + case RecordTombstones: + tstones, err = dec.Tombstones(rec, tstones) + if err != nil { + return nil, errors.Wrap(err, "decode deletes") + } + // Drop irrelevant tombstones in place. + repl := tstones[:0] + for _, s := range tstones { + for _, iv := range s.intervals { + if iv.Maxt >= mint { + repl = append(repl, s) + break + } + } + } + if len(repl) > 0 { + buf = enc.Tombstones(repl, buf) + } + stats.TotalTombstones += len(tstones) + stats.DroppedTombstones += len(tstones) - len(repl) + + default: + return nil, errors.New("invalid record type") + } + if len(buf[start:]) == 0 { + continue // All contents discarded. + } + recs = append(recs, buf[start:]) + + // Flush records in 1 MB increments. + if len(buf) > 1*1024*1024 { + if err := cp.Log(recs...); err != nil { + return nil, errors.Wrap(err, "flush records") + } + buf, recs = buf[:0], recs[:0] + } + } + // If we hit any corruption during checkpointing, repairing is not an option. + // The head won't know which series records are lost. 
+ if r.Err() != nil { + return nil, errors.Wrap(r.Err(), "read segments") + } + + // Flush remaining records. + if err := cp.Log(recs...); err != nil { + return nil, errors.Wrap(err, "flush records") + } + if err := cp.Close(); err != nil { + return nil, errors.Wrap(err, "close checkpoint") + } + if err := fileutil.Replace(cpdirtmp, cpdir); err != nil { + return nil, errors.Wrap(err, "rename checkpoint directory") + } + if err := closeAll(closers...); err != nil { + return stats, errors.Wrap(err, "close opened files") + } + return stats, nil +} diff --git a/vendor/github.com/prometheus/tsdb/chunkenc/xor.go b/vendor/github.com/prometheus/tsdb/chunkenc/xor.go index ff690f2228d..e5e9c9a33e2 100644 --- a/vendor/github.com/prometheus/tsdb/chunkenc/xor.go +++ b/vendor/github.com/prometheus/tsdb/chunkenc/xor.go @@ -13,7 +13,7 @@ // The code in this file was largely written by Damian Gryski as part of // https://github.com/dgryski/go-tsz and published under the license below. -// It was modified to accomodate reading from byte slices without modifying +// It was modified to accommodate reading from byte slices without modifying // the underlying bytes, which would panic when reading from mmaped // read-only byte slices. @@ -259,7 +259,7 @@ func (it *xorIterator) Next() bool { it.err = err return false } - it.t = int64(t) + it.t = t it.val = math.Float64frombits(v) it.numRead++ diff --git a/vendor/github.com/prometheus/tsdb/chunks/chunks.go b/vendor/github.com/prometheus/tsdb/chunks/chunks.go index e32ea0f9987..5eab23982d5 100644 --- a/vendor/github.com/prometheus/tsdb/chunks/chunks.go +++ b/vendor/github.com/prometheus/tsdb/chunks/chunks.go @@ -57,6 +57,12 @@ func (cm *Meta) writeHash(h hash.Hash) error { return nil } +// Returns true if the chunk overlaps [mint, maxt]. +func (cm *Meta) OverlapsClosedInterval(mint, maxt int64) bool { + // The chunk itself is a closed interval [cm.MinTime, cm.MaxTime]. 
+ return cm.MinTime <= maxt && mint <= cm.MaxTime +} + var ( errInvalidSize = fmt.Errorf("invalid size") errInvalidFlag = fmt.Errorf("invalid flag") @@ -133,7 +139,7 @@ func (w *Writer) finalizeTail() error { return err } // As the file was pre-allocated, we truncate any superfluous zero bytes. - off, err := tf.Seek(0, os.SEEK_CUR) + off, err := tf.Seek(0, io.SeekCurrent) if err != nil { return err } @@ -296,7 +302,7 @@ func newReader(bs []ByteSlice, cs []io.Closer, pool chunkenc.Pool) (*Reader, err } // Verify magic number. if m := binary.BigEndian.Uint32(b.Range(0, 4)); m != MagicChunks { - return nil, fmt.Errorf("invalid magic number %x", m) + return nil, errors.Errorf("invalid magic number %x", m) } } return &cr, nil @@ -349,7 +355,7 @@ func (s *Reader) Chunk(ref uint64) (chunkenc.Chunk, error) { } b := s.bs[seq] - if int(off) >= b.Len() { + if off >= b.Len() { return nil, errors.Errorf("offset %d beyond data size %d", off, b.Len()) } // With the minimum chunk length this should never cause us reading @@ -357,8 +363,8 @@ func (s *Reader) Chunk(ref uint64) (chunkenc.Chunk, error) { r := b.Range(off, off+binary.MaxVarintLen32) l, n := binary.Uvarint(r) - if n < 0 { - return nil, fmt.Errorf("reading chunk length failed") + if n <= 0 { + return nil, errors.Errorf("reading chunk length failed with %d", n) } r = b.Range(off+n, off+n+int(l)) diff --git a/vendor/github.com/prometheus/tsdb/compact.go b/vendor/github.com/prometheus/tsdb/compact.go index 16a3bd74716..0439ca310f9 100644 --- a/vendor/github.com/prometheus/tsdb/compact.go +++ b/vendor/github.com/prometheus/tsdb/compact.go @@ -55,7 +55,7 @@ type Compactor interface { Plan(dir string) ([]string, error) // Write persists a Block into a directory. - Write(dest string, b BlockReader, mint, maxt int64) (ulid.ULID, error) + Write(dest string, b BlockReader, mint, maxt int64, parent *BlockMeta) (ulid.ULID, error) // Compact runs compaction against the provided directories. 
Must // only be called concurrently with results of Plan(). @@ -97,7 +97,7 @@ func newCompactorMetrics(r prometheus.Registerer) *compactorMetrics { Buckets: prometheus.ExponentialBuckets(1, 2, 10), }) m.chunkSize = prometheus.NewHistogram(prometheus.HistogramOpts{ - Name: "prometheus_tsdb_compaction_chunk_size", + Name: "prometheus_tsdb_compaction_chunk_size_bytes", Help: "Final size of chunks on their first compaction", Buckets: prometheus.ExponentialBuckets(32, 1.5, 12), }) @@ -107,7 +107,7 @@ func newCompactorMetrics(r prometheus.Registerer) *compactorMetrics { Buckets: prometheus.ExponentialBuckets(4, 1.5, 12), }) m.chunkRange = prometheus.NewHistogram(prometheus.HistogramOpts{ - Name: "prometheus_tsdb_compaction_chunk_range", + Name: "prometheus_tsdb_compaction_chunk_range_seconds", Help: "Final time range of chunks on their first compaction", Buckets: prometheus.ExponentialBuckets(100, 4, 10), }) @@ -297,6 +297,11 @@ func compactBlockMetas(uid ulid.ULID, blocks ...*BlockMeta) *BlockMeta { for _, s := range b.Compaction.Sources { sources[s] = struct{}{} } + res.Compaction.Parents = append(res.Compaction.Parents, BlockDesc{ + ULID: b.ULID, + MinTime: b.MinTime, + MaxTime: b.MaxTime, + }) } res.Compaction.Level++ @@ -367,7 +372,7 @@ func (c *LeveledCompactor) Compact(dest string, dirs ...string) (uid ulid.ULID, return uid, merr } -func (c *LeveledCompactor) Write(dest string, b BlockReader, mint, maxt int64) (ulid.ULID, error) { +func (c *LeveledCompactor) Write(dest string, b BlockReader, mint, maxt int64, parent *BlockMeta) (ulid.ULID, error) { entropy := rand.New(rand.NewSource(time.Now().UnixNano())) uid := ulid.MustNew(ulid.Now(), entropy) @@ -379,6 +384,12 @@ func (c *LeveledCompactor) Write(dest string, b BlockReader, mint, maxt int64) ( meta.Compaction.Level = 1 meta.Compaction.Sources = []ulid.ULID{uid} + if parent != nil { + meta.Compaction.Parents = []BlockDesc{ + {ULID: parent.ULID, MinTime: parent.MinTime, MaxTime: parent.MaxTime}, + } + } + err := 
c.write(dest, meta, b) if err != nil { return uid, err @@ -441,6 +452,7 @@ func (c *LeveledCompactor) write(dest string, meta *BlockMeta, blocks ...BlockRe if err != nil { return errors.Wrap(err, "open chunk writer") } + defer chunkw.Close() // Record written chunk sizes on level 1 compactions. if meta.Compaction.Level == 1 { chunkw = &instrumentedChunkWriter{ @@ -455,6 +467,7 @@ func (c *LeveledCompactor) write(dest string, meta *BlockMeta, blocks ...BlockRe if err != nil { return errors.Wrap(err, "open index writer") } + defer indexw.Close() if err := c.populateBlock(blocks, meta, indexw, chunkw); err != nil { return errors.Wrap(err, "write compaction") @@ -464,6 +477,10 @@ func (c *LeveledCompactor) write(dest string, meta *BlockMeta, blocks ...BlockRe return errors.Wrap(err, "write merged meta") } + // We are explicitly closing them here to check for error even + // though these are covered under defer. This is because in Windows, + // you cannot delete these unless they are closed and the defer is to + // make sure they are closed if the function exits due to an error above. if err = chunkw.Close(); err != nil { return errors.Wrap(err, "close chunk writer") } @@ -472,7 +489,7 @@ func (c *LeveledCompactor) write(dest string, meta *BlockMeta, blocks ...BlockRe } // Create an empty tombstones file. - if err := writeTombstoneFile(tmp, EmptyTombstoneReader()); err != nil { + if err := writeTombstoneFile(tmp, NewMemTombstones()); err != nil { return errors.Wrap(err, "write new tombstones file") } @@ -507,6 +524,10 @@ func (c *LeveledCompactor) write(dest string, meta *BlockMeta, blocks ...BlockRe // populateBlock fills the index and chunk writers with new data gathered as the union // of the provided blocks. It returns meta information for the new block. 
func (c *LeveledCompactor) populateBlock(blocks []BlockReader, meta *BlockMeta, indexw IndexWriter, chunkw ChunkWriter) error { + if len(blocks) == 0 { + return errors.New("cannot populate block from no readers") + } + var ( set ChunkSeriesSet allSymbols = make(map[string]struct{}, 1<<16) @@ -578,13 +599,17 @@ func (c *LeveledCompactor) populateBlock(blocks []BlockReader, meta *BlockMeta, continue } - if len(dranges) > 0 { - // Re-encode the chunk to not have deleted values. - for i, chk := range chks { - if !intervalOverlap(dranges[0].Mint, dranges[len(dranges)-1].Maxt, chk.MinTime, chk.MaxTime) { + for i, chk := range chks { + if chk.MinTime < meta.MinTime || chk.MaxTime > meta.MaxTime { + return errors.Errorf("found chunk with minTime: %d maxTime: %d outside of compacted minTime: %d maxTime: %d", + chk.MinTime, chk.MaxTime, meta.MinTime, meta.MaxTime) + } + + if len(dranges) > 0 { + // Re-encode the chunk to not have deleted values. + if !chk.OverlapsClosedInterval(dranges[0].Mint, dranges[len(dranges)-1].Maxt) { continue } - newChunk := chunkenc.NewXORChunk() app, err := newChunk.Appender() if err != nil { @@ -600,6 +625,7 @@ func (c *LeveledCompactor) populateBlock(blocks []BlockReader, meta *BlockMeta, chks[i].Chunk = newChunk } } + if err := chunkw.WriteChunks(chks...); err != nil { return errors.Wrap(err, "write chunks") } @@ -615,7 +641,9 @@ func (c *LeveledCompactor) populateBlock(blocks []BlockReader, meta *BlockMeta, } for _, chk := range chks { - c.chunkPool.Put(chk.Chunk) + if err := c.chunkPool.Put(chk.Chunk); err != nil { + return errors.Wrap(err, "put chunk") + } } for _, l := range lset { @@ -772,7 +800,6 @@ func (c *compactionMerger) Next() bool { var chks []chunks.Meta d := c.compare() - // Both sets contain the current series. Chain them into a single one. if d > 0 { lset, chks, c.intervals = c.b.At() c.l = append(c.l[:0], lset...) 
@@ -786,8 +813,10 @@ func (c *compactionMerger) Next() bool { c.aok = c.a.Next() } else { + // Both sets contain the current series. Chain them into a single one. l, ca, ra := c.a.At() _, cb, rb := c.b.At() + for _, r := range rb { ra = ra.add(r) } diff --git a/vendor/github.com/prometheus/tsdb/db.go b/vendor/github.com/prometheus/tsdb/db.go index 489f3a12e5c..cb02b48356a 100644 --- a/vendor/github.com/prometheus/tsdb/db.go +++ b/vendor/github.com/prometheus/tsdb/db.go @@ -19,26 +19,26 @@ import ( "fmt" "io" "io/ioutil" + "math" "os" "path/filepath" "runtime" "sort" "strconv" + "strings" "sync" "time" - "unsafe" - - "golang.org/x/sync/errgroup" "github.com/go-kit/kit/log" "github.com/go-kit/kit/log/level" - "github.com/nightlyone/lockfile" "github.com/oklog/ulid" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/tsdb/chunkenc" "github.com/prometheus/tsdb/fileutil" "github.com/prometheus/tsdb/labels" + "github.com/prometheus/tsdb/wal" + "golang.org/x/sync/errgroup" ) // DefaultOptions used for the DB. They are sane for setups using @@ -76,7 +76,7 @@ type Appender interface { // Returned reference numbers are ephemeral and may be rejected in calls // to AddFast() at any point. Adding the sample via Add() returns a new // reference number. - // If the reference is the empty string it must not be used for caching. + // If the reference is 0 it must not be used for caching. Add(l labels.Labels, t int64, v float64) (uint64, error) // Add adds a sample pair for the referenced series. It is generally faster @@ -94,7 +94,7 @@ type Appender interface { // a hashed partition of a seriedb. 
type DB struct { dir string - lockf *lockfile.Lockfile + lockf fileutil.Releaser logger log.Logger metrics *dbMetrics @@ -119,11 +119,13 @@ type DB struct { type dbMetrics struct { loadedBlocks prometheus.GaugeFunc + symbolTableSize prometheus.GaugeFunc reloads prometheus.Counter reloadsFailed prometheus.Counter compactionsTriggered prometheus.Counter cutoffs prometheus.Counter cutoffsFailed prometheus.Counter + startTime prometheus.GaugeFunc tombCleanTimer prometheus.Histogram } @@ -138,6 +140,19 @@ func newDBMetrics(db *DB, r prometheus.Registerer) *dbMetrics { defer db.mtx.RUnlock() return float64(len(db.blocks)) }) + m.symbolTableSize = prometheus.NewGaugeFunc(prometheus.GaugeOpts{ + Name: "prometheus_tsdb_symbol_table_size_bytes", + Help: "Size of symbol table on disk (in bytes)", + }, func() float64 { + db.mtx.RLock() + blocks := db.blocks[:] + db.mtx.RUnlock() + symTblSize := uint64(0) + for _, b := range blocks { + symTblSize += b.GetSymbolTableSize() + } + return float64(symTblSize) + }) m.reloads = prometheus.NewCounter(prometheus.CounterOpts{ Name: "prometheus_tsdb_reloads_total", Help: "Number of times the database reloaded block data from disk.", @@ -158,6 +173,17 @@ func newDBMetrics(db *DB, r prometheus.Registerer) *dbMetrics { Name: "prometheus_tsdb_retention_cutoffs_failures_total", Help: "Number of times the database failed to cut off block data from disk.", }) + m.startTime = prometheus.NewGaugeFunc(prometheus.GaugeOpts{ + Name: "prometheus_tsdb_lowest_timestamp", + Help: "Lowest timestamp value stored in the database.", + }, func() float64 { + db.mtx.RLock() + defer db.mtx.RUnlock() + if len(db.blocks) == 0 { + return float64(db.head.minTime) + } + return float64(db.blocks[0].meta.MinTime) + }) m.tombCleanTimer = prometheus.NewHistogram(prometheus.HistogramOpts{ Name: "prometheus_tsdb_tombstone_cleanup_seconds", Help: "The time taken to recompact blocks to remove tombstones.", @@ -166,11 +192,13 @@ func newDBMetrics(db *DB, r 
prometheus.Registerer) *dbMetrics { if r != nil { r.MustRegister( m.loadedBlocks, + m.symbolTableSize, m.reloads, m.reloadsFailed, m.cutoffs, m.cutoffsFailed, m.compactionsTriggered, + m.startTime, m.tombCleanTimer, ) } @@ -192,6 +220,10 @@ func Open(dir string, l log.Logger, r prometheus.Registerer, opts *Options) (db if err := repairBadIndexVersion(l, dir); err != nil { return nil, err } + // Migrate old WAL if one exists. + if err := MigrateWAL(l, filepath.Join(dir, "wal")); err != nil { + return nil, errors.Wrap(err, "migrate WAL") + } db = &DB{ dir: dir, @@ -210,14 +242,11 @@ func Open(dir string, l log.Logger, r prometheus.Registerer, opts *Options) (db if err != nil { return nil, err } - lockf, err := lockfile.New(filepath.Join(absdir, "lock")) + lockf, _, err := fileutil.Flock(filepath.Join(absdir, "lock")) if err != nil { - return nil, err + return nil, errors.Wrap(err, "lock DB directory") } - if err := lockf.TryLock(); err != nil { - return nil, errors.Wrapf(err, "open DB in %s", dir) - } - db.lockf = &lockf + db.lockf = lockf } db.compactor, err = NewLeveledCompactor(r, l, opts.BlockRanges, db.chunkPool) @@ -225,18 +254,18 @@ func Open(dir string, l log.Logger, r prometheus.Registerer, opts *Options) (db return nil, errors.Wrap(err, "create leveled compactor") } - wal, err := OpenSegmentWAL(filepath.Join(dir, "wal"), l, opts.WALFlushInterval, r) + wlog, err := wal.New(l, r, filepath.Join(dir, "wal")) if err != nil { return nil, err } - db.head, err = NewHead(r, l, wal, opts.BlockRanges[0]) + db.head, err = NewHead(r, l, wlog, opts.BlockRanges[0]) if err != nil { return nil, err } if err := db.reload(); err != nil { return nil, err } - if err := db.head.ReadWAL(); err != nil { + if err := db.head.Init(); err != nil { return nil, errors.Wrap(err, "read WAL") } @@ -271,17 +300,9 @@ func (db *DB) run() { case <-db.compactc: db.metrics.compactionsTriggered.Inc() - _, err1 := db.retentionCutoff() - if err1 != nil { - level.Error(db.logger).Log("msg", 
"retention cutoff failed", "err", err1) - } - - _, err2 := db.compact() - if err2 != nil { - level.Error(db.logger).Log("msg", "compaction failed", "err", err2) - } - - if err1 != nil || err2 != nil { + err := db.compact() + if err != nil { + level.Error(db.logger).Log("msg", "compaction failed", "err", err) backoff = exponential(backoff, 1*time.Second, 1*time.Minute) } else { backoff = 0 @@ -293,19 +314,9 @@ func (db *DB) run() { } } -func (db *DB) retentionCutoff() (b bool, err error) { - defer func() { - if !b && err == nil { - // no data had to be cut off. - return - } - db.metrics.cutoffs.Inc() - if err != nil { - db.metrics.cutoffsFailed.Inc() - } - }() +func (db *DB) beyondRetention(meta *BlockMeta) bool { if db.opts.RetentionDuration == 0 { - return false, nil + return false } db.mtx.RLock() @@ -313,23 +324,13 @@ func (db *DB) retentionCutoff() (b bool, err error) { db.mtx.RUnlock() if len(blocks) == 0 { - return false, nil + return false } last := blocks[len(db.blocks)-1] - mint := last.Meta().MaxTime - int64(db.opts.RetentionDuration) - dirs, err := retentionCutoffDirs(db.dir, mint) - if err != nil { - return false, err - } - - // This will close the dirs and then delete the dirs. - if len(dirs) > 0 { - return true, db.reload(dirs...) - } - return false, nil + return meta.MaxTime < mint } // Appender opens a new appender against the database. @@ -358,12 +359,19 @@ func (a dbAppender) Commit() error { return err } -func (db *DB) compact() (changes bool, err error) { +// Compact data if possible. After successful compaction blocks are reloaded +// which will also trigger blocks to be deleted that fall out of the retention +// window. +// If no blocks are compacted, the retention window state doesn't change. Thus, +// this is sufficient to reliably delete old data. +// Old blocks are only deleted on reload based on the new block's parent information. +// See DB.reload documentation for further information. 
+func (db *DB) compact() (err error) { db.cmtx.Lock() defer db.cmtx.Unlock() if !db.compactionsEnabled { - return false, nil + return nil } // Check whether we have pending head blocks that are ready to be persisted. @@ -371,7 +379,7 @@ func (db *DB) compact() (changes bool, err error) { for { select { case <-db.stopc: - return changes, nil + return nil default: } // The head has a compactable range if 1.5 level 0 ranges are between the oldest @@ -385,17 +393,22 @@ func (db *DB) compact() (changes bool, err error) { head := &rangeHead{ head: db.head, mint: mint, - maxt: maxt, + // We remove 1 millisecond from maxt because block + // intervals are half-open: [b.MinTime, b.MaxTime). But + // chunk intervals are closed: [c.MinTime, c.MaxTime]; + // so in order to make sure that overlaps are evaluated + // consistently, we explicitly remove the last value + // from the block interval here. + maxt: maxt - 1, } - if _, err = db.compactor.Write(db.dir, head, mint, maxt); err != nil { - return changes, errors.Wrap(err, "persist head block") + if _, err = db.compactor.Write(db.dir, head, mint, maxt, nil); err != nil { + return errors.Wrap(err, "persist head block") } - changes = true runtime.GC() if err := db.reload(); err != nil { - return changes, errors.Wrap(err, "reload blocks") + return errors.Wrap(err, "reload blocks") } runtime.GC() } @@ -404,7 +417,7 @@ func (db *DB) compact() (changes bool, err error) { for { plan, err := db.compactor.Plan(db.dir) if err != nil { - return changes, errors.Wrap(err, "plan compaction") + return errors.Wrap(err, "plan compaction") } if len(plan) == 0 { break @@ -412,56 +425,22 @@ func (db *DB) compact() (changes bool, err error) { select { case <-db.stopc: - return changes, nil + return nil default: } if _, err := db.compactor.Compact(db.dir, plan...); err != nil { - return changes, errors.Wrapf(err, "compact %s", plan) + return errors.Wrapf(err, "compact %s", plan) } - changes = true runtime.GC() - if err := db.reload(plan...); err != 
nil { - return changes, errors.Wrap(err, "reload blocks") + if err := db.reload(); err != nil { + return errors.Wrap(err, "reload blocks") } runtime.GC() } - return changes, nil -} - -// retentionCutoffDirs returns all directories of blocks in dir that are strictly -// before mint. -func retentionCutoffDirs(dir string, mint int64) ([]string, error) { - df, err := fileutil.OpenDir(dir) - if err != nil { - return nil, errors.Wrapf(err, "open directory") - } - defer df.Close() - - dirs, err := blockDirs(dir) - if err != nil { - return nil, errors.Wrapf(err, "list block dirs %s", dir) - } - - delDirs := []string{} - - for _, dir := range dirs { - meta, err := readMetaFile(dir) - if err != nil { - return nil, errors.Wrapf(err, "read block meta %s", dir) - } - // The first block we encounter marks that we crossed the boundary - // of deletable blocks. - if meta.MaxTime >= mint { - break - } - - delDirs = append(delDirs, dir) - } - - return delDirs, nil + return nil } func (db *DB) getBlock(id ulid.ULID) (*Block, bool) { @@ -473,18 +452,10 @@ func (db *DB) getBlock(id ulid.ULID) (*Block, bool) { return nil, false } -func stringsContain(set []string, elem string) bool { - for _, e := range set { - if elem == e { - return true - } - } - return false -} - // reload on-disk blocks and trigger head truncation if new blocks appeared. It takes // a list of block directories which should be deleted during reload. -func (db *DB) reload(deleteable ...string) (err error) { +// Blocks that are obsolete due to replacement or retention will be deleted. +func (db *DB) reload() (err error) { defer func() { if err != nil { db.metrics.reloadsFailed.Inc() @@ -496,21 +467,58 @@ func (db *DB) reload(deleteable ...string) (err error) { if err != nil { return errors.Wrap(err, "find blocks") } + // We delete old blocks that have been superseded by new ones by gathering all parents + // from existing blocks. 
Those parents all have newer replacements and can be safely deleted + // after we loaded the other blocks. + // This makes us resilient against the process crashing towards the end of a compaction. + // Creation of a new block and deletion of its parents cannot happen atomically. By creating + // blocks with their parents, we can pick up the deletion where it left off during a crash. var ( - blocks []*Block - exist = map[ulid.ULID]struct{}{} + blocks []*Block + corrupted = map[ulid.ULID]error{} + opened = map[ulid.ULID]struct{}{} + deleteable = map[ulid.ULID]struct{}{} ) + for _, dir := range dirs { + meta, err := readMetaFile(dir) + if err != nil { + // The block was potentially in the middle of being deleted during a crash. + // Skip it since we may delete it properly further down again. + level.Warn(db.logger).Log("msg", "read meta information", "err", err, "dir", dir) + ulid, err2 := ulid.Parse(filepath.Base(dir)) + if err2 != nil { + level.Error(db.logger).Log("msg", "not a block dir", "dir", dir) + continue + } + corrupted[ulid] = err + continue + } + if db.beyondRetention(meta) { + deleteable[meta.ULID] = struct{}{} + continue + } + for _, b := range meta.Compaction.Parents { + deleteable[b.ULID] = struct{}{} + } + } + // Blocks we failed to open should all be those we are want to delete anyway. + for c, err := range corrupted { + if _, ok := deleteable[c]; !ok { + return errors.Wrapf(err, "unexpected corrupted block %s", c) + } + } + // Load new blocks into memory. for _, dir := range dirs { meta, err := readMetaFile(dir) if err != nil { return errors.Wrapf(err, "read meta information %s", dir) } - // If the block is pending for deletion, don't add it to the new block set. - if stringsContain(deleteable, dir) { + // Don't load blocks that are scheduled for deletion. + if _, ok := deleteable[meta.ULID]; ok { continue } - + // See if we already have the block in memory or open it otherwise. 
b, ok := db.getBlock(meta.ULID) if !ok { b, err = OpenBlock(dir, db.chunkPool) @@ -518,11 +526,12 @@ func (db *DB) reload(deleteable ...string) (err error) { return errors.Wrapf(err, "open block %s", dir) } } - blocks = append(blocks, b) - exist[meta.ULID] = struct{}{} + opened[meta.ULID] = struct{}{} } - + sort.Slice(blocks, func(i, j int) bool { + return blocks[i].Meta().MinTime < blocks[j].Meta().MinTime + }) if err := validateBlockSequence(blocks); err != nil { return errors.Wrap(err, "invalid block sequence") } @@ -534,15 +543,19 @@ func (db *DB) reload(deleteable ...string) (err error) { db.blocks = blocks db.mtx.Unlock() + // Drop old blocks from memory. for _, b := range oldBlocks { - if _, ok := exist[b.Meta().ULID]; ok { + if _, ok := opened[b.Meta().ULID]; ok { continue } if err := b.Close(); err != nil { level.Warn(db.logger).Log("msg", "closing block failed", "err", err) } - if err := os.RemoveAll(b.Dir()); err != nil { - level.Warn(db.logger).Log("msg", "deleting block failed", "err", err) + } + // Delete all obsolete blocks. None of them are opened any longer. + for ulid := range deleteable { + if err := os.RemoveAll(filepath.Join(db.dir, ulid.String())); err != nil { + return errors.Wrapf(err, "delete obsolete block %s", ulid) } } @@ -556,22 +569,125 @@ func (db *DB) reload(deleteable ...string) (err error) { return errors.Wrap(db.head.Truncate(maxt), "head truncate failed") } +// validateBlockSequence returns error if given block meta files indicate that some blocks overlaps within sequence. 
func validateBlockSequence(bs []*Block) error { - if len(bs) == 0 { + if len(bs) <= 1 { return nil } - sort.Slice(bs, func(i, j int) bool { - return bs[i].Meta().MinTime < bs[j].Meta().MinTime - }) - prev := bs[0] - for _, b := range bs[1:] { - if b.Meta().MinTime < prev.Meta().MaxTime { - return errors.Errorf("block time ranges overlap (%d, %d)", b.Meta().MinTime, prev.Meta().MaxTime) - } + + var metas []BlockMeta + for _, b := range bs { + metas = append(metas, b.meta) + } + + overlaps := OverlappingBlocks(metas) + if len(overlaps) > 0 { + return errors.Errorf("block time ranges overlap: %s", overlaps) } + return nil } +// TimeRange specifies minTime and maxTime range. +type TimeRange struct { + Min, Max int64 +} + +// Overlaps contains overlapping blocks aggregated by overlapping range. +type Overlaps map[TimeRange][]BlockMeta + +// String returns human readable string form of overlapped blocks. +func (o Overlaps) String() string { + var res []string + for r, overlaps := range o { + var groups []string + for _, m := range overlaps { + groups = append(groups, fmt.Sprintf( + "", + m.ULID.String(), + m.MinTime, + m.MaxTime, + (time.Duration((m.MaxTime-m.MinTime)/1000)*time.Second).String(), + )) + } + res = append(res, fmt.Sprintf( + "[mint: %d, maxt: %d, range: %s, blocks: %d]: %s", + r.Min, r.Max, + (time.Duration((r.Max-r.Min)/1000)*time.Second).String(), + len(overlaps), + strings.Join(groups, ", ")), + ) + } + return strings.Join(res, "\n") +} + +// OverlappingBlocks returns all overlapping blocks from given meta files. +func OverlappingBlocks(bm []BlockMeta) Overlaps { + if len(bm) <= 1 { + return nil + } + var ( + overlaps [][]BlockMeta + + // pending contains not ended blocks in regards to "current" timestamp. + pending = []BlockMeta{bm[0]} + // continuousPending helps to aggregate same overlaps to single group. + continuousPending = true + ) + + // We have here blocks sorted by minTime. 
We iterate over each block and treat its minTime as our "current" timestamp. + // We check if any of the pending block finished (blocks that we have seen before, but their maxTime was still ahead current + // timestamp). If not, it means they overlap with our current block. In the same time current block is assumed pending. + for _, b := range bm[1:] { + var newPending []BlockMeta + + for _, p := range pending { + // "b.MinTime" is our current time. + if b.MinTime >= p.MaxTime { + continuousPending = false + continue + } + + // "p" overlaps with "b" and "p" is still pending. + newPending = append(newPending, p) + } + + // Our block "b" is now pending. + pending = append(newPending, b) + if len(newPending) == 0 { + // No overlaps. + continue + } + + if continuousPending && len(overlaps) > 0 { + overlaps[len(overlaps)-1] = append(overlaps[len(overlaps)-1], b) + continue + } + overlaps = append(overlaps, append(newPending, b)) + // Start new pendings. + continuousPending = true + } + + // Fetch the critical overlapped time range foreach overlap groups. 
+ overlapGroups := Overlaps{} + for _, overlap := range overlaps { + + minRange := TimeRange{Min: 0, Max: math.MaxInt64} + for _, b := range overlap { + if minRange.Max > b.MaxTime { + minRange.Max = b.MaxTime + } + + if minRange.Min < b.MinTime { + minRange.Min = b.MinTime + } + } + overlapGroups[minRange] = overlap + } + + return overlapGroups +} + func (db *DB) String() string { return "HEAD" } @@ -609,7 +725,7 @@ func (db *DB) Close() error { merr.Add(g.Wait()) if db.lockf != nil { - merr.Add(db.lockf.Unlock()) + merr.Add(db.lockf.Release()) } merr.Add(db.head.Close()) return merr.Err() @@ -659,7 +775,7 @@ func (db *DB) Snapshot(dir string, withHead bool) error { if !withHead { return nil } - _, err := db.compactor.Write(dir, db.head, db.head.MinTime(), db.head.MaxTime()) + _, err := db.compactor.Write(dir, db.head, db.head.MinTime(), db.head.MaxTime(), nil) return errors.Wrap(err, "snapshot head block") } @@ -672,8 +788,7 @@ func (db *DB) Querier(mint, maxt int64) (Querier, error) { defer db.mtx.RUnlock() for _, b := range db.blocks { - m := b.Meta() - if intervalOverlap(mint, maxt, m.MinTime, m.MaxTime) { + if b.OverlapsClosedInterval(mint, maxt) { blocks = append(blocks, b) } } @@ -715,8 +830,7 @@ func (db *DB) Delete(mint, maxt int64, ms ...labels.Matcher) error { defer db.mtx.RUnlock() for _, b := range db.blocks { - m := b.Meta() - if intervalOverlap(mint, maxt, m.MinTime, m.MaxTime) { + if b.OverlapsClosedInterval(mint, maxt) { g.Go(func(b *Block) func() error { return func() error { return b.Delete(mint, maxt, ms...) } }(b)) @@ -725,50 +839,43 @@ func (db *DB) Delete(mint, maxt int64, ms ...labels.Matcher) error { g.Go(func() error { return db.head.Delete(mint, maxt, ms...) }) - if err := g.Wait(); err != nil { - return err - } - return nil + return g.Wait() } // CleanTombstones re-writes any blocks with tombstones. 
-func (db *DB) CleanTombstones() error { +func (db *DB) CleanTombstones() (err error) { db.cmtx.Lock() defer db.cmtx.Unlock() start := time.Now() - defer db.metrics.tombCleanTimer.Observe(float64(time.Since(start).Seconds())) + defer db.metrics.tombCleanTimer.Observe(time.Since(start).Seconds()) + + newUIDs := []ulid.ULID{} + defer func() { + // If any error is caused, we need to delete all the new directory created. + if err != nil { + for _, uid := range newUIDs { + dir := filepath.Join(db.Dir(), uid.String()) + if err := os.RemoveAll(dir); err != nil { + level.Error(db.logger).Log("msg", "failed to delete block after failed `CleanTombstones`", "dir", dir, "err", err) + } + } + } + }() db.mtx.RLock() blocks := db.blocks[:] db.mtx.RUnlock() - deleted := []string{} for _, b := range blocks { - ok, err := b.CleanTombstones(db.Dir(), db.compactor) - if err != nil { - return errors.Wrapf(err, "clean tombstones: %s", b.Dir()) - } - - if ok { - deleted = append(deleted, b.Dir()) + if uid, er := b.CleanTombstones(db.Dir(), db.compactor); er != nil { + err = errors.Wrapf(er, "clean tombstones: %s", b.Dir()) + return err + } else if uid != nil { // New block was created. 
+ newUIDs = append(newUIDs, *uid) } } - - if len(deleted) == 0 { - return nil - } - - return errors.Wrap(db.reload(deleted...), "reload blocks") -} - -func intervalOverlap(amin, amax, bmin, bmax int64) bool { - // Checks Overlap: http://stackoverflow.com/questions/3269434/ - return amin <= bmax && bmin <= amax -} - -func intervalContains(min, max, t int64) bool { - return t >= min && t <= max + return errors.Wrap(db.reload(), "reload blocks") } func isBlockDir(fi os.FileInfo) bool { @@ -869,9 +976,6 @@ func (es MultiError) Err() error { return es } -func yoloString(b []byte) string { return *((*string)(unsafe.Pointer(&b))) } -func yoloBytes(s string) []byte { return *((*[]byte)(unsafe.Pointer(&s))) } - func closeAll(cs ...io.Closer) error { var merr MultiError diff --git a/vendor/github.com/prometheus/tsdb/encoding_helpers.go b/vendor/github.com/prometheus/tsdb/encoding_helpers.go index d0805fe07d9..ffb58b5c8b7 100644 --- a/vendor/github.com/prometheus/tsdb/encoding_helpers.go +++ b/vendor/github.com/prometheus/tsdb/encoding_helpers.go @@ -11,7 +11,7 @@ import ( var errInvalidSize = errors.New("invalid size") -// enbuf is a helper type to populate a byte slice with various types. +// encbuf is a helper type to populate a byte slice with various types. type encbuf struct { b []byte c [binary.MaxVarintLen64]byte diff --git a/vendor/github.com/prometheus/tsdb/fileutil/fileutil.go b/vendor/github.com/prometheus/tsdb/fileutil/fileutil.go index c2c25842ad6..15403c8b3c4 100644 --- a/vendor/github.com/prometheus/tsdb/fileutil/fileutil.go +++ b/vendor/github.com/prometheus/tsdb/fileutil/fileutil.go @@ -5,10 +5,80 @@ package fileutil import ( + "io/ioutil" "os" + "path/filepath" "sort" + "strings" ) +// CopyDirs copies all directories, subdirectories and files recursively including the empty folders. +// Source and destination must be full paths. 
+func CopyDirs(src, dest string) error { + if err := os.MkdirAll(dest, 0777); err != nil { + return err + } + files, err := readDirs(src) + if err != nil { + return err + } + + for _, f := range files { + dp := filepath.Join(dest, f) + sp := filepath.Join(src, f) + + stat, err := os.Stat(sp) + if err != nil { + return err + } + + // Empty directories are also created. + if stat.IsDir() { + if err := os.MkdirAll(dp, 0777); err != nil { + return err + } + continue + } + + if err := copyFile(sp, dp); err != nil { + return err + } + } + return nil +} + +func copyFile(src, dest string) error { + data, err := ioutil.ReadFile(src) + if err != nil { + return err + } + + err = ioutil.WriteFile(dest, data, 0644) + if err != nil { + return err + } + return nil +} + +// readDirs reads the source directory recursively and +// returns relative paths to all files and empty directories. +func readDirs(src string) ([]string, error) { + var files []string + var err error + + err = filepath.Walk(src, func(path string, f os.FileInfo, err error) error { + relativePath := strings.TrimPrefix(path, src) + if len(relativePath) > 0 { + files = append(files, relativePath) + } + return nil + }) + if err != nil { + return nil, err + } + return files, nil +} + // ReadDir returns the filenames in the given directory in sorted order. func ReadDir(dirpath string) ([]string, error) { dir, err := os.Open(dirpath) @@ -23,3 +93,45 @@ func ReadDir(dirpath string) ([]string, error) { sort.Strings(names) return names, nil } + +// Rename safely renames a file. +func Rename(from, to string) error { + if err := os.Rename(from, to); err != nil { + return err + } + + // Directory was renamed; sync parent dir to persist rename. + pdir, err := OpenDir(filepath.Dir(to)) + if err != nil { + return err + } + + if err = Fsync(pdir); err != nil { + pdir.Close() + return err + } + return pdir.Close() +} + +// Replace moves a file or directory to a new location and deletes any previous data. +// It is not atomic. 
+func Replace(from, to string) error { + if err := os.RemoveAll(to); err != nil { + return err + } + if err := os.Rename(from, to); err != nil { + return err + } + + // Directory was renamed; sync parent dir to persist rename. + pdir, err := OpenDir(filepath.Dir(to)) + if err != nil { + return err + } + + if err = Fsync(pdir); err != nil { + pdir.Close() + return err + } + return pdir.Close() +} diff --git a/vendor/github.com/prometheus/tsdb/fileutil/flock.go b/vendor/github.com/prometheus/tsdb/fileutil/flock.go new file mode 100644 index 00000000000..d5eaa7ca2ad --- /dev/null +++ b/vendor/github.com/prometheus/tsdb/fileutil/flock.go @@ -0,0 +1,41 @@ +// Copyright 2016 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fileutil + +import ( + "os" + "path/filepath" +) + +// Releaser provides the Release method to release a file lock. +type Releaser interface { + Release() error +} + +// Flock locks the file with the provided name. If the file does not exist, it is +// created. The returned Releaser is used to release the lock. existed is true +// if the file to lock already existed. A non-nil error is returned if the +// locking has failed. Neither this function nor the returned Releaser is +// goroutine-safe. 
+func Flock(fileName string) (r Releaser, existed bool, err error) { + if err = os.MkdirAll(filepath.Dir(fileName), 0755); err != nil { + return nil, false, err + } + + _, err = os.Stat(fileName) + existed = err == nil + + r, err = newLock(fileName) + return r, existed, err +} diff --git a/vendor/github.com/prometheus/tsdb/fileutil/flock_plan9.go b/vendor/github.com/prometheus/tsdb/fileutil/flock_plan9.go new file mode 100644 index 00000000000..8a3d44c5e14 --- /dev/null +++ b/vendor/github.com/prometheus/tsdb/fileutil/flock_plan9.go @@ -0,0 +1,32 @@ +// Copyright 2016 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fileutil + +import "os" + +type plan9Lock struct { + f *os.File +} + +func (l *plan9Lock) Release() error { + return l.f.Close() +} + +func newLock(fileName string) (Releaser, error) { + f, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE, os.ModeExclusive|0644) + if err != nil { + return nil, err + } + return &plan9Lock{f}, nil +} diff --git a/vendor/github.com/prometheus/tsdb/fileutil/flock_solaris.go b/vendor/github.com/prometheus/tsdb/fileutil/flock_solaris.go new file mode 100644 index 00000000000..7f527ae6c49 --- /dev/null +++ b/vendor/github.com/prometheus/tsdb/fileutil/flock_solaris.go @@ -0,0 +1,59 @@ +// Copyright 2016 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build solaris + +package fileutil + +import ( + "os" + "syscall" +) + +type unixLock struct { + f *os.File +} + +func (l *unixLock) Release() error { + if err := l.set(false); err != nil { + return err + } + return l.f.Close() +} + +func (l *unixLock) set(lock bool) error { + flock := syscall.Flock_t{ + Type: syscall.F_UNLCK, + Start: 0, + Len: 0, + Whence: 1, + } + if lock { + flock.Type = syscall.F_WRLCK + } + return syscall.FcntlFlock(l.f.Fd(), syscall.F_SETLK, &flock) +} + +func newLock(fileName string) (Releaser, error) { + f, err := os.OpenFile(fileName, os.O_RDWR|os.O_CREATE, 0644) + if err != nil { + return nil, err + } + l := &unixLock{f} + err = l.set(true) + if err != nil { + f.Close() + return nil, err + } + return l, nil +} diff --git a/vendor/github.com/prometheus/tsdb/fileutil/flock_unix.go b/vendor/github.com/prometheus/tsdb/fileutil/flock_unix.go new file mode 100644 index 00000000000..f493fbd831d --- /dev/null +++ b/vendor/github.com/prometheus/tsdb/fileutil/flock_unix.go @@ -0,0 +1,54 @@ +// Copyright 2016 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// +build darwin dragonfly freebsd linux netbsd openbsd + +package fileutil + +import ( + "os" + "syscall" +) + +type unixLock struct { + f *os.File +} + +func (l *unixLock) Release() error { + if err := l.set(false); err != nil { + return err + } + return l.f.Close() +} + +func (l *unixLock) set(lock bool) error { + how := syscall.LOCK_UN + if lock { + how = syscall.LOCK_EX + } + return syscall.Flock(int(l.f.Fd()), how|syscall.LOCK_NB) +} + +func newLock(fileName string) (Releaser, error) { + f, err := os.OpenFile(fileName, os.O_RDWR|os.O_CREATE, 0644) + if err != nil { + return nil, err + } + l := &unixLock{f} + err = l.set(true) + if err != nil { + f.Close() + return nil, err + } + return l, nil +} diff --git a/vendor/github.com/prometheus/tsdb/fileutil/flock_windows.go b/vendor/github.com/prometheus/tsdb/fileutil/flock_windows.go new file mode 100644 index 00000000000..1c17ff4ea30 --- /dev/null +++ b/vendor/github.com/prometheus/tsdb/fileutil/flock_windows.go @@ -0,0 +1,36 @@ +// Copyright 2016 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package fileutil + +import "syscall" + +type windowsLock struct { + fd syscall.Handle +} + +func (fl *windowsLock) Release() error { + return syscall.Close(fl.fd) +} + +func newLock(fileName string) (Releaser, error) { + pathp, err := syscall.UTF16PtrFromString(fileName) + if err != nil { + return nil, err + } + fd, err := syscall.CreateFile(pathp, syscall.GENERIC_READ|syscall.GENERIC_WRITE, 0, nil, syscall.CREATE_ALWAYS, syscall.FILE_ATTRIBUTE_NORMAL, 0) + if err != nil { + return nil, err + } + return &windowsLock{fd}, nil +} diff --git a/vendor/github.com/prometheus/tsdb/fileutil/sync_linux.go b/vendor/github.com/prometheus/tsdb/fileutil/sync_linux.go index 1145904bfaf..1bbced915e9 100644 --- a/vendor/github.com/prometheus/tsdb/fileutil/sync_linux.go +++ b/vendor/github.com/prometheus/tsdb/fileutil/sync_linux.go @@ -31,4 +31,4 @@ func Fsync(f *os.File) error { // to be correctly handled. func Fdatasync(f *os.File) error { return syscall.Fdatasync(int(f.Fd())) -} \ No newline at end of file +} diff --git a/vendor/github.com/prometheus/tsdb/head.go b/vendor/github.com/prometheus/tsdb/head.go index c76c139d5a5..92d8a128fa8 100644 --- a/vendor/github.com/prometheus/tsdb/head.go +++ b/vendor/github.com/prometheus/tsdb/head.go @@ -15,6 +15,7 @@ package tsdb import ( "math" + "path/filepath" "runtime" "sort" "strings" @@ -30,6 +31,7 @@ import ( "github.com/prometheus/tsdb/chunks" "github.com/prometheus/tsdb/index" "github.com/prometheus/tsdb/labels" + "github.com/prometheus/tsdb/wal" ) var ( @@ -53,9 +55,10 @@ var ( type Head struct { chunkRange int64 metrics *headMetrics - wal WAL + wal *wal.WAL logger log.Logger appendPool sync.Pool + bytesPool sync.Pool minTime, maxTime int64 lastSeriesID uint64 @@ -69,23 +72,29 @@ type Head struct { postings *index.MemPostings // postings lists for terms - tombstones memTombstones + tombstones *memTombstones } type headMetrics struct { - activeAppenders prometheus.Gauge - series prometheus.Gauge - seriesCreated 
prometheus.Counter - seriesRemoved prometheus.Counter - seriesNotFound prometheus.Counter - chunks prometheus.Gauge - chunksCreated prometheus.Gauge - chunksRemoved prometheus.Gauge - gcDuration prometheus.Summary - minTime prometheus.GaugeFunc - maxTime prometheus.GaugeFunc - samplesAppended prometheus.Counter - walTruncateDuration prometheus.Summary + activeAppenders prometheus.Gauge + series prometheus.Gauge + seriesCreated prometheus.Counter + seriesRemoved prometheus.Counter + seriesNotFound prometheus.Counter + chunks prometheus.Gauge + chunksCreated prometheus.Counter + chunksRemoved prometheus.Counter + gcDuration prometheus.Summary + minTime prometheus.GaugeFunc + maxTime prometheus.GaugeFunc + samplesAppended prometheus.Counter + walTruncateDuration prometheus.Summary + headTruncateFail prometheus.Counter + headTruncateTotal prometheus.Counter + checkpointDeleteFail prometheus.Counter + checkpointDeleteTotal prometheus.Counter + checkpointCreationFail prometheus.Counter + checkpointCreationTotal prometheus.Counter } func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics { @@ -99,27 +108,27 @@ func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics { Name: "prometheus_tsdb_head_series", Help: "Total number of series in the head block.", }) - m.seriesCreated = prometheus.NewGauge(prometheus.GaugeOpts{ + m.seriesCreated = prometheus.NewCounter(prometheus.CounterOpts{ Name: "prometheus_tsdb_head_series_created_total", Help: "Total number of series created in the head", }) - m.seriesRemoved = prometheus.NewGauge(prometheus.GaugeOpts{ + m.seriesRemoved = prometheus.NewCounter(prometheus.CounterOpts{ Name: "prometheus_tsdb_head_series_removed_total", Help: "Total number of series removed in the head", }) m.seriesNotFound = prometheus.NewCounter(prometheus.CounterOpts{ - Name: "prometheus_tsdb_head_series_not_found", + Name: "prometheus_tsdb_head_series_not_found_total", Help: "Total number of requests for series that were not found.", }) 
m.chunks = prometheus.NewGauge(prometheus.GaugeOpts{ Name: "prometheus_tsdb_head_chunks", Help: "Total number of chunks in the head block.", }) - m.chunksCreated = prometheus.NewGauge(prometheus.GaugeOpts{ + m.chunksCreated = prometheus.NewCounter(prometheus.CounterOpts{ Name: "prometheus_tsdb_head_chunks_created_total", Help: "Total number of chunks created in the head", }) - m.chunksRemoved = prometheus.NewGauge(prometheus.GaugeOpts{ + m.chunksRemoved = prometheus.NewCounter(prometheus.CounterOpts{ Name: "prometheus_tsdb_head_chunks_removed_total", Help: "Total number of chunks removed in the head", }) @@ -147,6 +156,30 @@ func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics { Name: "prometheus_tsdb_head_samples_appended_total", Help: "Total number of appended samples.", }) + m.headTruncateFail = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "prometheus_tsdb_head_truncations_failed_total", + Help: "Total number of head truncations that failed.", + }) + m.headTruncateTotal = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "prometheus_tsdb_head_truncations_total", + Help: "Total number of head truncations attempted.", + }) + m.checkpointDeleteFail = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "prometheus_tsdb_checkpoint_deletions_failed_total", + Help: "Total number of checkpoint deletions that failed.", + }) + m.checkpointDeleteTotal = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "prometheus_tsdb_checkpoint_deletions_total", + Help: "Total number of checkpoint deletions attempted.", + }) + m.checkpointCreationFail = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "prometheus_tsdb_checkpoint_creations_failed_total", + Help: "Total number of checkpoint creations that failed.", + }) + m.checkpointCreationTotal = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "prometheus_tsdb_checkpoint_creations_total", + Help: "Total number of checkpoint creations attempted.", + }) if r != nil { r.MustRegister( @@ -163,19 
+196,22 @@ func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics { m.gcDuration, m.walTruncateDuration, m.samplesAppended, + m.headTruncateFail, + m.headTruncateTotal, + m.checkpointDeleteFail, + m.checkpointDeleteTotal, + m.checkpointCreationFail, + m.checkpointCreationTotal, ) } return m } // NewHead opens the head block in dir. -func NewHead(r prometheus.Registerer, l log.Logger, wal WAL, chunkRange int64) (*Head, error) { +func NewHead(r prometheus.Registerer, l log.Logger, wal *wal.WAL, chunkRange int64) (*Head, error) { if l == nil { l = log.NewNopLogger() } - if wal == nil { - wal = NopWAL() - } if chunkRange < 1 { return nil, errors.Errorf("invalid chunk range %d", chunkRange) } @@ -183,13 +219,13 @@ func NewHead(r prometheus.Registerer, l log.Logger, wal WAL, chunkRange int64) ( wal: wal, logger: l, chunkRange: chunkRange, - minTime: math.MinInt64, + minTime: math.MaxInt64, maxTime: math.MinInt64, series: newStripeSeries(), values: map[string]stringset{}, symbols: map[string]struct{}{}, postings: index.NewUnorderedMemPostings(), - tombstones: memTombstones{}, + tombstones: NewMemTombstones(), } h.metrics = newHeadMetrics(h, r) @@ -200,15 +236,17 @@ func NewHead(r prometheus.Registerer, l log.Logger, wal WAL, chunkRange int64) ( // them on to other workers. // Samples before the mint timestamp are discarded. 
func (h *Head) processWALSamples( - mint int64, + minValidTime int64, partition, total uint64, input <-chan []RefSample, output chan<- []RefSample, ) (unknownRefs uint64) { defer close(output) + mint, maxt := int64(math.MaxInt64), int64(math.MinInt64) + for samples := range input { for _, s := range samples { - if s.T < mint || s.Ref%total != partition { + if s.T < minValidTime || s.Ref%total != partition { continue } ms := h.series.getByID(s.Ref) @@ -221,18 +259,48 @@ func (h *Head) processWALSamples( h.metrics.chunksCreated.Inc() h.metrics.chunks.Inc() } + if s.T > maxt { + maxt = s.T + } + if s.T < mint { + mint = s.T + } } output <- samples } + h.updateMinMaxTime(mint, maxt) + return unknownRefs } -// ReadWAL initializes the head by consuming the write ahead log. -func (h *Head) ReadWAL() error { - defer h.postings.EnsureOrder() +func (h *Head) updateMinMaxTime(mint, maxt int64) { + for { + lt := h.MinTime() + if mint >= lt { + break + } + if atomic.CompareAndSwapInt64(&h.minTime, lt, mint) { + break + } + } + for { + ht := h.MaxTime() + if maxt <= ht { + break + } + if atomic.CompareAndSwapInt64(&h.maxTime, ht, maxt) { + break + } + } +} - r := h.wal.Reader() - mint := h.MinTime() +func (h *Head) loadWAL(r *wal.Reader) error { + minValidTime := h.MinTime() + // If the min time is still uninitialized (no persisted blocks yet), + // we accept all sample timestamps from the WAL. + if minValidTime == math.MaxInt64 { + minValidTime = math.MinInt64 + } // Track number of samples that referenced a series we don't know about // for error reporting. 
@@ -253,7 +321,7 @@ func (h *Head) ReadWAL() error { output := make(chan []RefSample, 300) go func(i int, input <-chan []RefSample, output chan<- []RefSample) { - unknown := h.processWALSamples(mint, uint64(i), uint64(n), input, output) + unknown := h.processWALSamples(minValidTime, uint64(i), uint64(n), input, output) atomic.AddUint64(&unknownRefs, unknown) wg.Done() }(i, input, output) @@ -263,49 +331,71 @@ func (h *Head) ReadWAL() error { input = output } - // TODO(fabxc): series entries spread between samples can starve the sample workers. - // Even with bufferd channels, this can impact startup time with lots of series churn. - // We must not paralellize series creation itself but could make the indexing asynchronous. - seriesFunc := func(series []RefSeries) { - for _, s := range series { - h.getOrCreateWithID(s.Ref, s.Labels.Hash(), s.Labels) + var ( + dec RecordDecoder + series []RefSeries + samples []RefSample + tstones []Stone + ) + for r.Next() { + series, samples, tstones = series[:0], samples[:0], tstones[:0] + rec := r.Record() + + switch dec.Type(rec) { + case RecordSeries: + series, err := dec.Series(rec, series) + if err != nil { + return errors.Wrap(err, "decode series") + } + for _, s := range series { + h.getOrCreateWithID(s.Ref, s.Labels.Hash(), s.Labels) - if h.lastSeriesID < s.Ref { - h.lastSeriesID = s.Ref + if h.lastSeriesID < s.Ref { + h.lastSeriesID = s.Ref + } } - } - } - samplesFunc := func(samples []RefSample) { - // We split up the samples into chunks of 5000 samples or less. - // With O(300 * #cores) in-flight sample batches, large scrapes could otherwise - // cause thousands of very large in flight buffers occupying large amounts - // of unused memory. 
- for len(samples) > 0 { - n := 5000 - if len(samples) < n { - n = len(samples) + case RecordSamples: + samples, err := dec.Samples(rec, samples) + if err != nil { + return errors.Wrap(err, "decode samples") } - var buf []RefSample - select { - case buf = <-input: - default: + // We split up the samples into chunks of 5000 samples or less. + // With O(300 * #cores) in-flight sample batches, large scrapes could otherwise + // cause thousands of very large in flight buffers occupying large amounts + // of unused memory. + for len(samples) > 0 { + n := 5000 + if len(samples) < n { + n = len(samples) + } + var buf []RefSample + select { + case buf = <-input: + default: + } + firstInput <- append(buf[:0], samples[:n]...) + samples = samples[n:] } - firstInput <- append(buf[:0], samples[:n]...) - samples = samples[n:] - } - } - deletesFunc := func(stones []Stone) { - for _, s := range stones { - for _, itv := range s.intervals { - if itv.Maxt < mint { - continue + case RecordTombstones: + tstones, err := dec.Tombstones(rec, tstones) + if err != nil { + return errors.Wrap(err, "decode tombstones") + } + for _, s := range tstones { + for _, itv := range s.intervals { + if itv.Maxt < minValidTime { + continue + } + h.tombstones.addInterval(s.ref, itv) } - h.tombstones.add(s.ref, itv) } + default: + return errors.Errorf("invalid record type %v", dec.Type(rec)) } } - - err := r.Read(seriesFunc, samplesFunc, deletesFunc) + if r.Err() != nil { + return errors.Wrap(r.Err(), "read records") + } // Signal termination to first worker and wait for last one to close its output channel. close(firstInput) @@ -313,20 +403,69 @@ func (h *Head) ReadWAL() error { } wg.Wait() + if unknownRefs > 0 { + level.Warn(h.logger).Log("msg", "unknown series references", "count", unknownRefs) + } + return nil +} + +// Init loads data from the write ahead log and prepares the head for writes. 
+func (h *Head) Init() error { + defer h.postings.EnsureOrder() + + if h.wal == nil { + return nil + } + + // Backfill the checkpoint first if it exists. + dir, startFrom, err := LastCheckpoint(h.wal.Dir()) + if err != nil && err != ErrNotFound { + return errors.Wrap(err, "find last checkpoint") + } + if err == nil { + sr, err := wal.NewSegmentsReader(filepath.Join(h.wal.Dir(), dir)) + if err != nil { + return errors.Wrap(err, "open checkpoint") + } + defer sr.Close() + + // A corrupted checkpoint is a hard error for now and requires user + // intervention. There's likely little data that can be recovered anyway. + if err := h.loadWAL(wal.NewReader(sr)); err != nil { + return errors.Wrap(err, "backfill checkpoint") + } + startFrom++ + } + + // Backfill segments from the last checkpoint onwards + sr, err := wal.NewSegmentsRangeReader(h.wal.Dir(), startFrom, -1) if err != nil { - return errors.Wrap(err, "consume WAL") + return errors.Wrap(err, "open WAL segments") } - if unknownRefs > 0 { - level.Warn(h.logger).Log("msg", "unknown series references in WAL samples", "count", unknownRefs) + defer sr.Close() + + err = h.loadWAL(wal.NewReader(sr)) + if err == nil { + return nil + } + level.Warn(h.logger).Log("msg", "encountered WAL error, attempting repair", "err", err) + + if err := h.wal.Repair(err); err != nil { + return errors.Wrap(err, "repair corrupted WAL") } return nil } -// Truncate removes all data before mint from the head block and truncates its WAL. -func (h *Head) Truncate(mint int64) error { - initialize := h.MinTime() == math.MinInt64 +// Truncate removes old data before mint from the head. 
+func (h *Head) Truncate(mint int64) (err error) { + defer func() { + if err != nil { + h.metrics.headTruncateFail.Inc() + } + }() + initialize := h.MinTime() == math.MaxInt64 - if h.MinTime() >= mint { + if h.MinTime() >= mint && !initialize { return nil } atomic.StoreInt64(&h.minTime, mint) @@ -342,24 +481,60 @@ func (h *Head) Truncate(mint int64) error { return nil } + h.metrics.headTruncateTotal.Inc() start := time.Now() h.gc() level.Info(h.logger).Log("msg", "head GC completed", "duration", time.Since(start)) h.metrics.gcDuration.Observe(time.Since(start).Seconds()) + if h.wal == nil { + return nil + } start = time.Now() + first, last, err := h.wal.Segments() + if err != nil { + return errors.Wrap(err, "get segment range") + } + last-- // Never consider last segment for checkpoint. + if last < 0 { + return nil // no segments yet. + } + // The lower third of segments should contain mostly obsolete samples. + // If we have less than three segments, it's not worth checkpointing yet. + last = first + (last-first)/3 + if last <= first { + return nil + } + keep := func(id uint64) bool { return h.series.getByID(id) != nil } - if err := h.wal.Truncate(mint, keep); err == nil { - level.Info(h.logger).Log("msg", "WAL truncation completed", "duration", time.Since(start)) - } else { - level.Error(h.logger).Log("msg", "WAL truncation failed", "err", err, "duration", time.Since(start)) + h.metrics.checkpointCreationTotal.Inc() + if _, err = Checkpoint(h.wal, first, last, keep, mint); err != nil { + h.metrics.checkpointCreationFail.Inc() + return errors.Wrap(err, "create checkpoint") + } + if err := h.wal.Truncate(last + 1); err != nil { + // If truncating fails, we'll just try again at the next checkpoint. + // Leftover segments will just be ignored in the future if there's a checkpoint + // that supersedes them. 
+ level.Error(h.logger).Log("msg", "truncating segments failed", "err", err) + } + h.metrics.checkpointDeleteTotal.Inc() + if err := DeleteCheckpoints(h.wal.Dir(), last); err != nil { + // Leftover old checkpoints do not cause problems down the line beyond + // occupying disk space. + // They will just be ignored since a higher checkpoint exists. + level.Error(h.logger).Log("msg", "delete old checkpoints", "err", err) + h.metrics.checkpointDeleteFail.Inc() } h.metrics.walTruncateDuration.Observe(time.Since(start).Seconds()) + level.Info(h.logger).Log("msg", "WAL checkpoint complete", + "first", first, "last", last, "duration", time.Since(start)) + return nil } @@ -367,10 +542,7 @@ func (h *Head) Truncate(mint int64) error { // for a compltely fresh head with an empty WAL. // Returns true if the initialization took an effect. func (h *Head) initTime(t int64) (initialized bool) { - // In the init state, the head has a high timestamp of math.MinInt64. - mint, _ := rangeForTimestamp(t, h.chunkRange) - - if !atomic.CompareAndSwapInt64(&h.minTime, math.MinInt64, mint) { + if !atomic.CompareAndSwapInt64(&h.minTime, math.MaxInt64, t) { return false } // Ensure that max time is initialized to at least the min time we just set. @@ -441,7 +613,7 @@ func (h *Head) Appender() Appender { // The head cache might not have a starting point yet. The init appender // picks up the first appended timestamp as the base. 
- if h.MinTime() == math.MinInt64 { + if h.MinTime() == math.MaxInt64 { return &initAppender{head: h} } return h.appender() @@ -449,10 +621,11 @@ func (h *Head) Appender() Appender { func (h *Head) appender() *headAppender { return &headAppender{ - head: h, - mint: h.MaxTime() - h.chunkRange/2, - samples: h.getAppendBuffer(), - highTimestamp: math.MinInt64, + head: h, + minValidTime: h.MaxTime() - h.chunkRange/2, + mint: math.MaxInt64, + maxt: math.MinInt64, + samples: h.getAppendBuffer(), } } @@ -468,17 +641,29 @@ func (h *Head) putAppendBuffer(b []RefSample) { h.appendPool.Put(b[:0]) } +func (h *Head) getBytesBuffer() []byte { + b := h.bytesPool.Get() + if b == nil { + return make([]byte, 0, 1024) + } + return b.([]byte) +} + +func (h *Head) putBytesBuffer(b []byte) { + h.bytesPool.Put(b[:0]) +} + type headAppender struct { - head *Head - mint int64 + head *Head + minValidTime int64 // No samples below this timestamp are allowed. + mint, maxt int64 - series []RefSeries - samples []RefSample - highTimestamp int64 + series []RefSeries + samples []RefSample } func (a *headAppender) Add(lset labels.Labels, t int64, v float64) (uint64, error) { - if t < a.mint { + if t < a.minValidTime { return 0, ErrOutOfBounds } @@ -493,23 +678,27 @@ func (a *headAppender) Add(lset labels.Labels, t int64, v float64) (uint64, erro } func (a *headAppender) AddFast(ref uint64, t int64, v float64) error { - s := a.head.series.getByID(ref) + if t < a.minValidTime { + return ErrOutOfBounds + } + s := a.head.series.getByID(ref) if s == nil { return errors.Wrap(ErrNotFound, "unknown series") } s.Lock() - err := s.appendable(t, v) - s.Unlock() - - if err != nil { + if err := s.appendable(t, v); err != nil { + s.Unlock() return err } + s.pendingCommit = true + s.Unlock() + if t < a.mint { - return ErrOutOfBounds + a.mint = t } - if t > a.highTimestamp { - a.highTimestamp = t + if t > a.maxt { + a.maxt = t } a.samples = append(a.samples, RefSample{ @@ -521,14 +710,42 @@ func (a *headAppender) 
AddFast(ref uint64, t int64, v float64) error { return nil } -func (a *headAppender) Commit() error { - defer a.Rollback() +func (a *headAppender) log() error { + if a.head.wal == nil { + return nil + } - if err := a.head.wal.LogSeries(a.series); err != nil { - return err + buf := a.head.getBytesBuffer() + defer func() { a.head.putBytesBuffer(buf) }() + + var rec []byte + var enc RecordEncoder + + if len(a.series) > 0 { + rec = enc.Series(a.series, buf) + buf = rec[:0] + + if err := a.head.wal.Log(rec); err != nil { + return errors.Wrap(err, "log series") + } + } + if len(a.samples) > 0 { + rec = enc.Samples(a.samples, buf) + buf = rec[:0] + + if err := a.head.wal.Log(rec); err != nil { + return errors.Wrap(err, "log samples") + } } - if err := a.head.wal.LogSamples(a.samples); err != nil { - return errors.Wrap(err, "WAL log samples") + return nil +} + +func (a *headAppender) Commit() error { + defer a.head.metrics.activeAppenders.Dec() + defer a.head.putAppendBuffer(a.samples) + + if err := a.log(); err != nil { + return errors.Wrap(err, "write to WAL") } total := len(a.samples) @@ -536,6 +753,7 @@ func (a *headAppender) Commit() error { for _, s := range a.samples { s.series.Lock() ok, chunkCreated := s.series.append(s.T, s.V) + s.series.pendingCommit = false s.series.Unlock() if !ok { @@ -548,25 +766,24 @@ func (a *headAppender) Commit() error { } a.head.metrics.samplesAppended.Add(float64(total)) - - for { - ht := a.head.MaxTime() - if a.highTimestamp <= ht { - break - } - if atomic.CompareAndSwapInt64(&a.head.maxTime, ht, a.highTimestamp) { - break - } - } + a.head.updateMinMaxTime(a.mint, a.maxt) return nil } func (a *headAppender) Rollback() error { a.head.metrics.activeAppenders.Dec() + for _, s := range a.samples { + s.series.Lock() + s.series.pendingCommit = false + s.series.Unlock() + } a.head.putAppendBuffer(a.samples) - return nil + // Series are created in the head memory regardless of rollback. Thus we have + // to log them to the WAL in any case. 
+ a.samples = nil + return a.log() } // Delete all samples in the range of [mint, maxt] for series that satisfy the given @@ -587,19 +804,27 @@ func (h *Head) Delete(mint, maxt int64, ms ...labels.Matcher) error { for p.Next() { series := h.series.getByID(p.At()) + t0, t1 := series.minTime(), series.maxTime() + if t0 == math.MinInt64 || t1 == math.MinInt64 { + continue + } // Delete only until the current values and not beyond. - t0, t1 := clampInterval(mint, maxt, series.minTime(), series.maxTime()) + t0, t1 = clampInterval(mint, maxt, t0, t1) stones = append(stones, Stone{p.At(), Intervals{{t0, t1}}}) } if p.Err() != nil { return p.Err() } - if err := h.wal.LogDeletes(stones); err != nil { - return err + var enc RecordEncoder + + if h.wal != nil { + if err := h.wal.Log(enc.Tombstones(stones, nil)); err != nil { + return err + } } for _, s := range stones { - h.tombstones.add(s.ref, s.intervals[0]) + h.tombstones.addInterval(s.ref, s.intervals[0]) } return nil } @@ -626,7 +851,7 @@ func (h *Head) gc() { symbols := make(map[string]struct{}) values := make(map[string]stringset, len(h.values)) - h.postings.Iter(func(t labels.Label, _ index.Postings) error { + if err := h.postings.Iter(func(t labels.Label, _ index.Postings) error { symbols[t.Name] = struct{}{} symbols[t.Value] = struct{}{} @@ -637,7 +862,10 @@ func (h *Head) gc() { } ss.set(t.Value) return nil - }) + }); err != nil { + // This should never happen, as the iteration function only returns nil. + panic(err) + } h.symMtx.Lock() @@ -688,6 +916,9 @@ func (h *Head) MaxTime() int64 { // Close flushes the WAL and closes the head. func (h *Head) Close() error { + if h.wal == nil { + return nil + } return h.wal.Close() } @@ -729,19 +960,14 @@ func (h *headChunkReader) Chunk(ref uint64) (chunkenc.Chunk, error) { s.Lock() c := s.chunk(int(cid)) - // This means that the chunk has been garbage collected. - if c == nil { + // This means that the chunk has been garbage collected or is outside + // the specified range. 
+ if c == nil || !c.OverlapsClosedInterval(h.mint, h.maxt) { s.Unlock() return nil, ErrNotFound } - - mint, maxt := c.minTime, c.maxTime s.Unlock() - // Do not expose chunks that are outside of the specified range. - if c == nil || !intervalOverlap(mint, maxt, h.mint, h.maxt) { - return nil, ErrNotFound - } return &safeChunk{ Chunk: c.chunk, s: s, @@ -788,14 +1014,13 @@ func (h *headIndexReader) LabelValues(names ...string) (index.StringTuples, erro if len(names) != 1 { return nil, errInvalidSize } - var sl []string h.head.symMtx.RLock() - defer h.head.symMtx.RUnlock() - + sl := make([]string, 0, len(h.head.values[names[0]])) for s := range h.head.values[names[0]] { sl = append(sl, s) } + h.head.symMtx.RUnlock() sort.Strings(sl) return index.NewStringTuples(sl, len(names)) @@ -846,7 +1071,7 @@ func (h *headIndexReader) Series(ref uint64, lbls *labels.Labels, chks *[]chunks for i, c := range s.chunks { // Do not expose chunks that are outside of the specified range. - if !intervalOverlap(c.minTime, c.maxTime, h.mint, h.maxt) { + if !c.OverlapsClosedInterval(h.mint, h.maxt) { continue } *chks = append(*chks, chunks.Meta{ @@ -1007,7 +1232,7 @@ func (s *stripeSeries) gc(mint int64) (map[uint64]struct{}, int) { series.Lock() rmChunks += series.truncateChunksBefore(mint) - if len(series.chunks) > 0 { + if len(series.chunks) > 0 || series.pendingCommit { series.Unlock() continue } @@ -1087,6 +1312,14 @@ type sample struct { v float64 } +func (s sample) T() int64 { + return s.t +} + +func (s sample) V() float64 { + return s.v +} + // memSeries is the in-memory representation of a series. None of its methods // are goroutine safe and it is the caller's responsibility to lock it. type memSeries struct { @@ -1098,19 +1331,27 @@ type memSeries struct { chunkRange int64 firstChunkID int - nextAt int64 // timestamp at which to cut the next chunk. - lastValue float64 - sampleBuf [4]sample + nextAt int64 // Timestamp at which to cut the next chunk. 
+ lastValue float64 + sampleBuf [4]sample + pendingCommit bool // Whether there are samples waiting to be committed to this series. app chunkenc.Appender // Current appender for the chunk. } func (s *memSeries) minTime() int64 { + if len(s.chunks) == 0 { + return math.MinInt64 + } return s.chunks[0].minTime } func (s *memSeries) maxTime() int64 { - return s.head().maxTime + c := s.head() + if c == nil { + return math.MinInt64 + } + return c.maxTime } func (s *memSeries) cut(mint int64) *memChunk { @@ -1194,6 +1435,9 @@ func (s *memSeries) truncateChunksBefore(mint int64) (removed int) { // append adds the sample (t, v) to the series. func (s *memSeries) append(t int64, v float64) (success, chunkCreated bool) { + // Based on Gorilla white papers this offers near-optimal compression ratio + // so anything bigger that this has diminishing returns and increases + // the time range within which we have to decompress all samples. const samplesPerChunk = 120 c := s.head() @@ -1278,6 +1522,11 @@ type memChunk struct { minTime, maxTime int64 } +// Returns true if the chunk overlaps [mint, maxt]. +func (mc *memChunk) OverlapsClosedInterval(mint, maxt int64) bool { + return mc.minTime <= maxt && mint <= mc.maxTime +} + type memSafeIterator struct { chunkenc.Iterator diff --git a/vendor/github.com/prometheus/tsdb/index/index.go b/vendor/github.com/prometheus/tsdb/index/index.go index 3cad716c073..17acf9ab291 100644 --- a/vendor/github.com/prometheus/tsdb/index/index.go +++ b/vendor/github.com/prometheus/tsdb/index/index.go @@ -271,7 +271,9 @@ func (w *Writer) AddSeries(ref uint64, lset labels.Labels, chunks ...chunks.Meta } // We add padding to 16 bytes to increase the addressable space we get through 4 byte // series references. 
- w.addPadding(16) + if err := w.addPadding(16); err != nil { + return errors.Errorf("failed to write padding bytes: %v", err) + } if w.pos%16 != 0 { return errors.Errorf("series write not 16-byte aligned at %d", w.pos) @@ -392,7 +394,7 @@ func (w *Writer) WriteLabelIndex(names []string, values []string) error { w.buf2.putBE32int(valt.Len()) // here we have an index for the symbol file if v2, otherwise it's an offset - for _, v := range valt.s { + for _, v := range valt.entries { index, ok := w.symbols[v] if !ok { return errors.Errorf("symbol entry for %q does not exist", v) @@ -740,8 +742,8 @@ func (r *Reader) decbufUvarintAt(off int) decbuf { b := r.b.Range(off, off+binary.MaxVarintLen32) l, n := binary.Uvarint(b) - if n > binary.MaxVarintLen32 { - return decbuf{e: errors.New("invalid uvarint")} + if n <= 0 || n > binary.MaxVarintLen32 { + return decbuf{e: errors.Errorf("invalid uvarint %d", n)} } if r.b.Len() < off+n+int(l)+4 { @@ -780,7 +782,7 @@ func (r *Reader) readSymbols(off int) error { for d.err() == nil && d.len() > 0 && cnt > 0 { s := d.uvarintStr() - r.symbols[uint32(nextPos)] = s + r.symbols[nextPos] = s if r.version == 2 { nextPos++ @@ -800,7 +802,7 @@ func (r *Reader) readOffsetTable(off uint64, f func([]string, uint64) error) err cnt := d.be32() for d.err() == nil && d.len() > 0 && cnt > 0 { - keyCount := int(d.uvarint()) + keyCount := d.uvarint() keys := make([]string, 0, keyCount) for i := 0; i < keyCount; i++ { @@ -868,9 +870,9 @@ func (r *Reader) LabelValues(names ...string) (StringTuples, error) { return nil, errors.Wrap(d.err(), "read label value index") } st := &serializedStringTuples{ - l: nc, - b: d.get(), - lookup: r.lookupSymbol, + idsCount: nc, + idsBytes: d.get(), + lookup: r.lookupSymbol, } return st, nil } @@ -934,33 +936,33 @@ func (r *Reader) SortedPostings(p Postings) Postings { } type stringTuples struct { - l int // tuple length - s []string // flattened tuple entries + length int // tuple length + entries []string // flattened 
tuple entries } -func NewStringTuples(s []string, l int) (*stringTuples, error) { - if len(s)%l != 0 { +func NewStringTuples(entries []string, length int) (*stringTuples, error) { + if len(entries)%length != 0 { return nil, errors.Wrap(errInvalidSize, "string tuple list") } - return &stringTuples{s: s, l: l}, nil + return &stringTuples{entries: entries, length: length}, nil } -func (t *stringTuples) Len() int { return len(t.s) / t.l } -func (t *stringTuples) At(i int) ([]string, error) { return t.s[i : i+t.l], nil } +func (t *stringTuples) Len() int { return len(t.entries) / t.length } +func (t *stringTuples) At(i int) ([]string, error) { return t.entries[i : i+t.length], nil } func (t *stringTuples) Swap(i, j int) { - c := make([]string, t.l) - copy(c, t.s[i:i+t.l]) + c := make([]string, t.length) + copy(c, t.entries[i:i+t.length]) - for k := 0; k < t.l; k++ { - t.s[i+k] = t.s[j+k] - t.s[j+k] = c[k] + for k := 0; k < t.length; k++ { + t.entries[i+k] = t.entries[j+k] + t.entries[j+k] = c[k] } } func (t *stringTuples) Less(i, j int) bool { - for k := 0; k < t.l; k++ { - d := strings.Compare(t.s[i+k], t.s[j+k]) + for k := 0; k < t.length; k++ { + d := strings.Compare(t.entries[i+k], t.entries[j+k]) if d < 0 { return true @@ -973,23 +975,23 @@ func (t *stringTuples) Less(i, j int) bool { } type serializedStringTuples struct { - l int - b []byte - lookup func(uint32) (string, error) + idsCount int + idsBytes []byte // bytes containing the ids pointing to the string in the lookup table. 
+ lookup func(uint32) (string, error) } func (t *serializedStringTuples) Len() int { - return len(t.b) / (4 * t.l) + return len(t.idsBytes) / (4 * t.idsCount) } func (t *serializedStringTuples) At(i int) ([]string, error) { - if len(t.b) < (i+t.l)*4 { + if len(t.idsBytes) < (i+t.idsCount)*4 { return nil, errInvalidSize } - res := make([]string, 0, t.l) + res := make([]string, 0, t.idsCount) - for k := 0; k < t.l; k++ { - offset := binary.BigEndian.Uint32(t.b[(i+k)*4:]) + for k := 0; k < t.idsCount; k++ { + offset := binary.BigEndian.Uint32(t.idsBytes[(i+k)*4:]) s, err := t.lookup(offset) if err != nil { @@ -1038,7 +1040,7 @@ func (dec *Decoder) Series(b []byte, lbls *labels.Labels, chks *[]chunks.Meta) e d := decbuf{b: b} - k := int(d.uvarint()) + k := d.uvarint() for i := 0; i < k; i++ { lno := uint32(d.uvarint()) @@ -1061,7 +1063,7 @@ func (dec *Decoder) Series(b []byte, lbls *labels.Labels, chks *[]chunks.Meta) e } // Read the chunks meta data. - k = int(d.uvarint()) + k = d.uvarint() if k == 0 { return nil diff --git a/vendor/github.com/prometheus/tsdb/querier.go b/vendor/github.com/prometheus/tsdb/querier.go index d4dad9306ec..bddb81f8634 100644 --- a/vendor/github.com/prometheus/tsdb/querier.go +++ b/vendor/github.com/prometheus/tsdb/querier.go @@ -117,7 +117,7 @@ func (q *querier) Close() error { return merr.Err() } -// NewBlockQuerier returns a queries against the readers. +// NewBlockQuerier returns a querier against the reader. func NewBlockQuerier(b BlockReader, mint, maxt int64) (Querier, error) { indexr, err := b.Index() if err != nil { @@ -249,7 +249,7 @@ func tuplesByPrefix(m *labels.PrefixMatcher, ts StringTuples) ([]string, error) } func postingsForMatcher(ix IndexReader, m labels.Matcher) (index.Postings, error) { - // If the matcher selects an empty value, it selects all the series which dont + // If the matcher selects an empty value, it selects all the series which don't // have the label name set too. 
See: https://github.com/prometheus/prometheus/issues/3575 // and https://github.com/prometheus/prometheus/pull/3578#issuecomment-351653555 if m.Matches("") { @@ -478,7 +478,7 @@ type baseChunkSeries struct { // over them. It drops chunks based on tombstones in the given reader. func LookupChunkSeries(ir IndexReader, tr TombstoneReader, ms ...labels.Matcher) (ChunkSeriesSet, error) { if tr == nil { - tr = EmptyTombstoneReader() + tr = NewMemTombstones() } p, err := PostingsForMatchers(ir, ms...) if err != nil { @@ -892,30 +892,6 @@ func (it *deletedIterator) Err() error { return it.it.Err() } -type mockSeriesSet struct { - next func() bool - series func() Series - err func() error -} - -func (m *mockSeriesSet) Next() bool { return m.next() } -func (m *mockSeriesSet) At() Series { return m.series() } -func (m *mockSeriesSet) Err() error { return m.err() } - -func newListSeriesSet(list []Series) *mockSeriesSet { - i := -1 - return &mockSeriesSet{ - next: func() bool { - i++ - return i < len(list) - }, - series: func() Series { - return list[i] - }, - err: func() error { return nil }, - } -} - type errSeriesSet struct { err error } diff --git a/vendor/github.com/prometheus/tsdb/record.go b/vendor/github.com/prometheus/tsdb/record.go new file mode 100644 index 00000000000..c8cc7a50431 --- /dev/null +++ b/vendor/github.com/prometheus/tsdb/record.go @@ -0,0 +1,213 @@ +// Copyright 2018 The Prometheus Authors + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package tsdb + +import ( + "math" + "sort" + + "github.com/pkg/errors" + "github.com/prometheus/tsdb/labels" +) + +// RecordType represents the data type of a record. +type RecordType uint8 + +const ( + RecordInvalid RecordType = 255 + RecordSeries RecordType = 1 + RecordSamples RecordType = 2 + RecordTombstones RecordType = 3 +) + +type RecordLogger interface { + Log(recs ...[]byte) error +} + +type RecordReader interface { + Next() bool + Err() error + Record() []byte +} + +// RecordDecoder decodes series, sample, and tombstone records. +// The zero value is ready to use. +type RecordDecoder struct { +} + +// Type returns the type of the record. +// Return RecordInvalid if no valid record type is found. +func (d *RecordDecoder) Type(rec []byte) RecordType { + if len(rec) < 1 { + return RecordInvalid + } + switch t := RecordType(rec[0]); t { + case RecordSeries, RecordSamples, RecordTombstones: + return t + } + return RecordInvalid +} + +// Series appends series in rec to the given slice. +func (d *RecordDecoder) Series(rec []byte, series []RefSeries) ([]RefSeries, error) { + dec := decbuf{b: rec} + + if RecordType(dec.byte()) != RecordSeries { + return nil, errors.New("invalid record type") + } + for len(dec.b) > 0 && dec.err() == nil { + ref := dec.be64() + + lset := make(labels.Labels, dec.uvarint()) + + for i := range lset { + lset[i].Name = dec.uvarintStr() + lset[i].Value = dec.uvarintStr() + } + sort.Sort(lset) + + series = append(series, RefSeries{ + Ref: ref, + Labels: lset, + }) + } + if dec.err() != nil { + return nil, dec.err() + } + if len(dec.b) > 0 { + return nil, errors.Errorf("unexpected %d bytes left in entry", len(dec.b)) + } + return series, nil +} + +// Samples appends samples in rec to the given slice. 
+func (d *RecordDecoder) Samples(rec []byte, samples []RefSample) ([]RefSample, error) { + dec := decbuf{b: rec} + + if RecordType(dec.byte()) != RecordSamples { + return nil, errors.New("invalid record type") + } + if dec.len() == 0 { + return samples, nil + } + var ( + baseRef = dec.be64() + baseTime = dec.be64int64() + ) + for len(dec.b) > 0 && dec.err() == nil { + dref := dec.varint64() + dtime := dec.varint64() + val := dec.be64() + + samples = append(samples, RefSample{ + Ref: uint64(int64(baseRef) + dref), + T: baseTime + dtime, + V: math.Float64frombits(val), + }) + } + + if dec.err() != nil { + return nil, errors.Wrapf(dec.err(), "decode error after %d samples", len(samples)) + } + if len(dec.b) > 0 { + return nil, errors.Errorf("unexpected %d bytes left in entry", len(dec.b)) + } + return samples, nil +} + +// Tombstones appends tombstones in rec to the given slice. +func (d *RecordDecoder) Tombstones(rec []byte, tstones []Stone) ([]Stone, error) { + dec := decbuf{b: rec} + + if RecordType(dec.byte()) != RecordTombstones { + return nil, errors.New("invalid record type") + } + for dec.len() > 0 && dec.err() == nil { + tstones = append(tstones, Stone{ + ref: dec.be64(), + intervals: Intervals{ + {Mint: dec.varint64(), Maxt: dec.varint64()}, + }, + }) + } + if dec.err() != nil { + return nil, dec.err() + } + if len(dec.b) > 0 { + return nil, errors.Errorf("unexpected %d bytes left in entry", len(dec.b)) + } + return tstones, nil +} + +// RecordEncoder encodes series, sample, and tombstones records. +// The zero value is ready to use. +type RecordEncoder struct { +} + +// Series appends the encoded series to b and returns the resulting slice. 
+func (e *RecordEncoder) Series(series []RefSeries, b []byte) []byte { + buf := encbuf{b: b} + buf.putByte(byte(RecordSeries)) + + for _, s := range series { + buf.putBE64(s.Ref) + buf.putUvarint(len(s.Labels)) + + for _, l := range s.Labels { + buf.putUvarintStr(l.Name) + buf.putUvarintStr(l.Value) + } + } + return buf.get() +} + +// Samples appends the encoded samples to b and returns the resulting slice. +func (e *RecordEncoder) Samples(samples []RefSample, b []byte) []byte { + buf := encbuf{b: b} + buf.putByte(byte(RecordSamples)) + + if len(samples) == 0 { + return buf.get() + } + + // Store base timestamp and base reference number of first sample. + // All samples encode their timestamp and ref as delta to those. + first := samples[0] + + buf.putBE64(first.Ref) + buf.putBE64int64(first.T) + + for _, s := range samples { + buf.putVarint64(int64(s.Ref) - int64(first.Ref)) + buf.putVarint64(s.T - first.T) + buf.putBE64(math.Float64bits(s.V)) + } + return buf.get() +} + +// Tombstones appends the encoded tombstones to b and returns the resulting slice. 
+func (e *RecordEncoder) Tombstones(tstones []Stone, b []byte) []byte { + buf := encbuf{b: b} + buf.putByte(byte(RecordTombstones)) + + for _, s := range tstones { + for _, iv := range s.intervals { + buf.putBE64(s.ref) + buf.putVarint64(iv.Mint) + buf.putVarint64(iv.Maxt) + } + } + return buf.get() +} diff --git a/vendor/github.com/prometheus/tsdb/repair.go b/vendor/github.com/prometheus/tsdb/repair.go index 2722609a711..be8c1f3ed6e 100644 --- a/vendor/github.com/prometheus/tsdb/repair.go +++ b/vendor/github.com/prometheus/tsdb/repair.go @@ -5,12 +5,10 @@ import ( "io" "io/ioutil" "os" - "path" "path/filepath" "github.com/go-kit/kit/log" "github.com/go-kit/kit/log/level" - "github.com/oklog/ulid" "github.com/pkg/errors" "github.com/prometheus/tsdb/fileutil" ) @@ -20,20 +18,18 @@ import ( func repairBadIndexVersion(logger log.Logger, dir string) error { // All blocks written by Prometheus 2.1 with a meta.json version of 2 are affected. // We must actually set the index file version to 2 and revert the meta.json version back to 1. - subdirs, err := fileutil.ReadDir(dir) + dirs, err := blockDirs(dir) if err != nil { - return err + return errors.Wrapf(err, "list block dirs in %q", dir) } - for _, d := range subdirs { - // Skip non-block dirs. 
- if _, err := ulid.Parse(d); err != nil { - continue - } - d = path.Join(dir, d) + wrapErr := func(err error, d string) error { + return errors.Wrapf(err, "block dir: %q", d) + } + for _, d := range dirs { meta, err := readBogusMetaFile(d) if err != nil { - return err + return wrapErr(err, d) } if meta.Version == 1 { level.Info(logger).Log( @@ -53,35 +49,35 @@ func repairBadIndexVersion(logger log.Logger, dir string) error { repl, err := os.Create(filepath.Join(d, "index.repaired")) if err != nil { - return err + return wrapErr(err, d) } broken, err := os.Open(filepath.Join(d, "index")) if err != nil { - return err + return wrapErr(err, d) } if _, err := io.Copy(repl, broken); err != nil { - return err + return wrapErr(err, d) } // Set the 5th byte to 2 to indiciate the correct file format version. if _, err := repl.WriteAt([]byte{2}, 4); err != nil { - return err + return wrapErr(err, d) } if err := fileutil.Fsync(repl); err != nil { - return err + return wrapErr(err, d) } if err := repl.Close(); err != nil { - return err + return wrapErr(err, d) } if err := broken.Close(); err != nil { - return err + return wrapErr(err, d) } if err := renameFile(repl.Name(), broken.Name()); err != nil { - return err + return wrapErr(err, d) } // Reset version of meta.json to 1. meta.Version = 1 if err := writeMetaFile(d, meta); err != nil { - return err + return wrapErr(err, d) } } return nil diff --git a/vendor/github.com/prometheus/tsdb/tombstones.go b/vendor/github.com/prometheus/tsdb/tombstones.go index 8c760cdcec0..0626ac58e9f 100644 --- a/vendor/github.com/prometheus/tsdb/tombstones.go +++ b/vendor/github.com/prometheus/tsdb/tombstones.go @@ -20,6 +20,7 @@ import ( "io/ioutil" "os" "path/filepath" + "sync" "github.com/pkg/errors" ) @@ -41,6 +42,9 @@ type TombstoneReader interface { // Iter calls the given function for each encountered interval. Iter(func(uint64, Intervals) error) error + // Total returns the total count of tombstones. 
+ Total() uint64 + // Close any underlying resources Close() error } @@ -72,7 +76,7 @@ func writeTombstoneFile(dir string, tr TombstoneReader) error { mw := io.MultiWriter(f, hash) - tr.Iter(func(ref uint64, ivs Intervals) error { + if err := tr.Iter(func(ref uint64, ivs Intervals) error { for _, iv := range ivs { buf.reset() @@ -86,7 +90,9 @@ func writeTombstoneFile(dir string, tr TombstoneReader) error { } } return nil - }) + }); err != nil { + return fmt.Errorf("error writing tombstones: %v", err) + } _, err = f.Write(hash.Sum(nil)) if err != nil { @@ -107,10 +113,10 @@ type Stone struct { intervals Intervals } -func readTombstones(dir string) (memTombstones, error) { +func readTombstones(dir string) (*memTombstones, error) { b, err := ioutil.ReadFile(filepath.Join(dir, tombstoneFilename)) if os.IsNotExist(err) { - return memTombstones{}, nil + return NewMemTombstones(), nil } else if err != nil { return nil, err } @@ -140,7 +146,7 @@ func readTombstones(dir string) (memTombstones, error) { return nil, errors.New("checksum did not match") } - stonesMap := memTombstones{} + stonesMap := NewMemTombstones() for d.len() > 0 { k := d.uvarint64() @@ -150,27 +156,31 @@ func readTombstones(dir string) (memTombstones, error) { return nil, d.err() } - stonesMap.add(k, Interval{mint, maxt}) + stonesMap.addInterval(k, Interval{mint, maxt}) } return stonesMap, nil } -type memTombstones map[uint64]Intervals - -var emptyTombstoneReader = memTombstones{} +type memTombstones struct { + intvlGroups map[uint64]Intervals + mtx sync.RWMutex +} -// EmptyTombstoneReader returns a TombstoneReader that is always empty. 
-func EmptyTombstoneReader() TombstoneReader { - return emptyTombstoneReader +func NewMemTombstones() *memTombstones { + return &memTombstones{intvlGroups: make(map[uint64]Intervals)} } -func (t memTombstones) Get(ref uint64) (Intervals, error) { - return t[ref], nil +func (t *memTombstones) Get(ref uint64) (Intervals, error) { + t.mtx.RLock() + defer t.mtx.RUnlock() + return t.intvlGroups[ref], nil } -func (t memTombstones) Iter(f func(uint64, Intervals) error) error { - for ref, ivs := range t { +func (t *memTombstones) Iter(f func(uint64, Intervals) error) error { + t.mtx.RLock() + defer t.mtx.RUnlock() + for ref, ivs := range t.intvlGroups { if err := f(ref, ivs); err != nil { return err } @@ -178,8 +188,24 @@ func (t memTombstones) Iter(f func(uint64, Intervals) error) error { return nil } -func (t memTombstones) add(ref uint64, itv Interval) { - t[ref] = t[ref].add(itv) +func (t *memTombstones) Total() uint64 { + t.mtx.RLock() + defer t.mtx.RUnlock() + + total := uint64(0) + for _, ivs := range t.intvlGroups { + total += uint64(len(ivs)) + } + return total +} + +// addInterval to an existing memTombstones +func (t *memTombstones) addInterval(ref uint64, itvs ...Interval) { + t.mtx.Lock() + defer t.mtx.Unlock() + for _, itv := range itvs { + t.intvlGroups[ref] = t.intvlGroups[ref].add(itv) + } } func (memTombstones) Close() error { @@ -208,7 +234,7 @@ func (tr Interval) isSubrange(dranges Intervals) bool { // Intervals represents a set of increasing and non-overlapping time-intervals. type Intervals []Interval -// This adds the new time-range to the existing ones. +// add the new time-range to the existing ones. // The existing ones must be sorted. 
func (itvs Intervals) add(n Interval) Intervals { for i, r := range itvs { diff --git a/vendor/github.com/prometheus/tsdb/wal.go b/vendor/github.com/prometheus/tsdb/wal.go index 72e8bc070b0..28217639d0d 100644 --- a/vendor/github.com/prometheus/tsdb/wal.go +++ b/vendor/github.com/prometheus/tsdb/wal.go @@ -33,6 +33,7 @@ import ( "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/tsdb/fileutil" "github.com/prometheus/tsdb/labels" + "github.com/prometheus/tsdb/wal" ) // WALEntryType indicates what data a WAL entry contains. @@ -82,6 +83,8 @@ func newWalMetrics(wal *SegmentWAL, r prometheus.Registerer) *walMetrics { // WAL is a write ahead log that can log new series labels and samples. // It must be completely read before new entries are logged. +// +// DEPRECATED: use wal pkg combined with the record codex instead. type WAL interface { Reader() WALReader LogSeries([]RefSeries) error @@ -173,6 +176,8 @@ func newCRC32() hash.Hash32 { } // SegmentWAL is a write ahead log for series data. +// +// DEPRECATED: use wal pkg combined with the record coders instead. type SegmentWAL struct { mtx sync.Mutex metrics *walMetrics @@ -290,7 +295,7 @@ func (w *SegmentWAL) truncate(err error, file int, lastOffset int64) error { w.files = w.files[:file+1] // Seek the current file to the last valid offset where we continue writing from. - _, err = w.files[file].Seek(lastOffset, os.SEEK_SET) + _, err = w.files[file].Seek(lastOffset, io.SeekStart) return err } @@ -393,7 +398,7 @@ func (w *SegmentWAL) Truncate(mint int64, keep func(uint64) bool) error { return errors.Wrap(r.Err(), "read candidate WAL files") } - off, err := csf.Seek(0, os.SEEK_CUR) + off, err := csf.Seek(0, io.SeekCurrent) if err != nil { return err } @@ -418,7 +423,7 @@ func (w *SegmentWAL) Truncate(mint int64, keep func(uint64) bool) error { } // The file object of csf still holds the name before rename. Recreate it so - // subsequent truncations do not look at a non-existant file name. 
+ // subsequent truncations do not look at a non-existent file name. csf.File, err = w.openSegmentFile(candidates[0].Name()) if err != nil { return err @@ -583,7 +588,7 @@ func (w *SegmentWAL) cut() error { // in the new segment. go func() { w.actorc <- func() error { - off, err := hf.Seek(0, os.SEEK_CUR) + off, err := hf.Seek(0, io.SeekCurrent) if err != nil { return errors.Wrapf(err, "finish old segment %s", hf.Name()) } @@ -718,6 +723,13 @@ func (w *SegmentWAL) run(interval time.Duration) { // Close syncs all data and closes the underlying resources. func (w *SegmentWAL) Close() error { + // Make sure you can call Close() multiple times. + select { + case <-w.stopc: + return nil // Already closed. + default: + } + close(w.stopc) <-w.donec @@ -730,21 +742,14 @@ func (w *SegmentWAL) Close() error { // On opening, a WAL must be fully consumed once. Afterwards // only the current segment will still be open. if hf := w.head(); hf != nil { - return errors.Wrapf(hf.Close(), "closing WAL head %s", hf.Name()) + if err := hf.Close(); err != nil { + return errors.Wrapf(err, "closing WAL head %s", hf.Name()) + } } - return w.dirFile.Close() + return errors.Wrapf(w.dirFile.Close(), "closing WAL dir %s", w.dirFile.Name()) } -const ( - minSectorSize = 512 - - // walPageBytes is the alignment for flushing records to the backing Writer. - // It should be a multiple of the minimum sector size so that WAL can safely - // distinguish between torn writes and ordinary data corruption. - walPageBytes = 16 * minSectorSize -) - func (w *SegmentWAL) write(t WALEntryType, flag uint8, buf []byte) error { // Cut to the next segment if the entry exceeds the file size unless it would also // exceed the size of a new segment. 
@@ -937,7 +942,7 @@ func (r *walReader) Read( series = v.([]RefSeries) } - err := r.decodeSeries(flag, b, &series) + err = r.decodeSeries(flag, b, &series) if err != nil { err = errors.Wrap(err, "decode series entry") break @@ -958,7 +963,7 @@ func (r *walReader) Read( samples = v.([]RefSample) } - err := r.decodeSamples(flag, b, &samples) + err = r.decodeSamples(flag, b, &samples) if err != nil { err = errors.Wrap(err, "decode samples entry") break @@ -980,7 +985,7 @@ func (r *walReader) Read( deletes = v.([]Stone) } - err := r.decodeDeletes(flag, b, &deletes) + err = r.decodeDeletes(flag, b, &deletes) if err != nil { err = errors.Wrap(err, "decode delete entry") break @@ -1015,7 +1020,7 @@ func (r *walReader) at() (WALEntryType, byte, []byte) { } // next returns decodes the next entry pair and returns true -// if it was succesful. +// if it was successful. func (r *walReader) next() bool { if r.cur >= len(r.files) { return false @@ -1024,7 +1029,7 @@ func (r *walReader) next() bool { // Remember the offset after the last correctly read entry. If the next one // is corrupted, this is where we can safely truncate. - r.lastOffset, r.err = cf.Seek(0, os.SEEK_CUR) + r.lastOffset, r.err = cf.Seek(0, io.SeekCurrent) if r.err != nil { return false } @@ -1206,3 +1211,113 @@ func (r *walReader) decodeDeletes(flag byte, b []byte, res *[]Stone) error { } return nil } + +func deprecatedWALExists(logger log.Logger, dir string) (bool, error) { + // Detect whether we still have the old WAL. + fns, err := sequenceFiles(dir) + if err != nil && !os.IsNotExist(err) { + return false, errors.Wrap(err, "list sequence files") + } + if len(fns) == 0 { + return false, nil // No WAL at all yet. + } + // Check header of first segment to see whether we are still dealing with an + // old WAL. 
+ f, err := os.Open(fns[0]) + if err != nil { + return false, errors.Wrap(err, "check first existing segment") + } + defer f.Close() + + var hdr [4]byte + if _, err := f.Read(hdr[:]); err != nil && err != io.EOF { + return false, errors.Wrap(err, "read header from first segment") + } + // If we cannot read the magic header for segments of the old WAL, abort. + // Either it's migrated already or there's a corruption issue with which + // we cannot deal here anyway. Subsequent attempts to open the WAL will error in that case. + if binary.BigEndian.Uint32(hdr[:]) != WALMagic { + return false, nil + } + return true, nil +} + +// MigrateWAL rewrites the deprecated write ahead log into the new format. +func MigrateWAL(logger log.Logger, dir string) (err error) { + if logger == nil { + logger = log.NewNopLogger() + } + if exists, err := deprecatedWALExists(logger, dir); err != nil || !exists { + return err + } + level.Info(logger).Log("msg", "migrating WAL format") + + tmpdir := dir + ".tmp" + if err := os.RemoveAll(tmpdir); err != nil { + return errors.Wrap(err, "cleanup replacement dir") + } + repl, err := wal.New(logger, nil, tmpdir) + if err != nil { + return errors.Wrap(err, "open new WAL") + } + + // It should've already been closed as part of the previous finalization. + // Do it once again in case of prior errors. 
+ defer func() { + if err != nil { + repl.Close() + } + }() + + w, err := OpenSegmentWAL(dir, logger, time.Minute, nil) + if err != nil { + return errors.Wrap(err, "open old WAL") + } + defer w.Close() + + rdr := w.Reader() + + var ( + enc RecordEncoder + b []byte + ) + decErr := rdr.Read( + func(s []RefSeries) { + if err != nil { + return + } + err = repl.Log(enc.Series(s, b[:0])) + }, + func(s []RefSample) { + if err != nil { + return + } + err = repl.Log(enc.Samples(s, b[:0])) + }, + func(s []Stone) { + if err != nil { + return + } + err = repl.Log(enc.Tombstones(s, b[:0])) + }, + ) + if decErr != nil { + return errors.Wrap(err, "decode old entries") + } + if err != nil { + return errors.Wrap(err, "write new entries") + } + // We explicitly close even when there is a defer for Windows to be + // able to delete it. The defer is in place to close it in-case there + // are errors above. + if err := w.Close(); err != nil { + return errors.Wrap(err, "close old WAL") + } + if err := repl.Close(); err != nil { + return errors.Wrap(err, "close new WAL") + } + if err := fileutil.Replace(tmpdir, dir); err != nil { + return errors.Wrap(err, "replace old WAL") + } + return nil +} diff --git a/vendor/github.com/prometheus/tsdb/wal/wal.go b/vendor/github.com/prometheus/tsdb/wal/wal.go new file mode 100644 index 00000000000..20ed69e7d9a --- /dev/null +++ b/vendor/github.com/prometheus/tsdb/wal/wal.go @@ -0,0 +1,856 @@ +// Copyright 2017 The Prometheus Authors + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package wal + +import ( + "bufio" + "encoding/binary" + "fmt" + "hash/crc32" + "io" + "math" + "os" + "path/filepath" + "sort" + "strconv" + "sync" + "time" + + "github.com/go-kit/kit/log" + "github.com/go-kit/kit/log/level" + "github.com/pkg/errors" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/tsdb/fileutil" +) + +const ( + defaultSegmentSize = 128 * 1024 * 1024 // 128 MB + pageSize = 32 * 1024 // 32KB + recordHeaderSize = 7 +) + +// The table gets initialized with sync.Once but may still cause a race +// with any other use of the crc32 package anywhere. Thus we initialize it +// before. +var castagnoliTable = crc32.MakeTable(crc32.Castagnoli) + +type page struct { + alloc int + flushed int + buf [pageSize]byte +} + +func (p *page) remaining() int { + return pageSize - p.alloc +} + +func (p *page) full() bool { + return pageSize-p.alloc < recordHeaderSize +} + +// Segment represents a segment file. +type Segment struct { + *os.File + dir string + i int +} + +// Index returns the index of the segment. +func (s *Segment) Index() int { + return s.i +} + +// Dir returns the directory of the segment. +func (s *Segment) Dir() string { + return s.dir +} + +// CorruptionErr is an error that's returned when corruption is encountered. +type CorruptionErr struct { + Segment int + Offset int64 + Err error +} + +func (e *CorruptionErr) Error() string { + if e.Segment < 0 { + return fmt.Sprintf("corruption after %d bytes: %s", e.Offset, e.Err) + } + return fmt.Sprintf("corruption in segment %d at %d: %s", e.Segment, e.Offset, e.Err) +} + +// OpenWriteSegment opens segment k in dir. The returned segment is ready for new appends. 
+func OpenWriteSegment(dir string, k int) (*Segment, error) { + f, err := os.OpenFile(SegmentName(dir, k), os.O_WRONLY|os.O_APPEND, 0666) + if err != nil { + return nil, err + } + stat, err := f.Stat() + if err != nil { + f.Close() + return nil, err + } + // If the last page is torn, fill it with zeros. + // In case it was torn after all records were written successfully, this + // will just pad the page and everything will be fine. + // If it was torn mid-record, a full read (which the caller should do anyway + // to ensure integrity) will detect it as a corruption by the end. + if d := stat.Size() % pageSize; d != 0 { + if _, err := f.Write(make([]byte, pageSize-d)); err != nil { + f.Close() + return nil, errors.Wrap(err, "zero-pad torn page") + } + } + return &Segment{File: f, i: k, dir: dir}, nil +} + +// CreateSegment creates a new segment k in dir. +func CreateSegment(dir string, k int) (*Segment, error) { + f, err := os.OpenFile(SegmentName(dir, k), os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0666) + if err != nil { + return nil, err + } + return &Segment{File: f, i: k, dir: dir}, nil +} + +// OpenReadSegment opens the segment with the given filename. +func OpenReadSegment(fn string) (*Segment, error) { + k, err := strconv.Atoi(filepath.Base(fn)) + if err != nil { + return nil, errors.New("not a valid filename") + } + f, err := os.Open(fn) + if err != nil { + return nil, err + } + return &Segment{File: f, i: k, dir: filepath.Dir(fn)}, nil +} + +// WAL is a write ahead log that stores records in segment files. +// It must be read from start to end once before logging new data. +// If an error occurs during read, the repair procedure must be called +// before it's safe to do further writes. +// +// Segments are written to in pages of 32KB, with records possibly split +// across page boundaries. +// Records are never split across segments to allow full segments to be +// safely truncated. 
It also ensures that torn writes never corrupt records +// beyond the most recent segment. +type WAL struct { + dir string + logger log.Logger + segmentSize int + mtx sync.RWMutex + segment *Segment // active segment + donePages int // pages written to the segment + page *page // active page + stopc chan chan struct{} + actorc chan func() + + fsyncDuration prometheus.Summary + pageFlushes prometheus.Counter + pageCompletions prometheus.Counter + truncateFail prometheus.Counter + truncateTotal prometheus.Counter +} + +// New returns a new WAL over the given directory. +func New(logger log.Logger, reg prometheus.Registerer, dir string) (*WAL, error) { + return NewSize(logger, reg, dir, defaultSegmentSize) +} + +// NewSize returns a new WAL over the given directory. +// New segments are created with the specified size. +func NewSize(logger log.Logger, reg prometheus.Registerer, dir string, segmentSize int) (*WAL, error) { + if segmentSize%pageSize != 0 { + return nil, errors.New("invalid segment size") + } + if err := os.MkdirAll(dir, 0777); err != nil { + return nil, errors.Wrap(err, "create dir") + } + if logger == nil { + logger = log.NewNopLogger() + } + w := &WAL{ + dir: dir, + logger: logger, + segmentSize: segmentSize, + page: &page{}, + actorc: make(chan func(), 100), + stopc: make(chan chan struct{}), + } + w.fsyncDuration = prometheus.NewSummary(prometheus.SummaryOpts{ + Name: "prometheus_tsdb_wal_fsync_duration_seconds", + Help: "Duration of WAL fsync.", + }) + w.pageFlushes = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "prometheus_tsdb_wal_page_flushes_total", + Help: "Total number of page flushes.", + }) + w.pageCompletions = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "prometheus_tsdb_wal_completed_pages_total", + Help: "Total number of completed pages.", + }) + w.truncateFail = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "prometheus_tsdb_wal_truncations_failed_total", + Help: "Total number of WAL truncations that failed.", 
+ }) + w.truncateTotal = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "prometheus_tsdb_wal_truncations_total", + Help: "Total number of WAL truncations attempted.", + }) + if reg != nil { + reg.MustRegister(w.fsyncDuration, w.pageFlushes, w.pageCompletions, w.truncateFail, w.truncateTotal) + } + + _, j, err := w.Segments() + if err != nil { + return nil, errors.Wrap(err, "get segment range") + } + // Fresh dir, no segments yet. + if j == -1 { + if w.segment, err = CreateSegment(w.dir, 0); err != nil { + return nil, err + } + } else { + if w.segment, err = OpenWriteSegment(w.dir, j); err != nil { + return nil, err + } + // Correctly initialize donePages. + stat, err := w.segment.Stat() + if err != nil { + return nil, err + } + w.donePages = int(stat.Size() / pageSize) + } + go w.run() + + return w, nil +} + +// Dir returns the directory of the WAL. +func (w *WAL) Dir() string { + return w.dir +} + +func (w *WAL) run() { +Loop: + for { + select { + case f := <-w.actorc: + f() + case donec := <-w.stopc: + close(w.actorc) + defer close(donec) + break Loop + } + } + // Drain and process any remaining functions. + for f := range w.actorc { + f() + } +} + +// Repair attempts to repair the WAL based on the error. +// It discards all data after the corruption. +func (w *WAL) Repair(origErr error) error { + // We could probably have a mode that only discards torn records right around + // the corruption to preserve as data much as possible. + // But that's not generally applicable if the records have any kind of causality. + // Maybe as an extra mode in the future if mid-WAL corruptions become + // a frequent concern. + err := errors.Cause(origErr) // So that we can pick up errors even if wrapped. 
+ + cerr, ok := err.(*CorruptionErr) + if !ok { + return errors.Wrap(origErr, "cannot handle error") + } + if cerr.Segment < 0 { + return errors.New("corruption error does not specify position") + } + + level.Warn(w.logger).Log("msg", "starting corruption repair", + "segment", cerr.Segment, "offset", cerr.Offset) + + // All segments behind the corruption can no longer be used. + segs, err := listSegments(w.dir) + if err != nil { + return errors.Wrap(err, "list segments") + } + level.Warn(w.logger).Log("msg", "deleting all segments behind corruption", "segment", cerr.Segment) + + for _, s := range segs { + if s.index <= cerr.Segment { + continue + } + if w.segment.i == s.index { + // The active segment needs to be removed, + // close it first (Windows!). Can be closed safely + // as we set the current segment to repaired file + // below. + if err := w.segment.Close(); err != nil { + return errors.Wrap(err, "close active segment") + } + } + if err := os.Remove(filepath.Join(w.dir, s.name)); err != nil { + return errors.Wrapf(err, "delete segment:%v", s.index) + } + } + // Regardless of the corruption offset, no record reaches into the previous segment. + // So we can safely repair the WAL by removing the segment and re-inserting all + // its records up to the corruption. + level.Warn(w.logger).Log("msg", "rewrite corrupted segment", "segment", cerr.Segment) + + fn := SegmentName(w.dir, cerr.Segment) + tmpfn := fn + ".repair" + + if err := fileutil.Rename(fn, tmpfn); err != nil { + return err + } + // Create a clean segment and make it the active one. 
+ s, err := CreateSegment(w.dir, cerr.Segment) + if err != nil { + return err + } + w.segment = s + + f, err := os.Open(tmpfn) + if err != nil { + return errors.Wrap(err, "open segment") + } + defer f.Close() + + r := NewReader(bufio.NewReader(f)) + + for r.Next() { + if err := w.Log(r.Record()); err != nil { + return errors.Wrap(err, "insert record") + } + } + // We expect an error here from r.Err(), so nothing to handle. + + // We explicitly close even when there is a defer for Windows to be + // able to delete it. The defer is in place to close it in-case there + // are errors above. + if err := f.Close(); err != nil { + return errors.Wrap(err, "close corrupted file") + } + if err := os.Remove(tmpfn); err != nil { + return errors.Wrap(err, "delete corrupted segment") + } + return nil +} + +// SegmentName builds a segment name for the directory. +func SegmentName(dir string, i int) string { + return filepath.Join(dir, fmt.Sprintf("%08d", i)) +} + +// nextSegment creates the next segment and closes the previous one. +func (w *WAL) nextSegment() error { + // Only flush the current page if it actually holds data. + if w.page.alloc > 0 { + if err := w.flushPage(true); err != nil { + return err + } + } + next, err := CreateSegment(w.dir, w.segment.Index()+1) + if err != nil { + return errors.Wrap(err, "create new segment file") + } + prev := w.segment + w.segment = next + w.donePages = 0 + + // Don't block further writes by fsyncing the last segment. + w.actorc <- func() { + if err := w.fsync(prev); err != nil { + level.Error(w.logger).Log("msg", "sync previous segment", "err", err) + } + if err := prev.Close(); err != nil { + level.Error(w.logger).Log("msg", "close previous segment", "err", err) + } + } + return nil +} + +// flushPage writes the new contents of the page to disk. If no more records will fit into +// the page, the remaining bytes will be set to zero and a new page will be started. 
+// If clear is true, this is enforced regardless of how many bytes are left in the page. +func (w *WAL) flushPage(clear bool) error { + w.pageFlushes.Inc() + + p := w.page + clear = clear || p.full() + + // No more data will fit into the page. Enqueue and clear it. + if clear { + p.alloc = pageSize // write till end of page + w.pageCompletions.Inc() + } + n, err := w.segment.Write(p.buf[p.flushed:p.alloc]) + if err != nil { + return err + } + p.flushed += n + + // We flushed an entire page, prepare a new one. + if clear { + for i := range p.buf { + p.buf[i] = 0 + } + p.alloc = 0 + p.flushed = 0 + w.donePages++ + } + return nil +} + +type recType uint8 + +const ( + recPageTerm recType = 0 // Rest of page is empty. + recFull recType = 1 // Full record. + recFirst recType = 2 // First fragment of a record. + recMiddle recType = 3 // Middle fragments of a record. + recLast recType = 4 // Final fragment of a record. +) + +func (t recType) String() string { + switch t { + case recPageTerm: + return "zero" + case recFull: + return "full" + case recFirst: + return "first" + case recMiddle: + return "middle" + case recLast: + return "last" + default: + return "" + } +} + +func (w *WAL) pagesPerSegment() int { + return w.segmentSize / pageSize +} + +// Log writes the records into the log. +// Multiple records can be passed at once to reduce writes and increase throughput. +func (w *WAL) Log(recs ...[]byte) error { + w.mtx.Lock() + defer w.mtx.Unlock() + // Callers could just implement their own list record format but adding + // a bit of extra logic here frees them from that overhead. + for i, r := range recs { + if err := w.log(r, i == len(recs)-1); err != nil { + return err + } + } + return nil +} + +// log writes rec to the log and forces a flush of the current page if its +// the final record of a batch. 
+func (w *WAL) log(rec []byte, final bool) error { + // If the record is too big to fit within pages in the current + // segment, terminate the active segment and advance to the next one. + // This ensures that records do not cross segment boundaries. + left := w.page.remaining() - recordHeaderSize // Active pages. + left += (pageSize - recordHeaderSize) * (w.pagesPerSegment() - w.donePages - 1) // Free pages. + + if len(rec) > left { + if err := w.nextSegment(); err != nil { + return err + } + } + + // Populate as many pages as necessary to fit the record. + // Be careful to always do one pass to ensure we write zero-length records. + for i := 0; i == 0 || len(rec) > 0; i++ { + p := w.page + + // Find how much of the record we can fit into the page. + var ( + l = min(len(rec), (pageSize-p.alloc)-recordHeaderSize) + part = rec[:l] + buf = p.buf[p.alloc:] + typ recType + ) + + switch { + case i == 0 && len(part) == len(rec): + typ = recFull + case len(part) == len(rec): + typ = recLast + case i == 0: + typ = recFirst + default: + typ = recMiddle + } + + buf[0] = byte(typ) + crc := crc32.Checksum(part, castagnoliTable) + binary.BigEndian.PutUint16(buf[1:], uint16(len(part))) + binary.BigEndian.PutUint32(buf[3:], crc) + + copy(buf[recordHeaderSize:], part) + p.alloc += len(part) + recordHeaderSize + + // If we wrote a full record, we can fit more records of the batch + // into the page before flushing it. + if final || typ != recFull || w.page.full() { + if err := w.flushPage(false); err != nil { + return err + } + } + rec = rec[l:] + } + return nil +} + +// Segments returns the range [first, n] of currently existing segments. +// If no segments are found, first and n are -1. +func (w *WAL) Segments() (first, last int, err error) { + refs, err := listSegments(w.dir) + if err != nil { + return 0, 0, err + } + if len(refs) == 0 { + return -1, -1, nil + } + return refs[0].index, refs[len(refs)-1].index, nil +} + +// Truncate drops all segments before i. 
+func (w *WAL) Truncate(i int) (err error) { + w.truncateTotal.Inc() + defer func() { + if err != nil { + w.truncateFail.Inc() + } + }() + refs, err := listSegments(w.dir) + if err != nil { + return err + } + for _, r := range refs { + if r.index >= i { + break + } + if err = os.Remove(filepath.Join(w.dir, r.name)); err != nil { + return err + } + } + return nil +} + +func (w *WAL) fsync(f *Segment) error { + start := time.Now() + err := fileutil.Fsync(f.File) + w.fsyncDuration.Observe(time.Since(start).Seconds()) + return err +} + +// Close flushes all writes and closes active segment. +func (w *WAL) Close() (err error) { + w.mtx.Lock() + defer w.mtx.Unlock() + + // Flush the last page and zero out all its remaining size. + // We must not flush an empty page as it would falsely signal + // the segment is done if we start writing to it again after opening. + if w.page.alloc > 0 { + if err := w.flushPage(true); err != nil { + return err + } + } + + donec := make(chan struct{}) + w.stopc <- donec + <-donec + + if err = w.fsync(w.segment); err != nil { + level.Error(w.logger).Log("msg", "sync previous segment", "err", err) + } + if err := w.segment.Close(); err != nil { + level.Error(w.logger).Log("msg", "close previous segment", "err", err) + } + + return nil +} + +type segmentRef struct { + name string + index int +} + +func listSegments(dir string) (refs []segmentRef, err error) { + files, err := fileutil.ReadDir(dir) + if err != nil { + return nil, err + } + var last int + for _, fn := range files { + k, err := strconv.Atoi(fn) + if err != nil { + continue + } + if len(refs) > 0 && k > last+1 { + return nil, errors.New("segments are not sequential") + } + refs = append(refs, segmentRef{name: fn, index: k}) + last = k + } + sort.Slice(refs, func(i, j int) bool { + return refs[i].index < refs[j].index + }) + return refs, nil +} + +// NewSegmentsReader returns a new reader over all segments in the directory. 
+func NewSegmentsReader(dir string) (io.ReadCloser, error) { + return NewSegmentsRangeReader(dir, 0, math.MaxInt32) +} + +// NewSegmentsRangeReader returns a new reader over the given WAL segment range. +// If first or last are -1, the range is open on the respective end. +func NewSegmentsRangeReader(dir string, first, last int) (io.ReadCloser, error) { + refs, err := listSegments(dir) + if err != nil { + return nil, err + } + var segs []*Segment + + for _, r := range refs { + if first >= 0 && r.index < first { + continue + } + if last >= 0 && r.index > last { + break + } + s, err := OpenReadSegment(filepath.Join(dir, r.name)) + if err != nil { + return nil, err + } + segs = append(segs, s) + } + return newSegmentBufReader(segs...), nil +} + +// segmentBufReader is a buffered reader that reads in multiples of pages. +// The main purpose is that we are able to track segment and offset for +// corruption reporting. +type segmentBufReader struct { + buf *bufio.Reader + segs []*Segment + cur int + off int + more bool +} + +func newSegmentBufReader(segs ...*Segment) *segmentBufReader { + return &segmentBufReader{ + buf: bufio.NewReaderSize(nil, 16*pageSize), + segs: segs, + cur: -1, + } +} + +func (r *segmentBufReader) Close() (err error) { + for _, s := range r.segs { + if e := s.Close(); e != nil { + err = e + } + } + return err +} + +func (r *segmentBufReader) Read(b []byte) (n int, err error) { + if !r.more { + if r.cur+1 >= len(r.segs) { + return 0, io.EOF + } + r.cur++ + r.off = 0 + r.more = true + r.buf.Reset(r.segs[r.cur]) + } + n, err = r.buf.Read(b) + r.off += n + if err != io.EOF { + return n, err + } + // Just return what we read so far, but don't signal EOF. + // Only unset more so we don't invalidate the current segment and + // offset before the next read. + r.more = false + return n, nil +} + +// Reader reads WAL records from an io.Reader. 
+type Reader struct { + rdr io.Reader + err error + rec []byte + buf [pageSize]byte + total int64 // total bytes processed. +} + +// NewReader returns a new reader. +func NewReader(r io.Reader) *Reader { + return &Reader{rdr: r} +} + +// Next advances the reader to the next records and returns true if it exists. +// It must not be called again after it returned false. +func (r *Reader) Next() bool { + err := r.next() + if errors.Cause(err) == io.EOF { + return false + } + r.err = err + return r.err == nil +} + +func (r *Reader) next() (err error) { + // We have to use r.buf since allocating byte arrays here fails escape + // analysis and ends up on the heap, even though it seemingly should not. + hdr := r.buf[:recordHeaderSize] + buf := r.buf[recordHeaderSize:] + + r.rec = r.rec[:0] + + i := 0 + for { + if _, err = io.ReadFull(r.rdr, hdr[:1]); err != nil { + return errors.Wrap(err, "read first header byte") + } + r.total++ + typ := recType(hdr[0]) + + // Gobble up zero bytes. + if typ == recPageTerm { + // We are pedantic and check whether the zeros are actually up + // to a page boundary. + // It's not strictly necessary but may catch sketchy state early. + k := pageSize - (r.total % pageSize) + if k == pageSize { + continue // Initial 0 byte was last page byte. 
+ } + n, err := io.ReadFull(r.rdr, buf[:k]) + if err != nil { + return errors.Wrap(err, "read remaining zeros") + } + r.total += int64(n) + + for _, c := range buf[:k] { + if c != 0 { + return errors.New("unexpected non-zero byte in padded page") + } + } + continue + } + n, err := io.ReadFull(r.rdr, hdr[1:]) + if err != nil { + return errors.Wrap(err, "read remaining header") + } + r.total += int64(n) + + var ( + length = binary.BigEndian.Uint16(hdr[1:]) + crc = binary.BigEndian.Uint32(hdr[3:]) + ) + + if length > pageSize-recordHeaderSize { + return errors.Errorf("invalid record size %d", length) + } + n, err = io.ReadFull(r.rdr, buf[:length]) + if err != nil { + return err + } + r.total += int64(n) + + if n != int(length) { + return errors.Errorf("invalid size: expected %d, got %d", length, n) + } + if c := crc32.Checksum(buf[:length], castagnoliTable); c != crc { + return errors.Errorf("unexpected checksum %x, expected %x", c, crc) + } + r.rec = append(r.rec, buf[:length]...) + + switch typ { + case recFull: + if i != 0 { + return errors.New("unexpected full record") + } + return nil + case recFirst: + if i != 0 { + return errors.New("unexpected first record") + } + case recMiddle: + if i == 0 { + return errors.New("unexpected middle record") + } + case recLast: + if i == 0 { + return errors.New("unexpected last record") + } + return nil + default: + return errors.Errorf("unexpected record type %d", typ) + } + // Only increment i for non-zero records since we use it + // to determine valid content record sequences. + i++ + } +} + +// Err returns the last encountered error wrapped in a corruption error. +// If the reader does not allow to infer a segment index and offset, a total +// offset in the reader stream will be provided. 
+func (r *Reader) Err() error { + if r.err == nil { + return nil + } + if b, ok := r.rdr.(*segmentBufReader); ok { + return &CorruptionErr{ + Err: r.err, + Segment: b.segs[b.cur].Index(), + Offset: int64(b.off), + } + } + return &CorruptionErr{ + Err: r.err, + Segment: -1, + Offset: r.total, + } +} + +// Record returns the current record. The returned byte slice is only +// valid until the next call to Next. +func (r *Reader) Record() []byte { + return r.rec +} + +func min(i, j int) int { + if i < j { + return i + } + return j +} From 831080f73a8c6d39ad63da98d0da281ed51ba384 Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Tue, 30 Oct 2018 17:22:16 -0400 Subject: [PATCH 02/56] Use TSDB's WAL for writes. Signed-off-by: Tom Wilkie --- .gitignore | 1 + Makefile | 4 +- pkg/ingester/ingester.go | 38 +++- pkg/ingester/ingester_test.go | 22 +-- pkg/ingester/series.go | 2 + pkg/ingester/transfer.go | 6 +- pkg/ingester/user_state.go | 102 ++++++----- pkg/ingester/wal.go | 321 ++++++++++++++++++++++++++++++++++ pkg/ingester/wal.proto | 32 ++++ pkg/ingester/wal_test.go | 37 ++++ 10 files changed, 507 insertions(+), 58 deletions(-) create mode 100644 pkg/ingester/wal.go create mode 100644 pkg/ingester/wal.proto create mode 100644 pkg/ingester/wal_test.go diff --git a/.gitignore b/.gitignore index b431dc49786..9ed9bd38641 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,7 @@ cmd/test-exporter/test-exporter .pkg .cache pkg/ingester/client/cortex.pb.go +pkg/ingester/wal.pb.go pkg/querier/frontend/frontend.pb.go pkg/ring/ring.pb.go pkg/chunk/storage/caching_storage_client.pb.go diff --git a/Makefile b/Makefile index ec0cbd77cc3..6c18d15dde2 100644 --- a/Makefile +++ b/Makefile @@ -47,7 +47,9 @@ $(foreach exe, $(EXES), $(eval $(call dep_exe, $(exe)))) # Manually declared dependancies And what goes into each exe pkg/ingester/client/cortex.pb.go: pkg/ingester/client/cortex.proto +pkg/ingester/wal.pb.go: pkg/ingester/wal.proto pkg/ring/ring.pb.go: pkg/ring/ring.proto 
+pkg/querier/frontend/frontend.pb.go: pkg/querier/frontend/frontend.proto all: $(UPTODATE_FILES) test: $(PROTO_GOS) protos: $(PROTO_GOS) @@ -113,7 +115,7 @@ $(EXES): $(NETGO_CHECK) %.pb.go: - protoc -I ./vendor:./$(@D) --gogoslick_out=plugins=grpc:./$(@D) ./$(patsubst %.pb.go,%.proto,$@) + protoc -I $(GOPATH)/src:./vendor:./$(@D) --gogoslick_out=plugins=grpc:./$(@D) ./$(patsubst %.pb.go,%.proto,$@) lint: ./tools/lint -notestpackage -ignorespelling queriers -ignorespelling Queriers . diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index 00548133180..647319a519f 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -75,6 +75,7 @@ var ( // Config for an Ingester. type Config struct { LifecyclerConfig ring.LifecyclerConfig + WALConfig WALConfig // Config for transferring chunks. SearchPendingFor time.Duration @@ -98,6 +99,7 @@ type Config struct { // RegisterFlags adds the flags required to config this to the given FlagSet func (cfg *Config) RegisterFlags(f *flag.FlagSet) { cfg.LifecyclerConfig.RegisterFlags(f) + cfg.WALConfig.RegisterFlags(f) f.DurationVar(&cfg.SearchPendingFor, "ingester.search-pending-for", 30*time.Second, "Time to spend searching for a pending ingester when shutting down.") f.DurationVar(&cfg.FlushCheckPeriod, "ingester.flush-period", 1*time.Minute, "Period with which to attempt to flush chunks.") @@ -141,6 +143,8 @@ type Ingester struct { flushQueues []*util.PriorityQueue flushQueuesDone sync.WaitGroup + wal WAL + // Hook for injecting behaviour from tests. 
preFlushUserSeries func() } @@ -173,6 +177,11 @@ func New(cfg Config, clientConfig client.Config, limits *validation.Overrides, c } var err error + i.wal, err = newWAL(cfg.WALConfig, i) + if err != nil { + return nil, err + } + i.lifecycler, err = ring.NewLifecycler(cfg.LifecyclerConfig, i) if err != nil { return nil, err @@ -221,6 +230,8 @@ func (i *Ingester) Shutdown() { close(i.quit) i.done.Wait() + i.wal.Stop() + // Next initiate our graceful exit from the ring. i.lifecycler.Shutdown() } @@ -234,11 +245,20 @@ func (i *Ingester) StopIncomingRequests() { // Push implements client.IngesterServer func (i *Ingester) Push(ctx old_ctx.Context, req *client.WriteRequest) (*client.WriteResponse, error) { - var lastPartialErr error + userID, err := user.ExtractOrgID(ctx) + if err != nil { + return nil, err + } + + record := Record{ + UserId: userID, + Samples: make([]Sample, 0, len(req.Timeseries)), + } + var lastPartialErr error for _, ts := range req.Timeseries { for _, s := range ts.Samples { - err := i.append(ctx, ts.Labels, model.Time(s.TimestampMs), model.SampleValue(s.Value), req.Source) + err := i.append(ctx, ts.Labels, model.Time(s.TimestampMs), model.SampleValue(s.Value), req.Source, &record) if err == nil { continue } @@ -256,10 +276,14 @@ func (i *Ingester) Push(ctx old_ctx.Context, req *client.WriteRequest) (*client. 
} } + if err := i.wal.Log(&record); err != nil { + return nil, err + } + return &client.WriteResponse{}, lastPartialErr } -func (i *Ingester) append(ctx context.Context, labels labelPairs, timestamp model.Time, value model.SampleValue, source client.WriteRequest_SourceEnum) error { +func (i *Ingester) append(ctx context.Context, labels labelPairs, timestamp model.Time, value model.SampleValue, source client.WriteRequest_SourceEnum, record *Record) error { labels.removeBlanks() i.stopLock.RLock() @@ -270,7 +294,7 @@ func (i *Ingester) append(ctx context.Context, labels labelPairs, timestamp mode i.userStatesMtx.RLock() defer i.userStatesMtx.RUnlock() - state, fp, series, err := i.userStates.getOrCreateSeries(ctx, labels) + state, fp, series, err := i.userStates.getOrCreateSeries(ctx, labels, record) if err != nil { return err } @@ -291,6 +315,12 @@ func (i *Ingester) append(ctx context.Context, labels labelPairs, timestamp mode return err } + record.Samples = append(record.Samples, Sample{ + Fingerprint: int64(fp), + Timestamp: int64(timestamp), + Value: float64(value), + }) + memoryChunks.Add(float64(len(series.chunkDescs) - prevNumChunks)) ingestedSamples.Inc() switch source { diff --git a/pkg/ingester/ingester_test.go b/pkg/ingester/ingester_test.go index a9c0012162e..5a46ad14698 100644 --- a/pkg/ingester/ingester_test.go +++ b/pkg/ingester/ingester_test.go @@ -159,11 +159,7 @@ func pushTestSamples(t *testing.T, ing *Ingester, numSeries, samplesPerSeries in return userIDs, testData } -func TestIngesterAppend(t *testing.T) { - store, ing := newDefaultTestStore(t) - - userIDs, testData := pushTestSamples(t, ing, 10, 1000) - +func retrieveTestSamples(t *testing.T, ing *Ingester, userIDs []string, testData map[string]model.Matrix) { // Read samples back via ingester queries. 
for _, userID := range userIDs { ctx := user.InjectOrgID(context.Background(), userID) @@ -181,6 +177,12 @@ func TestIngesterAppend(t *testing.T) { require.NoError(t, err) assert.Equal(t, testData[userID].String(), res.String()) } +} + +func TestIngesterAppend(t *testing.T) { + store, ing := newDefaultTestStore(t) + userIDs, testData := pushTestSamples(t, ing, 10, 1000) + retrieveTestSamples(t, ing, userIDs, testData) // Read samples back via chunk store. ing.Shutdown() @@ -245,22 +247,22 @@ func TestIngesterAppendOutOfOrderAndDuplicate(t *testing.T) { {Name: []byte(model.MetricNameLabel), Value: []byte("testmetric")}, } ctx := user.InjectOrgID(context.Background(), userID) - err := ing.append(ctx, m, 1, 0, client.API) + err := ing.append(ctx, m, 1, 0, client.API, &Record{}) require.NoError(t, err) // Two times exactly the same sample (noop). - err = ing.append(ctx, m, 1, 0, client.API) + err = ing.append(ctx, m, 1, 0, client.API, &Record{}) require.NoError(t, err) // Earlier sample than previous one. - err = ing.append(ctx, m, 0, 0, client.API) + err = ing.append(ctx, m, 0, 0, client.API, &Record{}) require.Contains(t, err.Error(), "sample timestamp out of order") errResp, ok := httpgrpc.HTTPResponseFromError(err) require.True(t, ok) require.Equal(t, errResp.Code, int32(400)) // Same timestamp as previous sample, but different value. 
- err = ing.append(ctx, m, 1, 1, client.API) + err = ing.append(ctx, m, 1, 1, client.API, &Record{}) require.Contains(t, err.Error(), "sample with repeated timestamp but different value") errResp, ok = httpgrpc.HTTPResponseFromError(err) require.True(t, ok) @@ -278,7 +280,7 @@ func TestIngesterAppendBlankLabel(t *testing.T) { {Name: []byte("bar"), Value: []byte("")}, } ctx := user.InjectOrgID(context.Background(), userID) - err := ing.append(ctx, lp, 1, 0, client.API) + err := ing.append(ctx, lp, 1, 0, client.API, &Record{}) require.NoError(t, err) res, _, err := runTestQuery(ctx, t, ing, labels.MatchEqual, model.MetricNameLabel, "testmetric") diff --git a/pkg/ingester/series.go b/pkg/ingester/series.go index baece97be31..df7815db386 100644 --- a/pkg/ingester/series.go +++ b/pkg/ingester/series.go @@ -191,6 +191,8 @@ func (s *memorySeries) setChunks(descs []*desc) error { if len(descs) > 0 { s.lastTime = descs[len(descs)-1].LastTime } + memoryChunks.Add(float64(len(descs))) + return nil } diff --git a/pkg/ingester/transfer.go b/pkg/ingester/transfer.go index 0f01bef2eed..0d7d76979f4 100644 --- a/pkg/ingester/transfer.go +++ b/pkg/ingester/transfer.go @@ -85,16 +85,17 @@ func (i *Ingester) TransferChunks(stream client.Ingester_TransferChunksServer) e level.Info(util.Logger).Log("msg", "processing TransferChunks request", "from_ingester", fromIngesterID) } userCtx := user.InjectOrgID(stream.Context(), wireSeries.UserId) + descs, err := fromWireChunks(wireSeries.Chunks) if err != nil { return err } - state, fp, series, err := userStates.getOrCreateSeries(userCtx, wireSeries.Labels) + var record Record // for the WAL, not used. 
+ state, fp, series, err := userStates.getOrCreateSeries(userCtx, wireSeries.Labels, &record) if err != nil { return err } - prevNumChunks := len(series.chunkDescs) err = series.setChunks(descs) state.fpLocker.Unlock(fp) // acquired in getOrCreateSeries @@ -103,7 +104,6 @@ func (i *Ingester) TransferChunks(stream client.Ingester_TransferChunksServer) e } seriesReceived++ - memoryChunks.Add(float64(len(series.chunkDescs) - prevNumChunks)) receivedChunks.Add(float64(len(descs))) } diff --git a/pkg/ingester/user_state.go b/pkg/ingester/user_state.go index d16a28395dc..beddbc93c04 100644 --- a/pkg/ingester/user_state.go +++ b/pkg/ingester/user_state.go @@ -120,51 +120,61 @@ func (us *userStates) getViaContext(ctx context.Context) (*userState, bool, erro return state, ok, nil } -func (us *userStates) getOrCreateSeries(ctx context.Context, labels labelPairs) (*userState, model.Fingerprint, *memorySeries, error) { +func (us *userStates) getOrCreate(ctx context.Context) (*userState, error) { userID, err := user.ExtractOrgID(ctx) if err != nil { - return nil, 0, nil, fmt.Errorf("no user id") + return nil, fmt.Errorf("no user id") } state, ok := us.get(userID) + if ok { + return state, nil + } + + seriesInMetric := make([]metricCounterShard, 0, metricCounterShards) + for i := 0; i < metricCounterShards; i++ { + seriesInMetric = append(seriesInMetric, metricCounterShard{ + m: map[string]int{}, + }) + } + + // Speculatively create a userState object and try to store it + // in the map. 
Another goroutine may have got there before + // us, in which case this userState will be discarded + state = &userState{ + userID: userID, + limits: us.limits, + fpToSeries: newSeriesMap(), + fpLocker: newFingerprintLocker(16 * 1024), + index: newInvertedIndex(), + ingestedAPISamples: newEWMARate(0.2, us.cfg.RateUpdatePeriod), + ingestedRuleSamples: newEWMARate(0.2, us.cfg.RateUpdatePeriod), + seriesInMetric: seriesInMetric, + + memSeriesCreatedTotal: memSeriesCreatedTotal.WithLabelValues(userID), + memSeriesRemovedTotal: memSeriesRemovedTotal.WithLabelValues(userID), + } + state.mapper = newFPMapper(state.fpToSeries) + stored, ok := us.states.LoadOrStore(userID, state) if !ok { + memUsers.Inc() + } + state = stored.(*userState) - seriesInMetric := make([]metricCounterShard, 0, metricCounterShards) - for i := 0; i < metricCounterShards; i++ { - seriesInMetric = append(seriesInMetric, metricCounterShard{ - m: map[string]int{}, - }) - } + return state, nil +} - // Speculatively create a userState object and try to store it - // in the map. 
Another goroutine may have got there before - // us, in which case this userState will be discarded - state = &userState{ - userID: userID, - limits: us.limits, - fpToSeries: newSeriesMap(), - fpLocker: newFingerprintLocker(16 * 1024), - index: newInvertedIndex(), - ingestedAPISamples: newEWMARate(0.2, us.cfg.RateUpdatePeriod), - ingestedRuleSamples: newEWMARate(0.2, us.cfg.RateUpdatePeriod), - seriesInMetric: seriesInMetric, - - memSeriesCreatedTotal: memSeriesCreatedTotal.WithLabelValues(userID), - memSeriesRemovedTotal: memSeriesRemovedTotal.WithLabelValues(userID), - } - state.mapper = newFPMapper(state.fpToSeries) - stored, ok := us.states.LoadOrStore(userID, state) - if !ok { - memUsers.Inc() - } - state = stored.(*userState) +func (us *userStates) getOrCreateSeries(ctx context.Context, labels labelPairs, record *Record) (*userState, model.Fingerprint, *memorySeries, error) { + state, err := us.getOrCreate(ctx) + if err != nil { + return nil, 0, nil, err } - fp, series, err := state.getSeries(labels) + fp, series, err := state.getSeries(labels, record) return state, fp, series, err } -func (u *userState) getSeries(metric labelPairs) (model.Fingerprint, *memorySeries, error) { +func (u *userState) getSeries(metric labelPairs, record *Record) (model.Fingerprint, *memorySeries, error) { rawFP := client.FastFingerprint(metric) u.fpLocker.Lock(rawFP) fp := u.mapper.mapFP(rawFP, metric) @@ -178,36 +188,48 @@ func (u *userState) getSeries(metric labelPairs) (model.Fingerprint, *memorySeri return fp, series, nil } + series, err := u.createSeriesWithFingerprint(fp, metric, record) + if err != nil { + u.fpLocker.Unlock(fp) + return 0, nil, err + } + + return fp, series, nil +} + +func (u *userState) createSeriesWithFingerprint(fp model.Fingerprint, metric labelPairs, record *Record) (*memorySeries, error) { // There's theoretically a relatively harmless race here if multiple // goroutines get the length of the series map at the same time, then // all proceed to add a 
new series. This is likely not worth addressing, // as this should happen rarely (all samples from one push are added // serially), and the overshoot in allowed series would be minimal. if u.fpToSeries.length() >= u.limits.MaxSeriesPerUser(u.userID) { - u.fpLocker.Unlock(fp) - return fp, nil, httpgrpc.Errorf(http.StatusTooManyRequests, "per-user series limit (%d) exceeded", u.limits.MaxSeriesPerUser(u.userID)) + return nil, httpgrpc.Errorf(http.StatusTooManyRequests, "per-user series limit (%d) exceeded", u.limits.MaxSeriesPerUser(u.userID)) } metricName, err := extract.MetricNameFromLabelPairs(metric) if err != nil { - u.fpLocker.Unlock(fp) - return fp, nil, err + return nil, err } if !u.canAddSeriesFor(string(metricName)) { - u.fpLocker.Unlock(fp) - return fp, nil, httpgrpc.Errorf(http.StatusTooManyRequests, "per-metric series limit (%d) exceeded for %s: %s", u.limits.MaxSeriesPerMetric(u.userID), metricName, metric) + return nil, httpgrpc.Errorf(http.StatusTooManyRequests, "per-metric series limit (%d) exceeded for %s: %s", u.limits.MaxSeriesPerMetric(u.userID), metricName, metric) } util.Event().Log("msg", "new series", "userID", u.userID, "fp", fp, "series", metric) u.memSeriesCreatedTotal.Inc() memSeries.Inc() - series = newMemorySeries(metric) + record.Labels = append(record.Labels, Labels{ + Fingerprint: int64(fp), + Labels: metric, + }) + + series := newMemorySeries(metric) u.fpToSeries.put(fp, series) u.index.add(metric, fp) - return fp, series, nil + return series, nil } func (u *userState) canAddSeriesFor(metric string) bool { diff --git a/pkg/ingester/wal.go b/pkg/ingester/wal.go new file mode 100644 index 00000000000..7ed091132ba --- /dev/null +++ b/pkg/ingester/wal.go @@ -0,0 +1,321 @@ +package ingester + +import ( + "flag" + "path" + "sync" + "time" + + "github.com/go-kit/kit/log/level" + "github.com/golang/protobuf/proto" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/common/model" + "github.com/prometheus/tsdb/wal" + 
"github.com/weaveworks/common/user" + "golang.org/x/net/context" + + "github.com/cortexproject/cortex/pkg/util" +) + +// WALConfig is config for the Write Ahead Log. +type WALConfig struct { + enabled bool + recover bool + dir string + checkpointDuration time.Duration + metricsRegisterer prometheus.Registerer +} + +// RegisterFlags adds the flags required to config this to the given FlagSet +func (cfg *WALConfig) RegisterFlags(f *flag.FlagSet) { + f.BoolVar(&cfg.enabled, "ingester.wal-enable", false, "Enable the WAL.") + f.BoolVar(&cfg.recover, "ingester.wal-recover", false, "Recover from the WAL on startup.") + f.StringVar(&cfg.dir, "ingester.wal-dir", "", "Directory to store the WAL.") + f.DurationVar(&cfg.checkpointDuration, "ingester.checkpoint-duration", 1*time.Hour, "Duration over which to checkpoint.") +} + +// WAL interface allows us to have a no-op WAL when the WAL is disabled. +type WAL interface { + Log(record *Record) error + Stop() +} + +type noop struct{} + +// Log a Record to the WAL. +func (noop) Log(*Record) error { + return nil +} + +// Stop any background WAL processes. 
+func (noop) Stop() {} + +type wrapper struct { + cfg WALConfig + ingester *Ingester + quit chan struct{} + wait sync.WaitGroup + + lastCheckpointSegment int + lastSamplesSegment int + samples *wal.WAL + checkpoints *wal.WAL +} + +func newWAL(cfg WALConfig, ingester *Ingester) (WAL, error) { + if !cfg.enabled { + return &noop{}, nil + } + + var samplesRegistry prometheus.Registerer + if cfg.metricsRegisterer != nil { + samplesRegistry = prometheus.WrapRegistererWith(prometheus.Labels{"kind": "samples"}, cfg.metricsRegisterer) + } + samples, err := wal.New(util.Logger, samplesRegistry, path.Join(cfg.dir, "samples")) + if err != nil { + return nil, err + } + + var checkpointsRegistry prometheus.Registerer + if cfg.metricsRegisterer != nil { + checkpointsRegistry = prometheus.WrapRegistererWith(prometheus.Labels{"kind": "checkpoints"}, cfg.metricsRegisterer) + } + checkpoints, err := wal.New(util.Logger, checkpointsRegistry, path.Join(cfg.dir, "checkpoints")) + if err != nil { + return nil, err + } + + w := &wrapper{ + cfg: cfg, + ingester: ingester, + quit: make(chan struct{}), + samples: samples, + checkpoints: checkpoints, + } + + if cfg.recover { + w.recover(context.Background()) + } + + w.wait.Add(1) + go w.run() + return w, nil +} + +func (w *wrapper) Stop() { + close(w.quit) + w.wait.Wait() + + w.samples.Close() + w.checkpoints.Close() +} + +func (w *wrapper) Log(record *Record) error { + buf, err := proto.Marshal(record) + if err != nil { + return err + } + return w.samples.Log(buf) +} + +func (w *wrapper) run() { + defer w.wait.Done() + + for !w.isStopped() { + if err := w.checkpoint(); err != nil { + level.Error(util.Logger).Log("msg", "Error checkpointing series", "err", err) + continue + } + + if err := w.truncateSamples(); err != nil { + level.Error(util.Logger).Log("msg", "Error truncating wal", "err", err) + continue + } + } +} + +func (w *wrapper) isStopped() bool { + select { + case <-w.quit: + return true + default: + return false + } +} + +func (w 
*wrapper) checkpoint() error { + // Count number of series - we'll use this to rate limit checkpoints. + numSeries := 0 + for _, state := range w.ingester.userStates.cp() { + numSeries += state.fpToSeries.length() + } + if numSeries == 0 { + return nil + } + perSeriesDuration := w.cfg.checkpointDuration / time.Duration(numSeries) + ticker := time.NewTicker(perSeriesDuration) + defer ticker.Stop() + + for userID, state := range w.ingester.userStates.cp() { + for pair := range state.fpToSeries.iter() { + state.fpLocker.Lock(pair.fp) + err := w.checkpointSeries(userID, pair.fp, pair.series) + state.fpLocker.Unlock(pair.fp) + if err != nil { + return err + } + + select { + case <-ticker.C: + case <-w.quit: // When we're trying to shutdown, finish the checkpoint as fast as possible. + } + } + } + + // Remove the previous checkpoint. + _, last, err := w.checkpoints.Segments() + if err != nil { + return err + } + if err := w.checkpoints.Truncate(w.lastCheckpointSegment); err != nil { + return err + } + w.lastCheckpointSegment = last + + return nil +} + +func (w *wrapper) checkpointSeries(userID string, fp model.Fingerprint, series *memorySeries) error { + wireChunks, err := toWireChunks(series.chunkDescs) + if err != nil { + return err + } + + buf, err := proto.Marshal(&Series{ + UserId: userID, + Fingerprint: int64(fp), + Labels: series.labels(), + Chunks: wireChunks, + }) + if err != nil { + return err + } + + return w.checkpoints.Log(buf) +} + +// truncateSamples removed the wal from before the checkpoint. +func (w *wrapper) truncateSamples() error { + _, last, err := w.samples.Segments() + if err != nil { + return err + } + + if err := w.samples.Truncate(w.lastSamplesSegment); err != nil { + return err + } + + w.lastSamplesSegment = last + return nil +} + +func (w *wrapper) recover(ctx context.Context) error { + // Use a local userStates, so we don't need to worry about locking. 
+ userStates := newUserStates(w.ingester.limits, w.ingester.cfg) + + if err := w.recoverRecords("checkpoints", &Series{}, func(msg proto.Message) error { + walSeries := msg.(*Series) + + descs, err := fromWireChunks(walSeries.Chunks) + if err != nil { + return err + } + + userCtx := user.InjectOrgID(ctx, walSeries.UserId) + state, err := userStates.getOrCreate(userCtx) + if err != nil { + return err + } + + series, err := state.createSeriesWithFingerprint(model.Fingerprint(walSeries.Fingerprint), walSeries.Labels, &Record{}) + if err != nil { + return err + } + + return series.setChunks(descs) + }); err != nil { + return err + } + + if err := w.recoverRecords("samples", &Record{}, func(msg proto.Message) error { + record := msg.(*Record) + userCtx := user.InjectOrgID(ctx, record.UserId) + + state, err := userStates.getOrCreate(userCtx) + if err != nil { + return err + } + + for _, labels := range record.Labels { + _, ok := state.fpToSeries.get(model.Fingerprint(labels.Fingerprint)) + if ok { + continue + } + + _, err = state.createSeriesWithFingerprint(model.Fingerprint(labels.Fingerprint), labels.Labels, &Record{}) + if err != nil { + return err + } + } + + for _, sample := range record.Samples { + series, ok := state.fpToSeries.get(model.Fingerprint(sample.Fingerprint)) + if !ok { + return nil + } + + err := series.add(model.SamplePair{ + Timestamp: model.Time(sample.Timestamp), + Value: model.SampleValue(sample.Value), + }) + if err != nil { + level.Info(util.Logger).Log("msg", "error appending sample", "err", err) + } + } + + return nil + }); err != nil { + return err + } + + w.ingester.userStatesMtx.Lock() + w.ingester.userStates = userStates + w.ingester.userStatesMtx.Unlock() + + return nil +} + +func (w *wrapper) recoverRecords(name string, ty proto.Message, callback func(proto.Message) error) error { + segmentReader, err := wal.NewSegmentsReader(path.Join(w.cfg.dir, name)) + if err != nil { + return err + } + defer segmentReader.Close() + + reader := 
wal.NewReader(segmentReader) + for reader.Next() { + ty.Reset() + if err := proto.Unmarshal(reader.Record(), ty); err != nil { + return err + } + + if err := callback(ty); err != nil { + return err + } + } + if err := reader.Err(); err != nil { + return err + } + + return nil +} diff --git a/pkg/ingester/wal.proto b/pkg/ingester/wal.proto new file mode 100644 index 00000000000..0d21be2687f --- /dev/null +++ b/pkg/ingester/wal.proto @@ -0,0 +1,32 @@ +syntax = "proto3"; + +package ingester; + +option go_package = "ingester"; + +import "github.com/gogo/protobuf/gogoproto/gogo.proto"; +import "github.com/cortexproject/cortex/pkg/ingester/client/cortex.proto"; + +message Record { + string user_id = 1; + repeated Labels labels = 2 [(gogoproto.nullable) = false]; + repeated Sample samples = 3 [(gogoproto.nullable) = false]; +} + +message Labels { + int64 fingerprint = 1; + repeated cortex.LabelPair labels = 2 [(gogoproto.nullable) = false]; +} + +message Sample { + int64 fingerprint = 1; + int64 timestamp = 2; + double value = 3; +} + +message Series { + string user_id = 1; + int64 fingerprint = 2; + repeated cortex.LabelPair labels = 3 [(gogoproto.nullable) = false]; + repeated cortex.Chunk chunks = 4 [(gogoproto.nullable) = false]; +} diff --git a/pkg/ingester/wal_test.go b/pkg/ingester/wal_test.go new file mode 100644 index 00000000000..bd5c4beba72 --- /dev/null +++ b/pkg/ingester/wal_test.go @@ -0,0 +1,37 @@ +package ingester + +import ( + "io/ioutil" + "os" + "testing" + + "github.com/cortexproject/cortex/pkg/util" + "github.com/go-kit/kit/log" + "github.com/stretchr/testify/require" +) + +func init() { + util.Logger = log.NewLogfmtLogger(os.Stdout) +} + +func TestWAL(t *testing.T) { + dirname, err := ioutil.TempDir("", "cortex-wal") + require.NoError(t, err) + + cfg := defaultIngesterTestConfig() + cfg.WALConfig.enabled = true + cfg.WALConfig.dir = dirname + + // Build an ingester, add some samples, then shut it down. 
+ _, ing := newTestStore(t, cfg, defaultClientTestConfig(), defaultLimitsTestConfig()) + userIDs, testData := pushTestSamples(t, ing, 10, 1000) + ing.Shutdown() + + // Start a new ingester and recover the WAL. + cfg.WALConfig.recover = true + _, ing = newTestStore(t, cfg, defaultClientTestConfig(), defaultLimitsTestConfig()) + defer ing.Shutdown() + + // Check the samples are still there! + retrieveTestSamples(t, ing, userIDs, testData) +} From 2a6fc2db5a80997a8b055172cf1c1dad04d24785 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Wed, 10 Jul 2019 18:41:30 +0530 Subject: [PATCH 03/56] Fix merge conflicts Signed-off-by: Ganesh Vernekar --- pkg/ingester/label_pairs.go | 22 ++++++++++++ pkg/ingester/query_test.go | 2 +- pkg/ingester/series.go | 6 ++++ pkg/ingester/user_state.go | 67 +++++++------------------------------ pkg/ingester/wal.go | 36 ++++++++++++++------ 5 files changed, 67 insertions(+), 66 deletions(-) diff --git a/pkg/ingester/label_pairs.go b/pkg/ingester/label_pairs.go index 896cbf0053e..43304cd6e34 100644 --- a/pkg/ingester/label_pairs.go +++ b/pkg/ingester/label_pairs.go @@ -88,3 +88,25 @@ func (a labelPairs) equal(b labels.Labels) bool { } return true } + +func newLabelPairs(metric labels.Labels) []client.LabelPair { + lp := make([]client.LabelPair, 0, len(metric)) + for _, m := range metric { + lp = append(lp, client.LabelPair{ + Name: []byte(m.Name), + Value: []byte(m.Value), + }) + } + return lp +} + +func newLabelPairsFromLabelAdapters(metric []client.LabelAdapter) []client.LabelPair { + lp := make([]client.LabelPair, 0, len(metric)) + for _, m := range metric { + lp = append(lp, client.LabelPair{ + Name: []byte(m.Name), + Value: []byte(m.Value), + }) + } + return lp +} diff --git a/pkg/ingester/query_test.go b/pkg/ingester/query_test.go index 9debe783b19..ac82690b66b 100644 --- a/pkg/ingester/query_test.go +++ b/pkg/ingester/query_test.go @@ -55,7 +55,7 @@ func BenchmarkQueryStream(b *testing.B) { {Name: "cpu", Value: cpus[i%numCPUs]}, } 
- state, fp, series, err := ing.userStates.getOrCreateSeries(ctx, labels) + state, fp, series, err := ing.userStates.getOrCreateSeries(ctx, labels, &Record{}) require.NoError(b, err) for j := 0; j < numSamples; j++ { diff --git a/pkg/ingester/series.go b/pkg/ingester/series.go index 01c5645c535..388d2ac3603 100644 --- a/pkg/ingester/series.go +++ b/pkg/ingester/series.go @@ -9,6 +9,7 @@ import ( "github.com/prometheus/prometheus/pkg/labels" "github.com/cortexproject/cortex/pkg/chunk/encoding" + "github.com/cortexproject/cortex/pkg/ingester/client" "github.com/cortexproject/cortex/pkg/prom1/storage/metric" ) @@ -58,6 +59,11 @@ func newMemorySeries(m labels.Labels) *memorySeries { } } +// helper to extract the not-necessarily-sorted type used elsewhere, without casting everywhere. +func (s *memorySeries) labels() []client.LabelPair { + return newLabelPairs(s.metric) +} + // add adds a sample pair to the series. It returns the number of newly // completed chunks (which are now eligible for persistence). 
// diff --git a/pkg/ingester/user_state.go b/pkg/ingester/user_state.go index 6e3baa8d4c5..4099f55b13c 100644 --- a/pkg/ingester/user_state.go +++ b/pkg/ingester/user_state.go @@ -129,60 +129,22 @@ func (us *userStates) getViaContext(ctx context.Context) (*userState, bool, erro return state, ok, nil } -<<<<<<< HEAD -func (us *userStates) getOrCreate(ctx context.Context) (*userState, error) { -======= -func (us *userStates) getOrCreateSeries(ctx context.Context, labels []client.LabelAdapter) (*userState, model.Fingerprint, *memorySeries, error) { ->>>>>>> upstream/master +func (us *userStates) getOrCreateSeries(ctx context.Context, labels []client.LabelAdapter, record *Record) (*userState, model.Fingerprint, *memorySeries, error) { userID, err := user.ExtractOrgID(ctx) if err != nil { - return nil, fmt.Errorf("no user id") + return nil, 0, nil, fmt.Errorf("no user id") } state, ok := us.get(userID) - if ok { - return state, nil - } - - seriesInMetric := make([]metricCounterShard, 0, metricCounterShards) - for i := 0; i < metricCounterShards; i++ { - seriesInMetric = append(seriesInMetric, metricCounterShard{ - m: map[string]int{}, - }) - } - - // Speculatively create a userState object and try to store it - // in the map. 
Another goroutine may have got there before - // us, in which case this userState will be discarded - state = &userState{ - userID: userID, - limits: us.limits, - fpToSeries: newSeriesMap(), - fpLocker: newFingerprintLocker(16 * 1024), - index: newInvertedIndex(), - ingestedAPISamples: newEWMARate(0.2, us.cfg.RateUpdatePeriod), - ingestedRuleSamples: newEWMARate(0.2, us.cfg.RateUpdatePeriod), - seriesInMetric: seriesInMetric, - - memSeriesCreatedTotal: memSeriesCreatedTotal.WithLabelValues(userID), - memSeriesRemovedTotal: memSeriesRemovedTotal.WithLabelValues(userID), - } - state.mapper = newFPMapper(state.fpToSeries) - stored, ok := us.states.LoadOrStore(userID, state) if !ok { - memUsers.Inc() - } - state = stored.(*userState) - return state, nil -} + seriesInMetric := make([]metricCounterShard, 0, metricCounterShards) + for i := 0; i < metricCounterShards; i++ { + seriesInMetric = append(seriesInMetric, metricCounterShard{ + m: map[string]int{}, + }) + } -<<<<<<< HEAD -func (us *userStates) getOrCreateSeries(ctx context.Context, labels labelPairs, record *Record) (*userState, model.Fingerprint, *memorySeries, error) { - state, err := us.getOrCreate(ctx) - if err != nil { - return nil, 0, nil, err -======= // Speculatively create a userState object and try to store it // in the map. Another goroutine may have got there before // us, in which case this userState will be discarded @@ -205,7 +167,6 @@ func (us *userStates) getOrCreateSeries(ctx context.Context, labels labelPairs, memUsers.Inc() } state = stored.(*userState) ->>>>>>> upstream/master } fp, series, err := state.getSeries(labels, record) @@ -242,9 +203,8 @@ func (u *userState) createSeriesWithFingerprint(fp model.Fingerprint, metric lab // as this should happen rarely (all samples from one push are added // serially), and the overshoot in allowed series would be minimal. 
if u.fpToSeries.length() >= u.limits.MaxSeriesPerUser(u.userID) { - u.fpLocker.Unlock(fp) validation.DiscardedSamples.WithLabelValues(perUserSeriesLimit, u.userID).Inc() - return fp, nil, httpgrpc.Errorf(http.StatusTooManyRequests, "per-user series limit (%d) exceeded", u.limits.MaxSeriesPerUser(u.userID)) + return nil, httpgrpc.Errorf(http.StatusTooManyRequests, "per-user series limit (%d) exceeded", u.limits.MaxSeriesPerUser(u.userID)) } metricName, err := extract.MetricNameFromLabelAdapters(metric) @@ -253,23 +213,22 @@ func (u *userState) createSeriesWithFingerprint(fp model.Fingerprint, metric lab } if !u.canAddSeriesFor(string(metricName)) { - u.fpLocker.Unlock(fp) validation.DiscardedSamples.WithLabelValues(perMetricSeriesLimit, u.userID).Inc() - return fp, nil, httpgrpc.Errorf(http.StatusTooManyRequests, "per-metric series limit (%d) exceeded for %s: %s", u.limits.MaxSeriesPerMetric(u.userID), metricName, metric) + return nil, httpgrpc.Errorf(http.StatusTooManyRequests, "per-metric series limit (%d) exceeded for %s: %s", u.limits.MaxSeriesPerMetric(u.userID), metricName, metric) } u.memSeriesCreatedTotal.Inc() memSeries.Inc() - record.Labels = append(record.Labels, Labels{ Fingerprint: int64(fp), - Labels: metric, + Labels: newLabelPairsFromLabelAdapters(metric), }) labels := u.index.Add(metric, fp) - series = newMemorySeries(labels) + series := newMemorySeries(labels) u.fpToSeries.put(fp, series) + return series, nil } diff --git a/pkg/ingester/wal.go b/pkg/ingester/wal.go index 7ed091132ba..e7527118ea3 100644 --- a/pkg/ingester/wal.go +++ b/pkg/ingester/wal.go @@ -2,6 +2,7 @@ package ingester import ( "flag" + "fmt" "path" "sync" "time" @@ -11,9 +12,9 @@ import ( "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/model" "github.com/prometheus/tsdb/wal" - "github.com/weaveworks/common/user" "golang.org/x/net/context" + "github.com/cortexproject/cortex/pkg/ingester/client" "github.com/cortexproject/cortex/pkg/util" ) @@ -223,6 
+224,7 @@ func (w *wrapper) recover(ctx context.Context) error { // Use a local userStates, so we don't need to worry about locking. userStates := newUserStates(w.ingester.limits, w.ingester.cfg) + la := []client.LabelAdapter{} if err := w.recoverRecords("checkpoints", &Series{}, func(msg proto.Message) error { walSeries := msg.(*Series) @@ -231,13 +233,19 @@ func (w *wrapper) recover(ctx context.Context) error { return err } - userCtx := user.InjectOrgID(ctx, walSeries.UserId) - state, err := userStates.getOrCreate(userCtx) - if err != nil { - return err + state, ok := userStates.get(walSeries.UserId) + if !ok { + return fmt.Errorf("user state not found for userid=%s", walSeries.UserId) } - series, err := state.createSeriesWithFingerprint(model.Fingerprint(walSeries.Fingerprint), walSeries.Labels, &Record{}) + la = la[:0] + for _, l := range walSeries.Labels { + la = append(la, client.LabelAdapter{ + Name: string(l.Name), + Value: string(l.Value), + }) + } + series, err := state.createSeriesWithFingerprint(model.Fingerprint(walSeries.Fingerprint), la, &Record{}) if err != nil { return err } @@ -249,11 +257,10 @@ func (w *wrapper) recover(ctx context.Context) error { if err := w.recoverRecords("samples", &Record{}, func(msg proto.Message) error { record := msg.(*Record) - userCtx := user.InjectOrgID(ctx, record.UserId) - state, err := userStates.getOrCreate(userCtx) - if err != nil { - return err + state, ok := userStates.get(record.UserId) + if !ok { + return fmt.Errorf("user state not found for userid=%s", record.UserId) } for _, labels := range record.Labels { @@ -262,7 +269,14 @@ func (w *wrapper) recover(ctx context.Context) error { continue } - _, err = state.createSeriesWithFingerprint(model.Fingerprint(labels.Fingerprint), labels.Labels, &Record{}) + la = la[:0] + for _, l := range labels.Labels { + la = append(la, client.LabelAdapter{ + Name: string(l.Name), + Value: string(l.Value), + }) + } + _, err := 
state.createSeriesWithFingerprint(model.Fingerprint(labels.Fingerprint), la, &Record{}) if err != nil { return err } From f023ddb88d27c5c52b8f8e62887c91ca07b2083f Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Thu, 11 Jul 2019 16:50:39 +0530 Subject: [PATCH 04/56] Fix creation of user states in WAL recover Signed-off-by: Ganesh Vernekar --- pkg/ingester/user_state.go | 71 ++++++++++++++++++++------------------ pkg/ingester/wal.go | 13 ++----- 2 files changed, 41 insertions(+), 43 deletions(-) diff --git a/pkg/ingester/user_state.go b/pkg/ingester/user_state.go index 4099f55b13c..16d1bbf3e38 100644 --- a/pkg/ingester/user_state.go +++ b/pkg/ingester/user_state.go @@ -120,6 +120,43 @@ func (us *userStates) get(userID string) (*userState, bool) { return state.(*userState), ok } +func (us *userStates) getOrCreate(userID string) *userState { + rawState, ok := us.states.Load(userID) + if ok { + return rawState.(*userState) + } + + seriesInMetric := make([]metricCounterShard, 0, metricCounterShards) + for i := 0; i < metricCounterShards; i++ { + seriesInMetric = append(seriesInMetric, metricCounterShard{ + m: map[string]int{}, + }) + } + + // Speculatively create a userState object and try to store it + // in the map. 
Another goroutine may have got there before + // us, in which case this userState will be discarded + state := &userState{ + userID: userID, + limits: us.limits, + fpToSeries: newSeriesMap(), + fpLocker: newFingerprintLocker(16 * 1024), + index: index.New(), + ingestedAPISamples: newEWMARate(0.2, us.cfg.RateUpdatePeriod), + ingestedRuleSamples: newEWMARate(0.2, us.cfg.RateUpdatePeriod), + seriesInMetric: seriesInMetric, + + memSeriesCreatedTotal: memSeriesCreatedTotal.WithLabelValues(userID), + memSeriesRemovedTotal: memSeriesRemovedTotal.WithLabelValues(userID), + } + state.mapper = newFPMapper(state.fpToSeries) + stored, ok := us.states.LoadOrStore(userID, state) + if !ok { + memUsers.Inc() + } + return stored.(*userState) +} + func (us *userStates) getViaContext(ctx context.Context) (*userState, bool, error) { userID, err := user.ExtractOrgID(ctx) if err != nil { @@ -135,39 +172,7 @@ func (us *userStates) getOrCreateSeries(ctx context.Context, labels []client.Lab return nil, 0, nil, fmt.Errorf("no user id") } - state, ok := us.get(userID) - if !ok { - - seriesInMetric := make([]metricCounterShard, 0, metricCounterShards) - for i := 0; i < metricCounterShards; i++ { - seriesInMetric = append(seriesInMetric, metricCounterShard{ - m: map[string]int{}, - }) - } - - // Speculatively create a userState object and try to store it - // in the map. 
Another goroutine may have got there before - // us, in which case this userState will be discarded - state = &userState{ - userID: userID, - limits: us.limits, - fpToSeries: newSeriesMap(), - fpLocker: newFingerprintLocker(16 * 1024), - index: index.New(), - ingestedAPISamples: newEWMARate(0.2, us.cfg.RateUpdatePeriod), - ingestedRuleSamples: newEWMARate(0.2, us.cfg.RateUpdatePeriod), - seriesInMetric: seriesInMetric, - - memSeriesCreatedTotal: memSeriesCreatedTotal.WithLabelValues(userID), - memSeriesRemovedTotal: memSeriesRemovedTotal.WithLabelValues(userID), - } - state.mapper = newFPMapper(state.fpToSeries) - stored, ok := us.states.LoadOrStore(userID, state) - if !ok { - memUsers.Inc() - } - state = stored.(*userState) - } + state := us.getOrCreate(userID) fp, series, err := state.getSeries(labels, record) return state, fp, series, err diff --git a/pkg/ingester/wal.go b/pkg/ingester/wal.go index e7527118ea3..aae9ab14144 100644 --- a/pkg/ingester/wal.go +++ b/pkg/ingester/wal.go @@ -2,7 +2,6 @@ package ingester import ( "flag" - "fmt" "path" "sync" "time" @@ -220,7 +219,7 @@ func (w *wrapper) truncateSamples() error { return nil } -func (w *wrapper) recover(ctx context.Context) error { +func (w *wrapper) recover(ctx context.Context) (err error) { // Use a local userStates, so we don't need to worry about locking. 
userStates := newUserStates(w.ingester.limits, w.ingester.cfg) @@ -233,10 +232,7 @@ func (w *wrapper) recover(ctx context.Context) error { return err } - state, ok := userStates.get(walSeries.UserId) - if !ok { - return fmt.Errorf("user state not found for userid=%s", walSeries.UserId) - } + state := userStates.getOrCreate(walSeries.UserId) la = la[:0] for _, l := range walSeries.Labels { @@ -258,10 +254,7 @@ func (w *wrapper) recover(ctx context.Context) error { if err := w.recoverRecords("samples", &Record{}, func(msg proto.Message) error { record := msg.(*Record) - state, ok := userStates.get(record.UserId) - if !ok { - return fmt.Errorf("user state not found for userid=%s", record.UserId) - } + state := userStates.getOrCreate(record.UserId) for _, labels := range record.Labels { _, ok := state.fpToSeries.get(model.Fingerprint(labels.Fingerprint)) From ac98b13ffa5da5445118d6feb3be83ce96211df5 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Tue, 6 Aug 2019 18:48:27 +0530 Subject: [PATCH 05/56] Remove WAL recovery on start-up and allow nil Record Signed-off-by: Ganesh Vernekar --- pkg/ingester/ingester.go | 25 +++++--- pkg/ingester/ingester_test.go | 10 +-- pkg/ingester/query_test.go | 2 +- pkg/ingester/user_state.go | 10 +-- pkg/ingester/wal.go | 116 ---------------------------------- pkg/ingester/wal_test.go | 34 +++++----- 6 files changed, 44 insertions(+), 153 deletions(-) diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index f5b0befbc55..4c47adb7064 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -243,15 +243,18 @@ func (i *Ingester) Push(ctx old_ctx.Context, req *client.WriteRequest) (*client. 
return nil, err } - record := Record{ - UserId: userID, - Samples: make([]Sample, 0, len(req.Timeseries)), + var record *Record + if i.cfg.WALConfig.enabled { + record = &Record{ + UserId: userID, + Samples: make([]Sample, 0, len(req.Timeseries)), + } } var lastPartialErr error for _, ts := range req.Timeseries { for _, s := range ts.Samples { - err := i.append(ctx, ts.Labels, model.Time(s.TimestampMs), model.SampleValue(s.Value), req.Source, &record) + err := i.append(ctx, ts.Labels, model.Time(s.TimestampMs), model.SampleValue(s.Value), req.Source, record) if err == nil { continue } @@ -269,7 +272,7 @@ func (i *Ingester) Push(ctx old_ctx.Context, req *client.WriteRequest) (*client. } } - if err := i.wal.Log(&record); err != nil { + if err := i.wal.Log(record); err != nil { return nil, err } @@ -311,11 +314,13 @@ func (i *Ingester) append(ctx context.Context, labels labelPairs, timestamp mode return err } - record.Samples = append(record.Samples, Sample{ - Fingerprint: int64(fp), - Timestamp: int64(timestamp), - Value: float64(value), - }) + if record != nil { + record.Samples = append(record.Samples, Sample{ + Fingerprint: int64(fp), + Timestamp: int64(timestamp), + Value: float64(value), + }) + } memoryChunks.Add(float64(len(series.chunkDescs) - prevNumChunks)) ingestedSamples.Inc() diff --git a/pkg/ingester/ingester_test.go b/pkg/ingester/ingester_test.go index 811dfcdf7a2..541f883ff2d 100644 --- a/pkg/ingester/ingester_test.go +++ b/pkg/ingester/ingester_test.go @@ -277,22 +277,22 @@ func TestIngesterAppendOutOfOrderAndDuplicate(t *testing.T) { {Name: model.MetricNameLabel, Value: "testmetric"}, } ctx := user.InjectOrgID(context.Background(), userID) - err := ing.append(ctx, m, 1, 0, client.API, &Record{}) + err := ing.append(ctx, m, 1, 0, client.API, nil) require.NoError(t, err) // Two times exactly the same sample (noop). 
- err = ing.append(ctx, m, 1, 0, client.API, &Record{}) + err = ing.append(ctx, m, 1, 0, client.API, nil) require.NoError(t, err) // Earlier sample than previous one. - err = ing.append(ctx, m, 0, 0, client.API, &Record{}) + err = ing.append(ctx, m, 0, 0, client.API, nil) require.Contains(t, err.Error(), "sample timestamp out of order") errResp, ok := httpgrpc.HTTPResponseFromError(err) require.True(t, ok) require.Equal(t, errResp.Code, int32(400)) // Same timestamp as previous sample, but different value. - err = ing.append(ctx, m, 1, 1, client.API, &Record{}) + err = ing.append(ctx, m, 1, 1, client.API, nil) require.Contains(t, err.Error(), "sample with repeated timestamp but different value") errResp, ok = httpgrpc.HTTPResponseFromError(err) require.True(t, ok) @@ -310,7 +310,7 @@ func TestIngesterAppendBlankLabel(t *testing.T) { {Name: "bar", Value: ""}, } ctx := user.InjectOrgID(context.Background(), userID) - err := ing.append(ctx, lp, 1, 0, client.API, &Record{}) + err := ing.append(ctx, lp, 1, 0, client.API, nil) require.NoError(t, err) res, _, err := runTestQuery(ctx, t, ing, labels.MatchEqual, model.MetricNameLabel, "testmetric") diff --git a/pkg/ingester/query_test.go b/pkg/ingester/query_test.go index ac82690b66b..4e666b28a7d 100644 --- a/pkg/ingester/query_test.go +++ b/pkg/ingester/query_test.go @@ -55,7 +55,7 @@ func BenchmarkQueryStream(b *testing.B) { {Name: "cpu", Value: cpus[i%numCPUs]}, } - state, fp, series, err := ing.userStates.getOrCreateSeries(ctx, labels, &Record{}) + state, fp, series, err := ing.userStates.getOrCreateSeries(ctx, labels, nil) require.NoError(b, err) for j := 0; j < numSamples; j++ { diff --git a/pkg/ingester/user_state.go b/pkg/ingester/user_state.go index 0e7d8f8382f..074127227e7 100644 --- a/pkg/ingester/user_state.go +++ b/pkg/ingester/user_state.go @@ -228,10 +228,12 @@ func (u *userState) createSeriesWithFingerprint(fp model.Fingerprint, metric lab u.memSeriesCreatedTotal.Inc() memSeries.Inc() - record.Labels = 
append(record.Labels, Labels{ - Fingerprint: int64(fp), - Labels: newLabelPairsFromLabelAdapters(metric), - }) + if record != nil { + record.Labels = append(record.Labels, Labels{ + Fingerprint: int64(fp), + Labels: newLabelPairsFromLabelAdapters(metric), + }) + } labels := u.index.Add(metric, fp) series := newMemorySeries(labels) diff --git a/pkg/ingester/wal.go b/pkg/ingester/wal.go index aae9ab14144..8d90f8c0c5f 100644 --- a/pkg/ingester/wal.go +++ b/pkg/ingester/wal.go @@ -11,16 +11,13 @@ import ( "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/model" "github.com/prometheus/tsdb/wal" - "golang.org/x/net/context" - "github.com/cortexproject/cortex/pkg/ingester/client" "github.com/cortexproject/cortex/pkg/util" ) // WALConfig is config for the Write Ahead Log. type WALConfig struct { enabled bool - recover bool dir string checkpointDuration time.Duration metricsRegisterer prometheus.Registerer @@ -29,7 +26,6 @@ type WALConfig struct { // RegisterFlags adds the flags required to config this to the given FlagSet func (cfg *WALConfig) RegisterFlags(f *flag.FlagSet) { f.BoolVar(&cfg.enabled, "ingester.wal-enable", false, "Enable the WAL.") - f.BoolVar(&cfg.recover, "ingester.wal-recover", false, "Recover from the WAL on startup.") f.StringVar(&cfg.dir, "ingester.wal-dir", "", "Directory to store the WAL.") f.DurationVar(&cfg.checkpointDuration, "ingester.checkpoint-duration", 1*time.Hour, "Duration over which to checkpoint.") } @@ -93,10 +89,6 @@ func newWAL(cfg WALConfig, ingester *Ingester) (WAL, error) { checkpoints: checkpoints, } - if cfg.recover { - w.recover(context.Background()) - } - w.wait.Add(1) go w.run() return w, nil @@ -218,111 +210,3 @@ func (w *wrapper) truncateSamples() error { w.lastSamplesSegment = last return nil } - -func (w *wrapper) recover(ctx context.Context) (err error) { - // Use a local userStates, so we don't need to worry about locking. 
- userStates := newUserStates(w.ingester.limits, w.ingester.cfg) - - la := []client.LabelAdapter{} - if err := w.recoverRecords("checkpoints", &Series{}, func(msg proto.Message) error { - walSeries := msg.(*Series) - - descs, err := fromWireChunks(walSeries.Chunks) - if err != nil { - return err - } - - state := userStates.getOrCreate(walSeries.UserId) - - la = la[:0] - for _, l := range walSeries.Labels { - la = append(la, client.LabelAdapter{ - Name: string(l.Name), - Value: string(l.Value), - }) - } - series, err := state.createSeriesWithFingerprint(model.Fingerprint(walSeries.Fingerprint), la, &Record{}) - if err != nil { - return err - } - - return series.setChunks(descs) - }); err != nil { - return err - } - - if err := w.recoverRecords("samples", &Record{}, func(msg proto.Message) error { - record := msg.(*Record) - - state := userStates.getOrCreate(record.UserId) - - for _, labels := range record.Labels { - _, ok := state.fpToSeries.get(model.Fingerprint(labels.Fingerprint)) - if ok { - continue - } - - la = la[:0] - for _, l := range labels.Labels { - la = append(la, client.LabelAdapter{ - Name: string(l.Name), - Value: string(l.Value), - }) - } - _, err := state.createSeriesWithFingerprint(model.Fingerprint(labels.Fingerprint), la, &Record{}) - if err != nil { - return err - } - } - - for _, sample := range record.Samples { - series, ok := state.fpToSeries.get(model.Fingerprint(sample.Fingerprint)) - if !ok { - return nil - } - - err := series.add(model.SamplePair{ - Timestamp: model.Time(sample.Timestamp), - Value: model.SampleValue(sample.Value), - }) - if err != nil { - level.Info(util.Logger).Log("msg", "error appending sample", "err", err) - } - } - - return nil - }); err != nil { - return err - } - - w.ingester.userStatesMtx.Lock() - w.ingester.userStates = userStates - w.ingester.userStatesMtx.Unlock() - - return nil -} - -func (w *wrapper) recoverRecords(name string, ty proto.Message, callback func(proto.Message) error) error { - segmentReader, 
err := wal.NewSegmentsReader(path.Join(w.cfg.dir, name)) - if err != nil { - return err - } - defer segmentReader.Close() - - reader := wal.NewReader(segmentReader) - for reader.Next() { - ty.Reset() - if err := proto.Unmarshal(reader.Record(), ty); err != nil { - return err - } - - if err := callback(ty); err != nil { - return err - } - } - if err := reader.Err(); err != nil { - return err - } - - return nil -} diff --git a/pkg/ingester/wal_test.go b/pkg/ingester/wal_test.go index bd5c4beba72..dc2e5e03378 100644 --- a/pkg/ingester/wal_test.go +++ b/pkg/ingester/wal_test.go @@ -1,13 +1,13 @@ package ingester import ( - "io/ioutil" + // "io/ioutil" "os" "testing" "github.com/cortexproject/cortex/pkg/util" "github.com/go-kit/kit/log" - "github.com/stretchr/testify/require" + // "github.com/stretchr/testify/require" ) func init() { @@ -15,23 +15,23 @@ func init() { } func TestWAL(t *testing.T) { - dirname, err := ioutil.TempDir("", "cortex-wal") - require.NoError(t, err) + // dirname, err := ioutil.TempDir("", "cortex-wal") + // require.NoError(t, err) - cfg := defaultIngesterTestConfig() - cfg.WALConfig.enabled = true - cfg.WALConfig.dir = dirname + // cfg := defaultIngesterTestConfig() + // cfg.WALConfig.enabled = true + // cfg.WALConfig.dir = dirname - // Build an ingester, add some samples, then shut it down. - _, ing := newTestStore(t, cfg, defaultClientTestConfig(), defaultLimitsTestConfig()) - userIDs, testData := pushTestSamples(t, ing, 10, 1000) - ing.Shutdown() + // // Build an ingester, add some samples, then shut it down. + // _, ing := newTestStore(t, cfg, defaultClientTestConfig(), defaultLimitsTestConfig()) + // userIDs, testData := pushTestSamples(t, ing, 10, 1000) + // ing.Shutdown() - // Start a new ingester and recover the WAL. - cfg.WALConfig.recover = true - _, ing = newTestStore(t, cfg, defaultClientTestConfig(), defaultLimitsTestConfig()) - defer ing.Shutdown() + // // Start a new ingester and recover the WAL. 
+ // cfg.WALConfig.recover = true + // _, ing = newTestStore(t, cfg, defaultClientTestConfig(), defaultLimitsTestConfig()) + // defer ing.Shutdown() - // Check the samples are still there! - retrieveTestSamples(t, ing, userIDs, testData) + // // Check the samples are still there! + // retrieveTestSamples(t, ing, userIDs, testData) } From b84b02e85b1ea99ec8178d32779c6e1c97a676c2 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Tue, 6 Aug 2019 19:36:46 +0530 Subject: [PATCH 06/56] Fix types Signed-off-by: Ganesh Vernekar --- pkg/ingester/ingester.go | 4 ++-- pkg/ingester/user_state.go | 2 +- pkg/ingester/wal.go | 2 +- pkg/ingester/wal.proto | 8 ++++---- vendor/modules.txt | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index 4c47adb7064..3386008717e 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -316,8 +316,8 @@ func (i *Ingester) append(ctx context.Context, labels labelPairs, timestamp mode if record != nil { record.Samples = append(record.Samples, Sample{ - Fingerprint: int64(fp), - Timestamp: int64(timestamp), + Fingerprint: uint64(fp), + Timestamp: uint64(timestamp), Value: float64(value), }) } diff --git a/pkg/ingester/user_state.go b/pkg/ingester/user_state.go index 074127227e7..c904ad111fa 100644 --- a/pkg/ingester/user_state.go +++ b/pkg/ingester/user_state.go @@ -230,7 +230,7 @@ func (u *userState) createSeriesWithFingerprint(fp model.Fingerprint, metric lab if record != nil { record.Labels = append(record.Labels, Labels{ - Fingerprint: int64(fp), + Fingerprint: uint64(fp), Labels: newLabelPairsFromLabelAdapters(metric), }) } diff --git a/pkg/ingester/wal.go b/pkg/ingester/wal.go index 8d90f8c0c5f..e37c241dc35 100644 --- a/pkg/ingester/wal.go +++ b/pkg/ingester/wal.go @@ -185,7 +185,7 @@ func (w *wrapper) checkpointSeries(userID string, fp model.Fingerprint, series * buf, err := proto.Marshal(&Series{ UserId: userID, - Fingerprint: int64(fp), + 
Fingerprint: uint64(fp), Labels: series.labels(), Chunks: wireChunks, }) diff --git a/pkg/ingester/wal.proto b/pkg/ingester/wal.proto index 0d21be2687f..51bb2a30eed 100644 --- a/pkg/ingester/wal.proto +++ b/pkg/ingester/wal.proto @@ -14,19 +14,19 @@ message Record { } message Labels { - int64 fingerprint = 1; + uint64 fingerprint = 1; repeated cortex.LabelPair labels = 2 [(gogoproto.nullable) = false]; } message Sample { - int64 fingerprint = 1; - int64 timestamp = 2; + uint64 fingerprint = 1; + uint64 timestamp = 2; double value = 3; } message Series { string user_id = 1; - int64 fingerprint = 2; + uint64 fingerprint = 2; repeated cortex.LabelPair labels = 3 [(gogoproto.nullable) = false]; repeated cortex.Chunk chunks = 4 [(gogoproto.nullable) = false]; } diff --git a/vendor/modules.txt b/vendor/modules.txt index d2dd5064b14..59e076ff871 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -357,10 +357,10 @@ github.com/prometheus/prometheus/pkg/logging github.com/prometheus/prometheus/util/treecache # github.com/prometheus/tsdb v0.7.2-0.20190506134726-2ae028114c89 github.com/prometheus/tsdb/chunkenc +github.com/prometheus/tsdb/wal github.com/prometheus/tsdb/labels -github.com/prometheus/tsdb github.com/prometheus/tsdb/fileutil -github.com/prometheus/tsdb/wal +github.com/prometheus/tsdb github.com/prometheus/tsdb/chunks github.com/prometheus/tsdb/encoding github.com/prometheus/tsdb/errors From 05127b016acf3750c303fe7507e7054aa3445295 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Mon, 12 Aug 2019 19:05:28 +0530 Subject: [PATCH 07/56] WAL compression always enabled Signed-off-by: Ganesh Vernekar --- pkg/ingester/wal.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/ingester/wal.go b/pkg/ingester/wal.go index e37c241dc35..c75059f0ece 100644 --- a/pkg/ingester/wal.go +++ b/pkg/ingester/wal.go @@ -67,7 +67,7 @@ func newWAL(cfg WALConfig, ingester *Ingester) (WAL, error) { if cfg.metricsRegisterer != nil { samplesRegistry = 
prometheus.WrapRegistererWith(prometheus.Labels{"kind": "samples"}, cfg.metricsRegisterer) } - samples, err := wal.New(util.Logger, samplesRegistry, path.Join(cfg.dir, "samples")) + samples, err := wal.New(util.Logger, samplesRegistry, path.Join(cfg.dir, "samples"), true) if err != nil { return nil, err } @@ -76,7 +76,7 @@ func newWAL(cfg WALConfig, ingester *Ingester) (WAL, error) { if cfg.metricsRegisterer != nil { checkpointsRegistry = prometheus.WrapRegistererWith(prometheus.Labels{"kind": "checkpoints"}, cfg.metricsRegisterer) } - checkpoints, err := wal.New(util.Logger, checkpointsRegistry, path.Join(cfg.dir, "checkpoints")) + checkpoints, err := wal.New(util.Logger, checkpointsRegistry, path.Join(cfg.dir, "checkpoints"), true) if err != nil { return nil, err } From 787fbb81c7874c8310b0c2e7c9d099da1c536ca4 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Tue, 13 Aug 2019 16:09:12 +0530 Subject: [PATCH 08/56] Change checkpoint logic to be more like prometheus's tsdb Signed-off-by: Ganesh Vernekar --- pkg/ingester/wal.go | 139 +++++++++++++++++++++++++++++++---------- pkg/ingester/wal.pb.go | 46 +++++++------- vendor/modules.txt | 4 +- 3 files changed, 130 insertions(+), 59 deletions(-) diff --git a/pkg/ingester/wal.go b/pkg/ingester/wal.go index c75059f0ece..b1781590378 100644 --- a/pkg/ingester/wal.go +++ b/pkg/ingester/wal.go @@ -2,14 +2,23 @@ package ingester import ( "flag" + "fmt" + "io/ioutil" + "os" "path" + "path/filepath" + "strconv" + "strings" "sync" "time" "github.com/go-kit/kit/log/level" "github.com/golang/protobuf/proto" + "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/model" + tsdb_errors "github.com/prometheus/tsdb/errors" + "github.com/prometheus/tsdb/fileutil" "github.com/prometheus/tsdb/wal" "github.com/cortexproject/cortex/pkg/util" @@ -52,10 +61,8 @@ type wrapper struct { quit chan struct{} wait sync.WaitGroup - lastCheckpointSegment int - lastSamplesSegment int - samples 
*wal.WAL - checkpoints *wal.WAL + lastWalSegment int + wal *wal.WAL } func newWAL(cfg WALConfig, ingester *Ingester) (WAL, error) { @@ -72,21 +79,11 @@ func newWAL(cfg WALConfig, ingester *Ingester) (WAL, error) { return nil, err } - var checkpointsRegistry prometheus.Registerer - if cfg.metricsRegisterer != nil { - checkpointsRegistry = prometheus.WrapRegistererWith(prometheus.Labels{"kind": "checkpoints"}, cfg.metricsRegisterer) - } - checkpoints, err := wal.New(util.Logger, checkpointsRegistry, path.Join(cfg.dir, "checkpoints"), true) - if err != nil { - return nil, err - } - w := &wrapper{ - cfg: cfg, - ingester: ingester, - quit: make(chan struct{}), - samples: samples, - checkpoints: checkpoints, + cfg: cfg, + ingester: ingester, + quit: make(chan struct{}), + wal: samples, } w.wait.Add(1) @@ -98,8 +95,7 @@ func (w *wrapper) Stop() { close(w.quit) w.wait.Wait() - w.samples.Close() - w.checkpoints.Close() + w.wal.Close() } func (w *wrapper) Log(record *Record) error { @@ -107,13 +103,14 @@ func (w *wrapper) Log(record *Record) error { if err != nil { return err } - return w.samples.Log(buf) + return w.wal.Log(buf) } func (w *wrapper) run() { defer w.wait.Done() for !w.isStopped() { + // TODO: add metrics from checkpoint success/failure. if err := w.checkpoint(); err != nil { level.Error(util.Logger).Log("msg", "Error checkpointing series", "err", err) continue @@ -135,7 +132,31 @@ func (w *wrapper) isStopped() bool { } } +const checkpointPrefix = "checkpoint." 
+ func (w *wrapper) checkpoint() error { + _, last, err := w.lastCheckpoint() + if err != nil { + return err + } + + newIdx := last + 1 + + cpdir := filepath.Join(w.wal.Dir(), fmt.Sprintf(checkpointPrefix+"%06d", newIdx)) + cpdirtmp := cpdir + ".tmp" + + if err := os.MkdirAll(cpdirtmp, 0777); err != nil { + return errors.Wrap(err, "create checkpoint dir") + } + cp, err := wal.New(nil, nil, cpdirtmp, true) + if err != nil { + return errors.Wrap(err, "open checkpoint") + } + defer func() { + cp.Close() + os.RemoveAll(cpdirtmp) + }() + // Count number of series - we'll use this to rate limit checkpoints. numSeries := 0 for _, state := range w.ingester.userStates.cp() { @@ -151,7 +172,7 @@ func (w *wrapper) checkpoint() error { for userID, state := range w.ingester.userStates.cp() { for pair := range state.fpToSeries.iter() { state.fpLocker.Lock(pair.fp) - err := w.checkpointSeries(userID, pair.fp, pair.series) + err := w.checkpointSeries(cp, userID, pair.fp, pair.series) state.fpLocker.Unlock(pair.fp) if err != nil { return err @@ -164,20 +185,70 @@ func (w *wrapper) checkpoint() error { } } - // Remove the previous checkpoint. - _, last, err := w.checkpoints.Segments() - if err != nil { - return err + if err := cp.Close(); err != nil { + return errors.Wrap(err, "close checkpoint") } - if err := w.checkpoints.Truncate(w.lastCheckpointSegment); err != nil { - return err + if err := fileutil.Replace(cpdirtmp, cpdir); err != nil { + return errors.Wrap(err, "rename checkpoint directory") + } + + if last >= 0 { + return w.deleteCheckpoints(last) } - w.lastCheckpointSegment = last return nil } -func (w *wrapper) checkpointSeries(userID string, fp model.Fingerprint, series *memorySeries) error { +// lastCheckpoint returns the directory name and index of the most recent checkpoint. +// If dir does not contain any checkpoints, -1 is returned as index. 
+func (w *wrapper) lastCheckpoint() (string, int, error) { + files, err := ioutil.ReadDir(w.wal.Dir()) + if err != nil { + return "", -1, err + } + // Traverse list backwards since there may be multiple checkpoints left. + for i := len(files) - 1; i >= 0; i-- { + fi := files[i] + + if !strings.HasPrefix(fi.Name(), checkpointPrefix) { + continue + } + if !fi.IsDir() { + return "", -1, fmt.Errorf("checkpoint %s is not a directory", fi.Name()) + } + idx, err := strconv.Atoi(fi.Name()[len(checkpointPrefix):]) + if err != nil { + continue + } + return filepath.Join(w.wal.Dir(), fi.Name()), idx, nil + } + return "", -1, nil +} + +// deleteCheckpoints deletes all checkpoints in a directory below a given index. +func (w *wrapper) deleteCheckpoints(maxIndex int) error { + var errs tsdb_errors.MultiError + + files, err := ioutil.ReadDir(w.wal.Dir()) + if err != nil { + return err + } + for _, fi := range files { + if !strings.HasPrefix(fi.Name(), checkpointPrefix) { + continue + } + index, err := strconv.Atoi(fi.Name()[len(checkpointPrefix):]) + if err != nil || index >= maxIndex { + continue + } + if err := os.RemoveAll(filepath.Join(w.wal.Dir(), fi.Name())); err != nil { + errs.Add(err) + } + } + return errs.Err() +} + +func (w *wrapper) checkpointSeries(cp *wal.WAL, userID string, fp model.Fingerprint, series *memorySeries) error { wireChunks, err := toWireChunks(series.chunkDescs) if err != nil { return err @@ -193,20 +264,20 @@ func (w *wrapper) checkpointSeries(userID string, fp model.Fingerprint, series * return err } - return w.checkpoints.Log(buf) + return cp.Log(buf) } // truncateSamples removed the wal from before the checkpoint. 
func (w *wrapper) truncateSamples() error { - _, last, err := w.samples.Segments() + _, last, err := w.wal.Segments() if err != nil { return err } - if err := w.samples.Truncate(w.lastSamplesSegment); err != nil { + if err := w.wal.Truncate(w.lastWalSegment); err != nil { return err } - w.lastSamplesSegment = last + w.lastWalSegment = last return nil } diff --git a/pkg/ingester/wal.pb.go b/pkg/ingester/wal.pb.go index 19cd02ad65e..54ef840ca1d 100644 --- a/pkg/ingester/wal.pb.go +++ b/pkg/ingester/wal.pb.go @@ -86,7 +86,7 @@ func (m *Record) GetSamples() []Sample { } type Labels struct { - Fingerprint int64 `protobuf:"varint,1,opt,name=fingerprint,proto3" json:"fingerprint,omitempty"` + Fingerprint uint64 `protobuf:"varint,1,opt,name=fingerprint,proto3" json:"fingerprint,omitempty"` Labels []client.LabelPair `protobuf:"bytes,2,rep,name=labels,proto3" json:"labels"` } @@ -122,7 +122,7 @@ func (m *Labels) XXX_DiscardUnknown() { var xxx_messageInfo_Labels proto.InternalMessageInfo -func (m *Labels) GetFingerprint() int64 { +func (m *Labels) GetFingerprint() uint64 { if m != nil { return m.Fingerprint } @@ -137,8 +137,8 @@ func (m *Labels) GetLabels() []client.LabelPair { } type Sample struct { - Fingerprint int64 `protobuf:"varint,1,opt,name=fingerprint,proto3" json:"fingerprint,omitempty"` - Timestamp int64 `protobuf:"varint,2,opt,name=timestamp,proto3" json:"timestamp,omitempty"` + Fingerprint uint64 `protobuf:"varint,1,opt,name=fingerprint,proto3" json:"fingerprint,omitempty"` + Timestamp uint64 `protobuf:"varint,2,opt,name=timestamp,proto3" json:"timestamp,omitempty"` Value float64 `protobuf:"fixed64,3,opt,name=value,proto3" json:"value,omitempty"` } @@ -174,14 +174,14 @@ func (m *Sample) XXX_DiscardUnknown() { var xxx_messageInfo_Sample proto.InternalMessageInfo -func (m *Sample) GetFingerprint() int64 { +func (m *Sample) GetFingerprint() uint64 { if m != nil { return m.Fingerprint } return 0 } -func (m *Sample) GetTimestamp() int64 { +func (m *Sample) 
GetTimestamp() uint64 { if m != nil { return m.Timestamp } @@ -197,7 +197,7 @@ func (m *Sample) GetValue() float64 { type Series struct { UserId string `protobuf:"bytes,1,opt,name=user_id,json=userId,proto3" json:"user_id,omitempty"` - Fingerprint int64 `protobuf:"varint,2,opt,name=fingerprint,proto3" json:"fingerprint,omitempty"` + Fingerprint uint64 `protobuf:"varint,2,opt,name=fingerprint,proto3" json:"fingerprint,omitempty"` Labels []client.LabelPair `protobuf:"bytes,3,rep,name=labels,proto3" json:"labels"` Chunks []client.Chunk `protobuf:"bytes,4,rep,name=chunks,proto3" json:"chunks"` } @@ -241,7 +241,7 @@ func (m *Series) GetUserId() string { return "" } -func (m *Series) GetFingerprint() int64 { +func (m *Series) GetFingerprint() uint64 { if m != nil { return m.Fingerprint } @@ -287,17 +287,17 @@ var fileDescriptor_ae6364fc8077884f = []byte{ 0xc2, 0x9f, 0x63, 0xec, 0x22, 0x98, 0x04, 0x21, 0x4d, 0xa4, 0xd5, 0xb2, 0xcd, 0x41, 0xef, 0xcb, 0x6b, 0xb7, 0x89, 0x74, 0x7f, 0x69, 0x7e, 0xd8, 0x9e, 0x5f, 0x7f, 0x30, 0xfc, 0x5a, 0x85, 0x3f, 0xa3, 0x8e, 0x0c, 0xd2, 0x3c, 0xa1, 0xd2, 0x32, 0xf7, 0x0d, 0x23, 0x5d, 0xa8, 0x0d, 0x8d, 0xcc, - 0xf9, 0x8b, 0x60, 0x95, 0x84, 0x6d, 0xd4, 0x9b, 0x6c, 0xc5, 0x22, 0x17, 0x2c, 0x53, 0x7a, 0x10, - 0xd3, 0xdf, 0xa5, 0xb0, 0xb7, 0x37, 0xcd, 0x1b, 0xb7, 0x5e, 0x48, 0x27, 0xfc, 0x0e, 0x98, 0x78, - 0x3c, 0x8e, 0xf3, 0x0f, 0xc1, 0xaa, 0xeb, 0x01, 0xe1, 0xef, 0x51, 0x57, 0xb1, 0x94, 0x4a, 0x15, - 0xa4, 0xb9, 0xd5, 0xd2, 0xf5, 0x07, 0x02, 0xbf, 0x45, 0x2f, 0x8e, 0x83, 0xa4, 0xa0, 0x96, 0x69, - 0x83, 0x01, 0xf0, 0x2b, 0xe0, 0x5c, 0x02, 0x04, 0x47, 0x54, 0x30, 0x2a, 0x9f, 0xfe, 0xc2, 0xbd, - 0xce, 0xad, 0xe7, 0xd6, 0x32, 0x0f, 0x5a, 0x0b, 0x7f, 0x44, 0x30, 0x9a, 0x16, 0xd9, 0x4c, 0x5a, - 0x6d, 0x6d, 0x78, 0xd5, 0x18, 0x7e, 0x6c, 0xd9, 0x46, 0x5c, 0x49, 0x86, 0xdf, 0x16, 0x2b, 0x62, - 0x2c, 0x57, 0xc4, 0xd8, 0xac, 0x08, 0x38, 0x2b, 0x09, 0xb8, 0x2a, 0x09, 0x98, 0x97, 0x04, 0x2c, - 0x4a, 0x02, 0x6e, 0x4a, 0x02, 0x6e, 0x4b, 0x62, 0x6c, 0x4a, 
0x02, 0x2e, 0xd6, 0xc4, 0x58, 0xac, - 0x89, 0xb1, 0x5c, 0x13, 0xe3, 0xcf, 0xfd, 0x5d, 0x86, 0x50, 0x5f, 0xcb, 0xd7, 0xbb, 0x00, 0x00, - 0x00, 0xff, 0xff, 0xb5, 0x04, 0xfd, 0xfb, 0xb5, 0x02, 0x00, 0x00, + 0xf9, 0x8b, 0x60, 0x95, 0x84, 0x6d, 0xd4, 0x9b, 0x6c, 0xc5, 0x22, 0x17, 0x2c, 0x53, 0x7a, 0x90, + 0xb6, 0xbf, 0x4b, 0x61, 0x6f, 0x6f, 0x9a, 0x37, 0x6e, 0xbd, 0x90, 0x4e, 0xf8, 0x1d, 0x30, 0xf1, + 0x78, 0x1c, 0xe7, 0x1f, 0x82, 0x55, 0xd7, 0x03, 0xc2, 0xdf, 0xa3, 0xae, 0x62, 0x29, 0x95, 0x2a, + 0x48, 0x73, 0xab, 0xa5, 0xeb, 0x0f, 0x04, 0x7e, 0x8b, 0x5e, 0x1c, 0x07, 0x49, 0x41, 0x2d, 0xd3, + 0x06, 0x03, 0xe0, 0x57, 0xc0, 0xb9, 0x04, 0x08, 0x8e, 0xa8, 0x60, 0x54, 0x3e, 0xfd, 0x85, 0x7b, + 0x9d, 0x5b, 0xcf, 0xad, 0x65, 0x1e, 0xb4, 0x16, 0xfe, 0x88, 0x60, 0x34, 0x2d, 0xb2, 0x99, 0xb4, + 0xda, 0xda, 0xf0, 0xaa, 0x31, 0xfc, 0xd8, 0xb2, 0x8d, 0xb8, 0x92, 0x0c, 0xbf, 0x2d, 0x56, 0xc4, + 0x58, 0xae, 0x88, 0xb1, 0x59, 0x11, 0x70, 0x56, 0x12, 0x70, 0x55, 0x12, 0x30, 0x2f, 0x09, 0x58, + 0x94, 0x04, 0xdc, 0x94, 0x04, 0xdc, 0x96, 0xc4, 0xd8, 0x94, 0x04, 0x5c, 0xac, 0x89, 0xb1, 0x58, + 0x13, 0x63, 0xb9, 0x26, 0xc6, 0x9f, 0xfb, 0xbb, 0x0c, 0xa1, 0xbe, 0x96, 0xaf, 0x77, 0x01, 0x00, + 0x00, 0xff, 0xff, 0x63, 0xe6, 0x8a, 0xe9, 0xb5, 0x02, 0x00, 0x00, } func (this *Record) Equal(that interface{}) bool { @@ -1064,7 +1064,7 @@ func (m *Labels) Unmarshal(dAtA []byte) error { } b := dAtA[iNdEx] iNdEx++ - m.Fingerprint |= int64(b&0x7F) << shift + m.Fingerprint |= uint64(b&0x7F) << shift if b < 0x80 { break } @@ -1170,7 +1170,7 @@ func (m *Sample) Unmarshal(dAtA []byte) error { } b := dAtA[iNdEx] iNdEx++ - m.Fingerprint |= int64(b&0x7F) << shift + m.Fingerprint |= uint64(b&0x7F) << shift if b < 0x80 { break } @@ -1189,7 +1189,7 @@ func (m *Sample) Unmarshal(dAtA []byte) error { } b := dAtA[iNdEx] iNdEx++ - m.Timestamp |= int64(b&0x7F) << shift + m.Timestamp |= uint64(b&0x7F) << shift if b < 0x80 { break } @@ -1304,7 +1304,7 @@ func (m *Series) Unmarshal(dAtA []byte) error { } b := dAtA[iNdEx] 
iNdEx++ - m.Fingerprint |= int64(b&0x7F) << shift + m.Fingerprint |= uint64(b&0x7F) << shift if b < 0x80 { break } diff --git a/vendor/modules.txt b/vendor/modules.txt index 6e655a5b144..7abfb9b7a03 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -363,13 +363,13 @@ github.com/prometheus/prometheus/pkg/logging github.com/prometheus/prometheus/util/treecache # github.com/prometheus/tsdb v0.9.1 github.com/prometheus/tsdb/chunkenc +github.com/prometheus/tsdb/errors +github.com/prometheus/tsdb/fileutil github.com/prometheus/tsdb/wal github.com/prometheus/tsdb/labels -github.com/prometheus/tsdb/fileutil github.com/prometheus/tsdb github.com/prometheus/tsdb/chunks github.com/prometheus/tsdb/encoding -github.com/prometheus/tsdb/errors github.com/prometheus/tsdb/goversion github.com/prometheus/tsdb/index # github.com/samuel/go-zookeeper v0.0.0-20161028232340-1d7be4effb13 From 9845a5e2d1742de20ed126d5ad0f91515c132fb8 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Tue, 13 Aug 2019 19:41:12 +0530 Subject: [PATCH 09/56] Add metrics for checkpoint and name changes Signed-off-by: Ganesh Vernekar --- pkg/ingester/wal.go | 74 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 59 insertions(+), 15 deletions(-) diff --git a/pkg/ingester/wal.go b/pkg/ingester/wal.go index b1781590378..155100f636c 100644 --- a/pkg/ingester/wal.go +++ b/pkg/ingester/wal.go @@ -55,7 +55,7 @@ func (noop) Log(*Record) error { // Stop any background WAL processes. func (noop) Stop() {} -type wrapper struct { +type walWrapper struct { cfg WALConfig ingester *Ingester quit chan struct{} @@ -63,6 +63,12 @@ type wrapper struct { lastWalSegment int wal *wal.WAL + + // Checkpoint metrics. 
+ checkpointDeleteFail prometheus.Counter + checkpointDeleteTotal prometheus.Counter + checkpointCreationFail prometheus.Counter + checkpointCreationTotal prometheus.Counter } func newWAL(cfg WALConfig, ingester *Ingester) (WAL, error) { @@ -70,20 +76,45 @@ func newWAL(cfg WALConfig, ingester *Ingester) (WAL, error) { return &noop{}, nil } - var samplesRegistry prometheus.Registerer + var walRegistry prometheus.Registerer if cfg.metricsRegisterer != nil { - samplesRegistry = prometheus.WrapRegistererWith(prometheus.Labels{"kind": "samples"}, cfg.metricsRegisterer) + walRegistry = prometheus.WrapRegistererWith(prometheus.Labels{"kind": "wal"}, cfg.metricsRegisterer) } - samples, err := wal.New(util.Logger, samplesRegistry, path.Join(cfg.dir, "samples"), true) + tsdbWAL, err := wal.New(util.Logger, walRegistry, path.Join(cfg.dir, "wal"), true) if err != nil { return nil, err } - w := &wrapper{ + w := &walWrapper{ cfg: cfg, ingester: ingester, quit: make(chan struct{}), - wal: samples, + wal: tsdbWAL, + } + + w.checkpointDeleteFail = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "ingester_checkpoint_deletions_failed_total", + Help: "Total number of checkpoint deletions that failed.", + }) + w.checkpointDeleteTotal = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "ingester_checkpoint_deletions_total", + Help: "Total number of checkpoint deletions attempted.", + }) + w.checkpointCreationFail = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "ingester_checkpoint_creations_failed_total", + Help: "Total number of checkpoint creations that failed.", + }) + w.checkpointCreationTotal = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "ingester_checkpoint_creations_total", + Help: "Total number of checkpoint creations attempted.", + }) + if cfg.metricsRegisterer != nil { + cfg.metricsRegisterer.MustRegister( + w.checkpointDeleteFail, + w.checkpointDeleteTotal, + w.checkpointCreationFail, + w.checkpointCreationTotal, + ) } w.wait.Add(1) @@ -91,14 
+122,14 @@ func newWAL(cfg WALConfig, ingester *Ingester) (WAL, error) { return w, nil } -func (w *wrapper) Stop() { +func (w *walWrapper) Stop() { close(w.quit) w.wait.Wait() w.wal.Close() } -func (w *wrapper) Log(record *Record) error { +func (w *walWrapper) Log(record *Record) error { buf, err := proto.Marshal(record) if err != nil { return err @@ -106,7 +137,7 @@ func (w *wrapper) Log(record *Record) error { return w.wal.Log(buf) } -func (w *wrapper) run() { +func (w *walWrapper) run() { defer w.wait.Done() for !w.isStopped() { @@ -123,7 +154,7 @@ func (w *wrapper) run() { } } -func (w *wrapper) isStopped() bool { +func (w *walWrapper) isStopped() bool { select { case <-w.quit: return true @@ -134,7 +165,13 @@ func (w *wrapper) isStopped() bool { const checkpointPrefix = "checkpoint." -func (w *wrapper) checkpoint() error { +func (w *walWrapper) checkpoint() (err error) { + w.checkpointCreationTotal.Inc() + defer func() { + if err != nil { + w.checkpointCreationFail.Inc() + } + }() _, last, err := w.lastCheckpoint() if err != nil { return err @@ -201,7 +238,7 @@ func (w *wrapper) checkpoint() error { // lastCheckpoint returns the directory name and index of the most recent checkpoint. // If dir does not contain any checkpoints, -1 is returned as index. -func (w *wrapper) lastCheckpoint() (string, int, error) { +func (w *walWrapper) lastCheckpoint() (string, int, error) { files, err := ioutil.ReadDir(w.wal.Dir()) if err != nil { return "", -1, err @@ -226,7 +263,14 @@ func (w *wrapper) lastCheckpoint() (string, int, error) { } // deleteCheckpoints deletes all checkpoints in a directory below a given index. 
-func (w *wrapper) deleteCheckpoints(maxIndex int) error { +func (w *walWrapper) deleteCheckpoints(maxIndex int) (err error) { + w.checkpointDeleteTotal.Inc() + defer func() { + if err != nil { + w.checkpointDeleteFail.Inc() + } + }() + var errs tsdb_errors.MultiError files, err := ioutil.ReadDir(w.wal.Dir()) @@ -248,7 +292,7 @@ func (w *wrapper) deleteCheckpoints(maxIndex int) error { return errs.Err() } -func (w *wrapper) checkpointSeries(cp *wal.WAL, userID string, fp model.Fingerprint, series *memorySeries) error { +func (w *walWrapper) checkpointSeries(cp *wal.WAL, userID string, fp model.Fingerprint, series *memorySeries) error { wireChunks, err := toWireChunks(series.chunkDescs) if err != nil { return err @@ -268,7 +312,7 @@ func (w *wrapper) checkpointSeries(cp *wal.WAL, userID string, fp model.Fingerpr } // truncateSamples removed the wal from before the checkpoint. -func (w *wrapper) truncateSamples() error { +func (w *walWrapper) truncateSamples() error { _, last, err := w.wal.Segments() if err != nil { return err From f21f814ba32b869f384779bcce7f637ae4b57891 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Wed, 14 Aug 2019 11:36:33 +0530 Subject: [PATCH 10/56] Initial attempt for flushing chunks from WAL [WIP] Signed-off-by: Ganesh Vernekar --- pkg/ingester/wal.go | 103 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 99 insertions(+), 4 deletions(-) diff --git a/pkg/ingester/wal.go b/pkg/ingester/wal.go index acecfb9fc84..57eb3479c87 100644 --- a/pkg/ingester/wal.go +++ b/pkg/ingester/wal.go @@ -1,6 +1,7 @@ package ingester import ( + "context" "flag" "fmt" "io/ioutil" @@ -17,10 +18,12 @@ import ( "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/model" + "github.com/prometheus/prometheus/pkg/labels" tsdb_errors "github.com/prometheus/tsdb/errors" "github.com/prometheus/tsdb/fileutil" "github.com/prometheus/tsdb/wal" + "github.com/cortexproject/cortex/pkg/ingester/client" 
"github.com/cortexproject/cortex/pkg/util" ) @@ -172,7 +175,7 @@ func (w *walWrapper) checkpoint() (err error) { w.checkpointCreationFail.Inc() } }() - _, last, err := w.lastCheckpoint() + _, last, err := lastCheckpoint(w.wal.Dir()) if err != nil { return err } @@ -238,8 +241,8 @@ func (w *walWrapper) checkpoint() (err error) { // lastCheckpoint returns the directory name and index of the most recent checkpoint. // If dir does not contain any checkpoints, -1 is returned as index. -func (w *walWrapper) lastCheckpoint() (string, int, error) { - files, err := ioutil.ReadDir(w.wal.Dir()) +func lastCheckpoint(dir string) (string, int, error) { + files, err := ioutil.ReadDir(dir) if err != nil { return "", -1, err } @@ -257,7 +260,7 @@ func (w *walWrapper) lastCheckpoint() (string, int, error) { if err != nil { continue } - return filepath.Join(w.wal.Dir(), fi.Name()), idx, nil + return filepath.Join(dir, fi.Name()), idx, nil } return "", -1, nil } @@ -325,3 +328,95 @@ func (w *walWrapper) truncateSamples() error { w.lastWalSegment = last return nil } + +func FlushFromWAL(ingester *Ingester, dir string) error { + lastCheckpointDir, _, err := lastCheckpoint(dir) + if err != nil { + return err + } + + sr, err := wal.NewSegmentsReader(lastCheckpointDir) + if err != nil { + return err + } + + series, err := loadCheckpoint(wal.NewReader(sr)) + if err != nil { + return err + } + + // Either use ingester series like normal reading and then flush once per series + // or like below - flush once for the checkpoint and 1 more time from the WAL segments + + // TODO: check about adding index entries. + + for _, s := range series { + chunkDesc, err := fromWireChunks(s.Chunks) + if err != nil { + return err + } + + // TODO: user id in the context. 
+ if err := ingester.flushChunks( + context.Background(), + model.Fingerprint(s.Fingerprint), + pbLabelPairToLabels(s.Labels), + chunkDesc, + ); err != nil { + return nil + } + } + + // TODO: Read segments from WAL which start after the checkpoint. + // Need to do some alignment with segment and checkpoint numbers for that. + sr, err = wal.NewSegmentsReader(dir) + if err != nil { + return nil + } + + records, err := loadWAL(wal.NewReader(sr)) + if err != nil { + return nil + } + + for range records { + + } + + return nil +} + +func loadCheckpoint(r *wal.Reader) (series []*Series, err error) { + for r.Next() { + rec := r.Record() + s := &Series{} + if err := proto.Unmarshal(rec, s); err != nil { + return nil, err + } + series = append(series, s) + + } + return series, r.Err() +} + +func loadWAL(r *wal.Reader) (records []*Record, err error) { + for r.Next() { + rec := r.Record() + record := &Record{} + if err := proto.Unmarshal(rec, record); err != nil { + return nil, err + } + records = append(records, record) + + } + return records, r.Err() +} + +func pbLabelPairToLabels(lps []client.LabelPair) labels.Labels { + lbls := make(labels.Labels, len(lps)) + for i := range lps { + lbls[i].Name = string(lps[i].Name) + lbls[i].Value = string(lps[i].Value) + } + return lbls +} From 46eaf362566700d8b64d546d62a7c349bbf717d6 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Wed, 14 Aug 2019 12:43:49 +0530 Subject: [PATCH 11/56] Combine checkpoint and WAL chunks before flushing Signed-off-by: Ganesh Vernekar --- pkg/ingester/wal.go | 97 +++++++++++++++++++++++++++++++++------------ 1 file changed, 72 insertions(+), 25 deletions(-) diff --git a/pkg/ingester/wal.go b/pkg/ingester/wal.go index 57eb3479c87..60cff181c33 100644 --- a/pkg/ingester/wal.go +++ b/pkg/ingester/wal.go @@ -22,6 +22,7 @@ import ( tsdb_errors "github.com/prometheus/tsdb/errors" "github.com/prometheus/tsdb/fileutil" "github.com/prometheus/tsdb/wal" + "github.com/weaveworks/common/user" 
"github.com/cortexproject/cortex/pkg/ingester/client" "github.com/cortexproject/cortex/pkg/util" @@ -329,61 +330,107 @@ func (w *walWrapper) truncateSamples() error { return nil } -func FlushFromWAL(ingester *Ingester, dir string) error { - lastCheckpointDir, _, err := lastCheckpoint(dir) +func flushChunksFromWAL(ingester *Ingester, dir string) error { + allSeries, err := getSeriesFromWAL(dir) if err != nil { return err } + for userID, seriesMap := range allSeries { + ctx := user.InjectUserID(context.Background(), userID) + for fingerprint, ms := range seriesMap { + if err := ingester.flushChunks( + ctx, + model.Fingerprint(fingerprint), + pbLabelPairToLabels(ms.labels()), + ms.chunkDescs, + ); err != nil { + return err + } + } + } + + return nil +} + +func getSeriesFromWAL(dir string) (map[string]map[uint64]*memorySeries, error) { + // TODO: check about adding index entries. + // Map of user_id -> series_fingerprint -> *memorySeries + allSeries := make(map[string]map[uint64]*memorySeries) + + // Build the series from the last checkpoint. + lastCheckpointDir, _, err := lastCheckpoint(dir) + if err != nil { + return allSeries, err + } + sr, err := wal.NewSegmentsReader(lastCheckpointDir) if err != nil { - return err + return allSeries, err } series, err := loadCheckpoint(wal.NewReader(sr)) if err != nil { - return err + return allSeries, err } - // Either use ingester series like normal reading and then flush once per series - // or like below - flush once for the checkpoint and 1 more time from the WAL segments - - // TODO: check about adding index entries. - for _, s := range series { chunkDesc, err := fromWireChunks(s.Chunks) if err != nil { - return err + return allSeries, err } - // TODO: user id in the context. 
- if err := ingester.flushChunks( - context.Background(), - model.Fingerprint(s.Fingerprint), - pbLabelPairToLabels(s.Labels), - chunkDesc, - ); err != nil { - return nil + if _, ok := allSeries[s.UserId]; !ok { + allSeries[s.UserId] = make(map[uint64]*memorySeries) } + + ms := newMemorySeries(pbLabelPairToLabels(s.Labels)) + ms.setChunks(chunkDesc) + allSeries[s.UserId][s.Fingerprint] = ms } - // TODO: Read segments from WAL which start after the checkpoint. - // Need to do some alignment with segment and checkpoint numbers for that. + // Load the WAL and add the samples to the series loaded + // from the checkpoint. sr, err = wal.NewSegmentsReader(dir) if err != nil { - return nil + return allSeries, nil } - records, err := loadWAL(wal.NewReader(sr)) if err != nil { - return nil + return allSeries, nil } - for range records { + samplePair := model.SamplePair{} + for _, r := range records { + seriesMap, ok := allSeries[r.UserId] + if !ok { + seriesMap = make(map[uint64]*memorySeries) + allSeries[r.UserId] = seriesMap + } + + // Some series might not be present in the checkpoint, + // hence add them here. + for _, lbl := range r.Labels { + if _, ok := allSeries[r.UserId][lbl.Fingerprint]; !ok { + allSeries[r.UserId][lbl.Fingerprint] = newMemorySeries(pbLabelPairToLabels(lbl.Labels)) + } + } + for _, sample := range r.Samples { + // TODO: verify if sample.Timestamp is in milliseconds. + samplePair.Timestamp = model.Time(sample.Timestamp) + samplePair.Value = model.SampleValue(sample.Value) + if err := seriesMap[sample.Fingerprint].add(samplePair); err != nil { + // We can ignore memorySeriesError because duplicate (or) out-of-order samples are possible + // here because the WAL is not truncated to align with the checkpoint. 
+ if _, ok := err.(*memorySeriesError); !ok { + return allSeries, err + } + } + } } - return nil + return allSeries, nil } func loadCheckpoint(r *wal.Reader) (series []*Series, err error) { From 2b1710341fb4137df34f8c20852366ac214bcfd2 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Wed, 14 Aug 2019 17:01:07 +0530 Subject: [PATCH 12/56] Bring back recovery and tests Signed-off-by: Ganesh Vernekar --- pkg/ingester/wal.go | 123 ++++++++++++++++++++++++++++++++++++++- pkg/ingester/wal_test.go | 35 +++++------ 2 files changed, 138 insertions(+), 20 deletions(-) diff --git a/pkg/ingester/wal.go b/pkg/ingester/wal.go index 60cff181c33..085a8371316 100644 --- a/pkg/ingester/wal.go +++ b/pkg/ingester/wal.go @@ -6,7 +6,6 @@ import ( "fmt" "io/ioutil" "os" - "path" "path/filepath" "strconv" "strings" @@ -84,7 +83,7 @@ func newWAL(cfg WALConfig, ingester *Ingester) (WAL, error) { if cfg.metricsRegisterer != nil { walRegistry = prometheus.WrapRegistererWith(prometheus.Labels{"kind": "wal"}, cfg.metricsRegisterer) } - tsdbWAL, err := wal.New(util.Logger, walRegistry, path.Join(cfg.dir, "wal"), true) + tsdbWAL, err := wal.New(util.Logger, walRegistry, cfg.dir, true) if err != nil { return nil, err } @@ -417,7 +416,6 @@ func getSeriesFromWAL(dir string) (map[string]map[uint64]*memorySeries, error) { } for _, sample := range r.Samples { - // TODO: verify if sample.Timestamp is in milliseconds. samplePair.Timestamp = model.Time(sample.Timestamp) samplePair.Value = model.SampleValue(sample.Value) if err := seriesMap[sample.Fingerprint].add(samplePair); err != nil { @@ -467,3 +465,122 @@ func pbLabelPairToLabels(lps []client.LabelPair) labels.Labels { } return lbls } + +func recoverFromWal(ctx context.Context, ingester *Ingester, walDir string) (err error) { + // Use a local userStates, so we don't need to worry about locking. 
+ userStates := newUserStates(ingester.limits, ingester.cfg) + + la := []client.LabelAdapter{} + + lastCheckpointDir, idx, err := lastCheckpoint(walDir) + if err != nil { + return err + } + if idx >= 0 { + if err := recoverRecords(lastCheckpointDir, &Series{}, func(msg proto.Message) error { + walSeries := msg.(*Series) + + descs, err := fromWireChunks(walSeries.Chunks) + if err != nil { + return err + } + + state := userStates.getOrCreate(walSeries.UserId) + + la = la[:0] + for _, l := range walSeries.Labels { + la = append(la, client.LabelAdapter{ + Name: string(l.Name), + Value: string(l.Value), + }) + } + series, err := state.createSeriesWithFingerprint(model.Fingerprint(walSeries.Fingerprint), la, &Record{}) + if err != nil { + return err + } + + return series.setChunks(descs) + }); err != nil { + return err + } + } + + if err := recoverRecords(walDir, &Record{}, func(msg proto.Message) error { + record := msg.(*Record) + + state := userStates.getOrCreate(record.UserId) + + for _, labels := range record.Labels { + _, ok := state.fpToSeries.get(model.Fingerprint(labels.Fingerprint)) + if ok { + continue + } + + la = la[:0] + for _, l := range labels.Labels { + la = append(la, client.LabelAdapter{ + Name: string(l.Name), + Value: string(l.Value), + }) + } + _, err := state.createSeriesWithFingerprint(model.Fingerprint(labels.Fingerprint), la, &Record{}) + if err != nil { + return err + } + } + + for _, sample := range record.Samples { + series, ok := state.fpToSeries.get(model.Fingerprint(sample.Fingerprint)) + if !ok { + return nil + } + + err := series.add(model.SamplePair{ + Timestamp: model.Time(sample.Timestamp), + Value: model.SampleValue(sample.Value), + }) + if err != nil { + // We can ignore memorySeriesError because duplicate (or) out-of-order samples are possible + // here because the WAL is not truncated to align with the checkpoint. 
+ if _, ok := err.(*memorySeriesError); !ok { + return err + } + } + } + + return nil + }); err != nil { + return err + } + + ingester.userStatesMtx.Lock() + ingester.userStates = userStates + ingester.userStatesMtx.Unlock() + + return nil +} + +func recoverRecords(name string, ty proto.Message, callback func(proto.Message) error) error { + segmentReader, err := wal.NewSegmentsReader(name) + if err != nil { + return err + } + defer segmentReader.Close() + + reader := wal.NewReader(segmentReader) + for reader.Next() { + ty.Reset() + if err := proto.Unmarshal(reader.Record(), ty); err != nil { + return err + } + + if err := callback(ty); err != nil { + return err + } + } + if err := reader.Err(); err != nil { + return err + } + + return nil +} diff --git a/pkg/ingester/wal_test.go b/pkg/ingester/wal_test.go index dc2e5e03378..7cad4e02326 100644 --- a/pkg/ingester/wal_test.go +++ b/pkg/ingester/wal_test.go @@ -1,13 +1,14 @@ package ingester import ( - // "io/ioutil" + "context" + "io/ioutil" "os" "testing" "github.com/cortexproject/cortex/pkg/util" "github.com/go-kit/kit/log" - // "github.com/stretchr/testify/require" + "github.com/stretchr/testify/require" ) func init() { @@ -15,23 +16,23 @@ func init() { } func TestWAL(t *testing.T) { - // dirname, err := ioutil.TempDir("", "cortex-wal") - // require.NoError(t, err) + dirname, err := ioutil.TempDir("", "cortex-wal") + require.NoError(t, err) - // cfg := defaultIngesterTestConfig() - // cfg.WALConfig.enabled = true - // cfg.WALConfig.dir = dirname + cfg := defaultIngesterTestConfig() + cfg.WALConfig.enabled = true + cfg.WALConfig.dir = dirname - // // Build an ingester, add some samples, then shut it down. - // _, ing := newTestStore(t, cfg, defaultClientTestConfig(), defaultLimitsTestConfig()) - // userIDs, testData := pushTestSamples(t, ing, 10, 1000) - // ing.Shutdown() + // Build an ingester, add some samples, then shut it down. 
+ _, ing := newTestStore(t, cfg, defaultClientTestConfig(), defaultLimitsTestConfig()) + userIDs, testData := pushTestSamples(t, ing, 10, 1000) + ing.Shutdown() - // // Start a new ingester and recover the WAL. - // cfg.WALConfig.recover = true - // _, ing = newTestStore(t, cfg, defaultClientTestConfig(), defaultLimitsTestConfig()) - // defer ing.Shutdown() + // Start a new ingester and recover the WAL. + _, ing = newTestStore(t, cfg, defaultClientTestConfig(), defaultLimitsTestConfig()) + defer ing.Shutdown() + require.NoError(t, recoverFromWal(context.Background(), ing, dirname)) - // // Check the samples are still there! - // retrieveTestSamples(t, ing, userIDs, testData) + // Check the samples are still there! + retrieveTestSamples(t, ing, userIDs, testData) } From 935b73e447591097056fc1cfb56bca6a5f4e9526 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Wed, 14 Aug 2019 17:23:28 +0530 Subject: [PATCH 13/56] Fix race in the test Signed-off-by: Ganesh Vernekar --- pkg/ingester/wal_test.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/ingester/wal_test.go b/pkg/ingester/wal_test.go index 7cad4e02326..2cec7432cee 100644 --- a/pkg/ingester/wal_test.go +++ b/pkg/ingester/wal_test.go @@ -28,6 +28,7 @@ func TestWAL(t *testing.T) { userIDs, testData := pushTestSamples(t, ing, 10, 1000) ing.Shutdown() + cfg.WALConfig.enabled = false // Start a new ingester and recover the WAL. 
_, ing = newTestStore(t, cfg, defaultClientTestConfig(), defaultLimitsTestConfig()) defer ing.Shutdown() From 538f4078b3395f1d87d790e62e86f3579a0b1d53 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Mon, 19 Aug 2019 17:14:05 +0530 Subject: [PATCH 14/56] Recover on startup Signed-off-by: Ganesh Vernekar --- pkg/cortex/modules.go | 7 ++ pkg/ingester/ingester.go | 15 ++++ pkg/ingester/transfer.go | 3 +- pkg/ingester/wal.go | 146 ++------------------------------------- pkg/ingester/wal_test.go | 3 +- 5 files changed, 28 insertions(+), 146 deletions(-) diff --git a/pkg/cortex/modules.go b/pkg/cortex/modules.go index 411f19f70f0..237223e7838 100644 --- a/pkg/cortex/modules.go +++ b/pkg/cortex/modules.go @@ -225,6 +225,13 @@ func (t *Cortex) initIngester(cfg *Config) (err error) { return } + if cfg.Ingester.RecoverFromWAL { + if err := t.ingester.RecoverFromWALAndFlush(); err != nil { + return err + } + return t.stopIngester() + } + client.RegisterIngesterServer(t.server.GRPC, t.ingester) grpc_health_v1.RegisterHealthServer(t.server.GRPC, t.ingester) t.server.HTTP.Path("/ready").Handler(http.HandlerFunc(t.ingester.ReadinessHandler)) diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index 3a1a6e8ef92..c5fdaebbc59 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -119,6 +119,7 @@ type Config struct { ConcurrentFlushes int SpreadFlushes bool MinChunkLength int + RecoverFromWAL bool RateUpdatePeriod time.Duration @@ -142,6 +143,7 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) { f.IntVar(&cfg.MinChunkLength, "ingester.min-chunk-length", 0, "Minimum number of samples in an idle chunk to flush it to the store. 
Use with care, if chunks are less than this size they will be discarded.") f.IntVar(&cfg.ConcurrentFlushes, "ingester.concurrent-flushes", 50, "Number of concurrent goroutines flushing to dynamodb.") f.DurationVar(&cfg.RateUpdatePeriod, "ingester.rate-update-period", 15*time.Second, "Period with which to update the per-user ingestion rates.") + f.BoolVar(&cfg.RecoverFromWAL, "ingester.recover-from-wal", false, "If true, ingesters recovers from WAL and quits.") } // Ingester deals with "in flight" chunks. Based on Prometheus 1.x @@ -185,6 +187,9 @@ func New(cfg Config, clientConfig client.Config, limits *validation.Overrides, c if cfg.ingesterClientFactory == nil { cfg.ingesterClientFactory = client.MakeIngesterClient } + if cfg.RecoverFromWAL { + cfg.WALConfig.enabled = false + } i := &Ingester{ cfg: cfg, @@ -651,3 +656,13 @@ func (i *Ingester) ReadinessHandler(w http.ResponseWriter, r *http.Request) { http.Error(w, "Not ready: "+err.Error(), http.StatusServiceUnavailable) } } + +// RecoverFromWALAndFlush recovers chunks from the WAL with directory pointed by +// the WALConfig and then flushes it to the chunk store. +func (i *Ingester) RecoverFromWALAndFlush() error { + if err := recoverFromWAL(i); err != nil { + return err + } + i.Flush() + return nil +} diff --git a/pkg/ingester/transfer.go b/pkg/ingester/transfer.go index e2b8ae66104..867b33c2af2 100644 --- a/pkg/ingester/transfer.go +++ b/pkg/ingester/transfer.go @@ -89,8 +89,7 @@ func (i *Ingester) TransferChunks(stream client.Ingester_TransferChunksServer) e return err } - var record Record // for the WAL, not used. 
- state, fp, series, err := userStates.getOrCreateSeries(userCtx, wireSeries.Labels, &record) + state, fp, series, err := userStates.getOrCreateSeries(userCtx, wireSeries.Labels, nil) if err != nil { return err } diff --git a/pkg/ingester/wal.go b/pkg/ingester/wal.go index 085a8371316..ffb8d3dfc6f 100644 --- a/pkg/ingester/wal.go +++ b/pkg/ingester/wal.go @@ -1,7 +1,6 @@ package ingester import ( - "context" "flag" "fmt" "io/ioutil" @@ -17,11 +16,9 @@ import ( "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/model" - "github.com/prometheus/prometheus/pkg/labels" tsdb_errors "github.com/prometheus/tsdb/errors" "github.com/prometheus/tsdb/fileutil" "github.com/prometheus/tsdb/wal" - "github.com/weaveworks/common/user" "github.com/cortexproject/cortex/pkg/ingester/client" "github.com/cortexproject/cortex/pkg/util" @@ -265,7 +262,7 @@ func lastCheckpoint(dir string) (string, int, error) { return "", -1, nil } -// deleteCheckpoints deletes all checkpoints in a directory below a given index. +// deleteCheckpoints deletes all checkpoints in a directory which is <= maxIndex. func (w *walWrapper) deleteCheckpoints(maxIndex int) (err error) { w.checkpointDeleteTotal.Inc() defer func() { @@ -329,144 +326,8 @@ func (w *walWrapper) truncateSamples() error { return nil } -func flushChunksFromWAL(ingester *Ingester, dir string) error { - allSeries, err := getSeriesFromWAL(dir) - if err != nil { - return err - } - - for userID, seriesMap := range allSeries { - ctx := user.InjectUserID(context.Background(), userID) - for fingerprint, ms := range seriesMap { - if err := ingester.flushChunks( - ctx, - model.Fingerprint(fingerprint), - pbLabelPairToLabels(ms.labels()), - ms.chunkDescs, - ); err != nil { - return err - } - } - } - - return nil -} - -func getSeriesFromWAL(dir string) (map[string]map[uint64]*memorySeries, error) { - // TODO: check about adding index entries. 
- // Map of user_id -> series_fingerprint -> *memorySeries - allSeries := make(map[string]map[uint64]*memorySeries) - - // Build the series from the last checkpoint. - lastCheckpointDir, _, err := lastCheckpoint(dir) - if err != nil { - return allSeries, err - } - - sr, err := wal.NewSegmentsReader(lastCheckpointDir) - if err != nil { - return allSeries, err - } - - series, err := loadCheckpoint(wal.NewReader(sr)) - if err != nil { - return allSeries, err - } - - for _, s := range series { - chunkDesc, err := fromWireChunks(s.Chunks) - if err != nil { - return allSeries, err - } - - if _, ok := allSeries[s.UserId]; !ok { - allSeries[s.UserId] = make(map[uint64]*memorySeries) - } - - ms := newMemorySeries(pbLabelPairToLabels(s.Labels)) - ms.setChunks(chunkDesc) - allSeries[s.UserId][s.Fingerprint] = ms - } - - // Load the WAL and add the samples to the series loaded - // from the checkpoint. - sr, err = wal.NewSegmentsReader(dir) - if err != nil { - return allSeries, nil - } - records, err := loadWAL(wal.NewReader(sr)) - if err != nil { - return allSeries, nil - } - - samplePair := model.SamplePair{} - for _, r := range records { - seriesMap, ok := allSeries[r.UserId] - if !ok { - seriesMap = make(map[uint64]*memorySeries) - allSeries[r.UserId] = seriesMap - } - - // Some series might not be present in the checkpoint, - // hence add them here. - for _, lbl := range r.Labels { - if _, ok := allSeries[r.UserId][lbl.Fingerprint]; !ok { - allSeries[r.UserId][lbl.Fingerprint] = newMemorySeries(pbLabelPairToLabels(lbl.Labels)) - } - } - - for _, sample := range r.Samples { - samplePair.Timestamp = model.Time(sample.Timestamp) - samplePair.Value = model.SampleValue(sample.Value) - if err := seriesMap[sample.Fingerprint].add(samplePair); err != nil { - // We can ignore memorySeriesError because duplicate (or) out-of-order samples are possible - // here because the WAL is not truncated to align with the checkpoint. 
- if _, ok := err.(*memorySeriesError); !ok { - return allSeries, err - } - } - } - } - - return allSeries, nil -} - -func loadCheckpoint(r *wal.Reader) (series []*Series, err error) { - for r.Next() { - rec := r.Record() - s := &Series{} - if err := proto.Unmarshal(rec, s); err != nil { - return nil, err - } - series = append(series, s) - - } - return series, r.Err() -} - -func loadWAL(r *wal.Reader) (records []*Record, err error) { - for r.Next() { - rec := r.Record() - record := &Record{} - if err := proto.Unmarshal(rec, record); err != nil { - return nil, err - } - records = append(records, record) - - } - return records, r.Err() -} - -func pbLabelPairToLabels(lps []client.LabelPair) labels.Labels { - lbls := make(labels.Labels, len(lps)) - for i := range lps { - lbls[i].Name = string(lps[i].Name) - lbls[i].Value = string(lps[i].Value) - } - return lbls -} - -func recoverFromWal(ctx context.Context, ingester *Ingester, walDir string) (err error) { +func recoverFromWAL(ingester *Ingester) (err error) { + walDir := ingester.cfg.WALConfig.dir // Use a local userStates, so we don't need to worry about locking. userStates := newUserStates(ingester.limits, ingester.cfg) @@ -477,6 +338,7 @@ func recoverFromWal(ctx context.Context, ingester *Ingester, walDir string) (err return err } if idx >= 0 { + // Checkpoint exists. if err := recoverRecords(lastCheckpointDir, &Series{}, func(msg proto.Message) error { walSeries := msg.(*Series) diff --git a/pkg/ingester/wal_test.go b/pkg/ingester/wal_test.go index 589c15ed8c4..a4eef4c2f4f 100644 --- a/pkg/ingester/wal_test.go +++ b/pkg/ingester/wal_test.go @@ -1,7 +1,6 @@ package ingester import ( - "context" "io/ioutil" "os" "testing" @@ -32,7 +31,7 @@ func TestWAL(t *testing.T) { // Start a new ingester and recover the WAL. 
_, ing = newTestStore(t, cfg, defaultClientTestConfig(), defaultLimitsTestConfig()) defer ing.Shutdown() - require.NoError(t, recoverFromWal(context.Background(), ing, dirname)) + require.NoError(t, recoverFromWAL(ing)) // Check the samples are still there! retrieveTestSamples(t, ing, userIDs, testData) From ff359481571a86b1ac8c5c060839ecdb8d35679a Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Fri, 30 Aug 2019 15:13:43 +0200 Subject: [PATCH 15/56] Dont remove the last segment in truncation Signed-off-by: Ganesh Vernekar --- pkg/ingester/wal.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pkg/ingester/wal.go b/pkg/ingester/wal.go index ffb8d3dfc6f..5c3e3417047 100644 --- a/pkg/ingester/wal.go +++ b/pkg/ingester/wal.go @@ -318,7 +318,9 @@ func (w *walWrapper) truncateSamples() error { return err } - if err := w.wal.Truncate(w.lastWalSegment); err != nil { + // The last segment might still have been active after the checpoint, + // hence delete only the segments before that. 
+ if err := w.wal.Truncate(w.lastWalSegment - 1); err != nil { return err } From 2913b9960710e3a584f8f2423adefa6584d9292d Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Tue, 3 Sep 2019 12:11:44 +0530 Subject: [PATCH 16/56] Always read WAL and remove recover-only mode Signed-off-by: Ganesh Vernekar --- pkg/cortex/modules.go | 7 ------- pkg/ingester/ingester.go | 15 --------------- pkg/ingester/wal.go | 4 ++++ 3 files changed, 4 insertions(+), 22 deletions(-) diff --git a/pkg/cortex/modules.go b/pkg/cortex/modules.go index 237223e7838..411f19f70f0 100644 --- a/pkg/cortex/modules.go +++ b/pkg/cortex/modules.go @@ -225,13 +225,6 @@ func (t *Cortex) initIngester(cfg *Config) (err error) { return } - if cfg.Ingester.RecoverFromWAL { - if err := t.ingester.RecoverFromWALAndFlush(); err != nil { - return err - } - return t.stopIngester() - } - client.RegisterIngesterServer(t.server.GRPC, t.ingester) grpc_health_v1.RegisterHealthServer(t.server.GRPC, t.ingester) t.server.HTTP.Path("/ready").Handler(http.HandlerFunc(t.ingester.ReadinessHandler)) diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index c5fdaebbc59..3a1a6e8ef92 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -119,7 +119,6 @@ type Config struct { ConcurrentFlushes int SpreadFlushes bool MinChunkLength int - RecoverFromWAL bool RateUpdatePeriod time.Duration @@ -143,7 +142,6 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) { f.IntVar(&cfg.MinChunkLength, "ingester.min-chunk-length", 0, "Minimum number of samples in an idle chunk to flush it to the store. 
Use with care, if chunks are less than this size they will be discarded.") f.IntVar(&cfg.ConcurrentFlushes, "ingester.concurrent-flushes", 50, "Number of concurrent goroutines flushing to dynamodb.") f.DurationVar(&cfg.RateUpdatePeriod, "ingester.rate-update-period", 15*time.Second, "Period with which to update the per-user ingestion rates.") - f.BoolVar(&cfg.RecoverFromWAL, "ingester.recover-from-wal", false, "If true, ingesters recovers from WAL and quits.") } // Ingester deals with "in flight" chunks. Based on Prometheus 1.x @@ -187,9 +185,6 @@ func New(cfg Config, clientConfig client.Config, limits *validation.Overrides, c if cfg.ingesterClientFactory == nil { cfg.ingesterClientFactory = client.MakeIngesterClient } - if cfg.RecoverFromWAL { - cfg.WALConfig.enabled = false - } i := &Ingester{ cfg: cfg, @@ -656,13 +651,3 @@ func (i *Ingester) ReadinessHandler(w http.ResponseWriter, r *http.Request) { http.Error(w, "Not ready: "+err.Error(), http.StatusServiceUnavailable) } } - -// RecoverFromWALAndFlush recovers chunks from the WAL with directory pointed by -// the WALConfig and then flushes it to the chunk store. 
-func (i *Ingester) RecoverFromWALAndFlush() error { - if err := recoverFromWAL(i); err != nil { - return err - } - i.Flush() - return nil -} diff --git a/pkg/ingester/wal.go b/pkg/ingester/wal.go index 5c3e3417047..e7e27cb48d9 100644 --- a/pkg/ingester/wal.go +++ b/pkg/ingester/wal.go @@ -118,6 +118,10 @@ func newWAL(cfg WALConfig, ingester *Ingester) (WAL, error) { ) } + if err := recoverFromWAL(ingester); err != nil { + return nil, err + } + w.wait.Add(1) go w.run() return w, nil From e691e0104299b27d0e278d9b8202afd86186b099 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Fri, 6 Sep 2019 16:59:53 +0530 Subject: [PATCH 17/56] Timer for WAL recovery time Signed-off-by: Ganesh Vernekar --- pkg/ingester/wal.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pkg/ingester/wal.go b/pkg/ingester/wal.go index e7e27cb48d9..1888dc094d2 100644 --- a/pkg/ingester/wal.go +++ b/pkg/ingester/wal.go @@ -118,9 +118,13 @@ func newWAL(cfg WALConfig, ingester *Ingester) (WAL, error) { ) } + level.Info(util.Logger).Log("msg", "recovering from WAL") + start := time.Now() if err := recoverFromWAL(ingester); err != nil { return nil, err } + elapsed := time.Since(start) + level.Info(util.Logger).Log("msg", "recovery from WAL completed", "time", elapsed.String()) w.wait.Add(1) go w.run() From dbd336ef129c37ac8f97e504702122a2834bcb8b Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Wed, 11 Sep 2019 15:30:12 +0530 Subject: [PATCH 18/56] More rigorous test for WAL Signed-off-by: Ganesh Vernekar --- pkg/ingester/wal_test.go | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/pkg/ingester/wal_test.go b/pkg/ingester/wal_test.go index a4eef4c2f4f..f750778c7b5 100644 --- a/pkg/ingester/wal_test.go +++ b/pkg/ingester/wal_test.go @@ -4,6 +4,7 @@ import ( "io/ioutil" "os" "testing" + "time" "github.com/cortexproject/cortex/pkg/util" "github.com/go-kit/kit/log" @@ -21,18 +22,37 @@ func TestWAL(t *testing.T) { cfg := 
defaultIngesterTestConfig() cfg.WALConfig.enabled = true cfg.WALConfig.dir = dirname + cfg.WALConfig.checkpointDuration = 10 * time.Millisecond + + numSeries := 10 + numSamplesPerSeriesPerPush := 1000 + numRestarts := 5 // Build an ingester, add some samples, then shut it down. _, ing := newTestStore(t, cfg, defaultClientTestConfig(), defaultLimitsTestConfig()) - userIDs, testData := pushTestSamples(t, ing, 10, 1000, 0) + userIDs, testData := pushTestSamples(t, ing, numSeries, numSamplesPerSeriesPerPush, 0) ing.Shutdown() - cfg.WALConfig.enabled = false - // Start a new ingester and recover the WAL. + for r := 0; r < numRestarts; r++ { + // Start a new ingester and recover the WAL. + _, ing = newTestStore(t, cfg, defaultClientTestConfig(), defaultLimitsTestConfig()) + require.NoError(t, recoverFromWAL(ing)) + + for i, userID := range userIDs { + testData[userID] = buildTestMatrix(numSeries, (r+1)*numSamplesPerSeriesPerPush, i) + } + // Check the samples are still there! + retrieveTestSamples(t, ing, userIDs, testData) + userIDs, testData = pushTestSamples(t, ing, numSeries, numSamplesPerSeriesPerPush, (r+1)*numSamplesPerSeriesPerPush) + ing.Shutdown() + } + _, ing = newTestStore(t, cfg, defaultClientTestConfig(), defaultLimitsTestConfig()) defer ing.Shutdown() require.NoError(t, recoverFromWAL(ing)) - // Check the samples are still there! 
+ for i, userID := range userIDs { + testData[userID] = buildTestMatrix(numSeries, (numRestarts+1)*numSamplesPerSeriesPerPush, i) + } retrieveTestSamples(t, ing, userIDs, testData) } From 0e1577a2289f85ab0d65d9de9245ef96a313bd57 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Wed, 11 Sep 2019 15:58:22 +0530 Subject: [PATCH 19/56] More profiling in debug mode Signed-off-by: Ganesh Vernekar --- pkg/ingester/wal.go | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/pkg/ingester/wal.go b/pkg/ingester/wal.go index 1888dc094d2..45ebb9e6c90 100644 --- a/pkg/ingester/wal.go +++ b/pkg/ingester/wal.go @@ -347,15 +347,22 @@ func recoverFromWAL(ingester *Ingester) (err error) { if err != nil { return err } + if idx >= 0 { + level.Debug(util.Logger).Log("msg", "recovering from checkpoint", "checkpoint", lastCheckpointDir) // Checkpoint exists. + start := time.Now() + numSeries := 0 + numChunks := 0 if err := recoverRecords(lastCheckpointDir, &Series{}, func(msg proto.Message) error { + numSeries++ walSeries := msg.(*Series) descs, err := fromWireChunks(walSeries.Chunks) if err != nil { return err } + numChunks += len(descs) state := userStates.getOrCreate(walSeries.UserId) @@ -366,7 +373,7 @@ func recoverFromWAL(ingester *Ingester) (err error) { Value: string(l.Value), }) } - series, err := state.createSeriesWithFingerprint(model.Fingerprint(walSeries.Fingerprint), la, &Record{}) + series, err := state.createSeriesWithFingerprint(model.Fingerprint(walSeries.Fingerprint), la, nil) if err != nil { return err } @@ -375,11 +382,20 @@ func recoverFromWAL(ingester *Ingester) (err error) { }); err != nil { return err } + elapsed := time.Since(start) + level.Debug(util.Logger).Log("msg", "recovered from checkpoint", "time", elapsed.String(), "num_series", numSeries, "num_chunks", numChunks) + } else { + level.Debug(util.Logger).Log("msg", "no checkpoint found") } + level.Debug(util.Logger).Log("msg", "recovering from segments", "dir", 
walDir) + numRecords := 0 + numSeries := 0 + numSamples := 0 + start := time.Now() if err := recoverRecords(walDir, &Record{}, func(msg proto.Message) error { record := msg.(*Record) - + numRecords++ state := userStates.getOrCreate(record.UserId) for _, labels := range record.Labels { @@ -395,10 +411,11 @@ func recoverFromWAL(ingester *Ingester) (err error) { Value: string(l.Value), }) } - _, err := state.createSeriesWithFingerprint(model.Fingerprint(labels.Fingerprint), la, &Record{}) + _, err := state.createSeriesWithFingerprint(model.Fingerprint(labels.Fingerprint), la, nil) if err != nil { return err } + numSeries++ } for _, sample := range record.Samples { @@ -419,11 +436,14 @@ func recoverFromWAL(ingester *Ingester) (err error) { } } } + numSamples += len(record.Samples) return nil }); err != nil { return err } + elapsed := time.Since(start) + level.Debug(util.Logger).Log("msg", "recovered from segments", "time", elapsed.String(), "num_new_series", numSeries, "num_records", numRecords, "num_samples", numSamples) ingester.userStatesMtx.Lock() ingester.userStates = userStates From eeaae15f399ccfec19d9da308ab6caa37489ffa4 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Wed, 11 Sep 2019 16:50:28 +0530 Subject: [PATCH 20/56] Fix race in test Signed-off-by: Ganesh Vernekar --- pkg/ingester/wal_test.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/pkg/ingester/wal_test.go b/pkg/ingester/wal_test.go index f750778c7b5..b8593f06155 100644 --- a/pkg/ingester/wal_test.go +++ b/pkg/ingester/wal_test.go @@ -36,7 +36,6 @@ func TestWAL(t *testing.T) { for r := 0; r < numRestarts; r++ { // Start a new ingester and recover the WAL. 
_, ing = newTestStore(t, cfg, defaultClientTestConfig(), defaultLimitsTestConfig()) - require.NoError(t, recoverFromWAL(ing)) for i, userID := range userIDs { testData[userID] = buildTestMatrix(numSeries, (r+1)*numSamplesPerSeriesPerPush, i) @@ -49,7 +48,6 @@ func TestWAL(t *testing.T) { _, ing = newTestStore(t, cfg, defaultClientTestConfig(), defaultLimitsTestConfig()) defer ing.Shutdown() - require.NoError(t, recoverFromWAL(ing)) for i, userID := range userIDs { testData[userID] = buildTestMatrix(numSeries, (numRestarts+1)*numSamplesPerSeriesPerPush, i) From 214b32e29e410a8572c15f08b9e0851f26683622 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Thu, 12 Sep 2019 15:57:54 +0530 Subject: [PATCH 21/56] No limits on number of series during recovery Signed-off-by: Ganesh Vernekar --- pkg/ingester/user_state.go | 8 ++++---- pkg/ingester/wal.go | 8 +++++--- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/pkg/ingester/user_state.go b/pkg/ingester/user_state.go index 1303fd47cbd..a8c15ddcb85 100644 --- a/pkg/ingester/user_state.go +++ b/pkg/ingester/user_state.go @@ -190,7 +190,7 @@ func (u *userState) getSeries(metric labelPairs, record *Record) (model.Fingerpr return fp, series, nil } - series, err := u.createSeriesWithFingerprint(fp, metric, record) + series, err := u.createSeriesWithFingerprint(fp, metric, record, false) if err != nil { u.fpLocker.Unlock(fp) return 0, nil, err @@ -199,13 +199,13 @@ func (u *userState) getSeries(metric labelPairs, record *Record) (model.Fingerpr return fp, series, nil } -func (u *userState) createSeriesWithFingerprint(fp model.Fingerprint, metric labelPairs, record *Record) (*memorySeries, error) { +func (u *userState) createSeriesWithFingerprint(fp model.Fingerprint, metric labelPairs, record *Record, recovery bool) (*memorySeries, error) { // There's theoretically a relatively harmless race here if multiple // goroutines get the length of the series map at the same time, then // all proceed to add a new series. 
This is likely not worth addressing, // as this should happen rarely (all samples from one push are added // serially), and the overshoot in allowed series would be minimal. - if u.fpToSeries.length() >= u.limits.MaxSeriesPerUser(u.userID) { + if !recovery && u.fpToSeries.length() >= u.limits.MaxSeriesPerUser(u.userID) { u.discardedSamples.WithLabelValues(perUserSeriesLimit).Inc() return nil, httpgrpc.Errorf(http.StatusTooManyRequests, "per-user series limit (%d) exceeded", u.limits.MaxSeriesPerUser(u.userID)) } @@ -215,7 +215,7 @@ func (u *userState) createSeriesWithFingerprint(fp model.Fingerprint, metric lab return nil, err } - if !u.canAddSeriesFor(string(metricName)) { + if !recovery && !u.canAddSeriesFor(string(metricName)) { u.discardedSamples.WithLabelValues(perMetricSeriesLimit).Inc() return nil, httpgrpc.Errorf(http.StatusTooManyRequests, "per-metric series limit (%d) exceeded for %s: %s", u.limits.MaxSeriesPerMetric(u.userID), metricName, metric) } diff --git a/pkg/ingester/wal.go b/pkg/ingester/wal.go index 45ebb9e6c90..9ffba806863 100644 --- a/pkg/ingester/wal.go +++ b/pkg/ingester/wal.go @@ -373,7 +373,7 @@ func recoverFromWAL(ingester *Ingester) (err error) { Value: string(l.Value), }) } - series, err := state.createSeriesWithFingerprint(model.Fingerprint(walSeries.Fingerprint), la, nil) + series, err := state.createSeriesWithFingerprint(model.Fingerprint(walSeries.Fingerprint), la, nil, true) if err != nil { return err } @@ -397,7 +397,9 @@ func recoverFromWAL(ingester *Ingester) (err error) { record := msg.(*Record) numRecords++ state := userStates.getOrCreate(record.UserId) - + if numRecords%10000 == 0 { + level.Debug(util.Logger).Log("msg", "records milestone", "num_records", numRecords) + } for _, labels := range record.Labels { _, ok := state.fpToSeries.get(model.Fingerprint(labels.Fingerprint)) if ok { @@ -411,7 +413,7 @@ func recoverFromWAL(ingester *Ingester) (err error) { Value: string(l.Value), }) } - _, err := 
state.createSeriesWithFingerprint(model.Fingerprint(labels.Fingerprint), la, nil) + _, err := state.createSeriesWithFingerprint(model.Fingerprint(labels.Fingerprint), la, nil, true) if err != nil { return err } From 43dbc9ead8cfbebf36ecd20aab6f6de48e74e4f9 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Fri, 13 Sep 2019 14:49:02 +0530 Subject: [PATCH 22/56] No rate limiting of checkpoint Signed-off-by: Ganesh Vernekar --- pkg/ingester/wal.go | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/pkg/ingester/wal.go b/pkg/ingester/wal.go index 9ffba806863..a91403c03ef 100644 --- a/pkg/ingester/wal.go +++ b/pkg/ingester/wal.go @@ -149,15 +149,21 @@ func (w *walWrapper) Log(record *Record) error { func (w *walWrapper) run() { defer w.wait.Done() - for !w.isStopped() { - if err := w.checkpoint(); err != nil { - level.Error(util.Logger).Log("msg", "Error checkpointing series", "err", err) - continue - } + ticker := time.NewTicker(w.cfg.checkpointDuration) + defer ticker.Stop() - if err := w.truncateSamples(); err != nil { - level.Error(util.Logger).Log("msg", "Error truncating wal", "err", err) - continue + for !w.isStopped() { + select { + case <-ticker.C: + if err := w.checkpoint(); err != nil { + level.Error(util.Logger).Log("msg", "Error checkpointing series", "err", err) + continue + } + if err := w.truncateSamples(); err != nil { + level.Error(util.Logger).Log("msg", "Error truncating wal", "err", err) + } + case <-w.quit: + return } } } @@ -210,9 +216,6 @@ func (w *walWrapper) checkpoint() (err error) { if numSeries == 0 { return nil } - perSeriesDuration := w.cfg.checkpointDuration / time.Duration(numSeries) - ticker := time.NewTicker(perSeriesDuration) - defer ticker.Stop() for userID, state := range w.ingester.userStates.cp() { for pair := range state.fpToSeries.iter() { @@ -222,11 +225,6 @@ func (w *walWrapper) checkpoint() (err error) { if err != nil { return err } - - select { - case <-ticker.C: - case 
<-w.quit: // When we're trying to shutdown, finish the checkpoint as fast as possible. - } } } From 7a9fec0df70a0acf0be6bbf88ac48e5272e3737d Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Sat, 14 Sep 2019 10:46:01 +0530 Subject: [PATCH 23/56] Change segment deletion logic Signed-off-by: Ganesh Vernekar --- pkg/ingester/cortex-wal/00000005 | Bin 0 -> 229376 bytes pkg/ingester/cortex-wal/00000006 | 0 .../cortex-wal/checkpoint.001065/00000000 | Bin 0 -> 32768 bytes .../cortex-wal/checkpoint.001066/00000000 | Bin 0 -> 32768 bytes pkg/ingester/ingester.go | 2 +- pkg/ingester/transfer.go | 12 +- pkg/ingester/wal.go | 128 ++++++++++-------- 7 files changed, 78 insertions(+), 64 deletions(-) create mode 100644 pkg/ingester/cortex-wal/00000005 create mode 100644 pkg/ingester/cortex-wal/00000006 create mode 100644 pkg/ingester/cortex-wal/checkpoint.001065/00000000 create mode 100644 pkg/ingester/cortex-wal/checkpoint.001066/00000000 diff --git a/pkg/ingester/cortex-wal/00000005 b/pkg/ingester/cortex-wal/00000005 new file mode 100644 index 0000000000000000000000000000000000000000..f9735cb19995de241d7f4cd6810e8123228c6ada GIT binary patch literal 229376 zcmeI52YeLO_Q!|k@_*7IFX(7^RxA_=YV3&E6-5-WqoUXwcE$2kRK!9Hgn%GQ5s@NN zLQ{$g7Hl*Td%=Pg1Vt$#{(s+1$erEnF0-52C6MdqWBAU_%$+-D&iwYy-E+=Vzkltg z``>bP!I14*SFdtF-5s}GKl1C@(>D|>omZvyqxEx~Dpj_9wCLzVavbNeS~<>R%>|DO z9tXM#x&qw<-GJ_b?m!Pg51^-@Cy+152YLy50lfvifdWAR&_~b*C=?U|eFc4ieu93$ z6M`pz{(}C%0Kov@Nx_rAK*2!ZDZx{~(}JggL4rZRV8LKuh+qgXR4^2HM(_+UOfU?1 zR`4t^TreDXPVgKsLNEdtDHsWi5{v>y3q}La3!VqY2*v;}2wnij3dREC1ml45g7Lrv z!33a4Py|dAOavwgCIOQLlYuFMDZo_0RNzIyi@-~Qmw;)4X~1;BbYO;H1~5}F6L?wh zGVqGv72s9DtH3P5EMT@^Ht?F@HQ;r@>%bd=H-I+9>NFTr2Hdck_&Z^7Td2Ehj4AHhF@ zP^Sj&)EXgYV$-QLoCow_71ll|_!sbyU>l&bAOt)ts0wru{2%a$;NQTbf^7jLs2YF- zRR@ru8UPYh6F`D$0Z335fCOy^AVIYOB3AVG%!NYJ4G5_A}V1RV|_K}`W9=m-D_Y6c)dM*>Jta{viC3P6I629Tg*03_&G z00}w{K!T13kf0L)BU4nTsg z2auo+01|WqfCSwLAVD_)NYKpy5_AiI1lI=D^51>E;02Js+00kNdpg>OnDA3aY3N#2nfd&I8&=3Fx8VaC5 
z&j2XUFaQO57C?c911QjQ017k$K!HXAD9|VX1sV;YK+gjx&=>#(dI3Oz#sVnNH~F302HVQK!GL#D9|JT1)2iF#8-N1+4xm7*02JsC00sIJK!H{R zD9{=J1zHQBK7C?Zi0SHia00F82AV4(%1gI8(0A&FP&~^X60XlDQc+66#>b_Eci-2en= zcK`v}13-Z40SM5Zg3z!=*L1dHAQb2r?1BOv3!p&90VvS%019*hfC8Nepg<=9C{POk z1v(i(fldKXpq2m%bSi)XwE|F}(*P9cbN~fv4WK}004Pu!00lY|K!MHzP@uB`6zCiP z1v(c%fzAU^pz{F~s4aj3T>zj!7Xm2IMF0xa4nTn}22h|&02HV_fC60#pg@-aDA45q z3Umd40$mB9Kvw}M(A5A6bPa$4T??Q<*8wQd^#BUg0YHIn08pSC0Tk#a00p`kK!I)n zP@r1@6zDbp1-czTf$jiMppF0vbSHoU-36dPcLONUJpc;yKL7=~7eImT15lv*1))y+ zy6d;KV<05xDGY-IJq;j1g8(FGFn|ON0g#}f021^JFibEEcvkQ%FkCPkcuw#fFhVc_ z7%3PDj1r6jMhiv*&kLRh#t6m$F9==$#tOy);{@Y?@q+Qd1i=KLNKgb!6ifsr2_^xP z1(Sg(f+@gM!BpTy!Hd94f|r14f@#2X!E|7TU3P6FD04UJc01EUCfC7CBpg`XND9};>1^OO9fqno`pdSGg=qCUL`WZlhegROR zWdI7a96*771yG5m0tiqe00G(;K!6$p2v9D705t&+p#1;@Xnz0!Isia` z@&E+rKmY+c2ta@i1`wb_00ihz00BA-K!6Sh5TK?20(1m`05t;;pd$eUs5yWD9R(mj zM*|4ZF#rN|EPwzV2OvPl0|?Lw00ML(fB>BYAV4hu1n6V{0XhXhfLa0w(5V0d)Cxd= zP6H62(*XpiHGlw}0U$ta00ihv00BA+K!DB$5TJ7a1n67<0Xh#rfX)XHptb-4bOC?> zT?imR7Xb)RI{*Q?7(jq70T7_}00ML=fB;Pp2LJ)O0YHFm1Q4K`00ih}00FuMK!9!q5TM%t1n71E0lEV~fI0#Q z(47DRbQgdC-3=f>_W%gc{{RH&UH}2Q4?uwK7lbz)N`2Y&E06>8j0w~b?01C7SK!H90P@u&C3iKg>0(}Ia zKpz7r&?f*2^eKP>eFmUFp93h+7XS+MC4d5b1)xAn02JtJ00sI6K!LsmP@wMs6lf`c z0(}plKtBK|(2oEL^b>#r{S2T$zW^xEG5`fy4xm840w~Z500mkJpg_L?DA4Z!3bYD9 zf&Ktcpg#c=Xf=QWtpQM=wEzmV4nTqa0#Km!01EUsfC6m*P@sPVp-xT0L!c>k2($}! 
zL4bAz5TM-v1ZZ~v0onsVfa(DV(4GJSv=@K??F}G6`v3?~eEn@*AbP)M8bE-~01%)y00ML-fB>BZAV6mW z2+%nI0(35b0G$UQK<5JpP+I^2x&T0cE(8#uivR?u9e@B`3?M+400>Zf00FubK!7d- z5TMHe1n3F?0lE@EfUW`%psN7{=o$b4x)wlyt^*LD>j4C)1AqYC03bj&R?XSzv%ZR< znb|vQMEgT}w%;Wu@t;tBeRog|H{OPtg}v9T)C_E%SxVe+c&%=&C1wZRR>|$2_z2GU z+>V~ci)!ULMcI3{-P9RxqS04!PcV%}Z7b*YTH{S$ry~a*Yg&}^Zo&p~)V`kWKibsx z;XcY4y$4w$TP$gQUx;;iazg#zJkJR)g*W&6$620RN8fIBsuVZ0!dG8>H12H3ZRf`G zIUoC{xJjt6f%|t&bKr{Z&z_!6O4;dc-aA^}?6l6lvvq=- zE+Bt4r+`13rC6Pe`x*Y6;;!-A+>o0ebenMx7HdVQUqg3-$E#y)6aGv?AFoIZ_L?yR z+T46}HM)_fA&dS`O^*`!a@_NdwZfU}RiI>5IaZ5NpGlFuH;NS5Efu^HMK00js2vk8 zMk;dRZ4<9`)_X0Gc-xA!))`#4#d*%|*>hT!hz_o+(RwQ1Z?Vp+?6AT$7cO-fj&-Q= zeCEt=^^ECQtd=5$H`W$$KXaCNb+R0S8}|rX{fe-gy2f+TJ@)k+s#sqy$il{Xo7K>f zPO8T=j|iHM$x$yi-8Y~6puilpL|1!N<)GyhJ(r=H`MI+>3> zlG?->nma2ktvf}zBTd5$2+Wd?9%_!l^WExWhZHw)7Ur&U+GLL^JG@Pk%W8HYNlPrx zj3rVuVAO7v*j+WJ->o|4$kTLMa@RP@h=~{FvN3Pj%sH&4v0Az@BuP_O9+lU$Ri0)R z#uZOdBt9pHK>=Erl3y0U+)O0H|tzVsFPRg8Yg(GKt zhr>-r^`k@i1xvF~@#j_3VgP4xt9?8tk?K6Er-`8Eop2k|Y+6pfay06UuUoyXv1Fv9 zCarYFW(lWE&1OC*a+)de4*H6oenehPEvN_-_t};4x6~&Ti@Ey;D&B~U1P`q z&rq{V-Y1_3X{`_*>-StE<<-Yto0E%MqBY25qy**`-*m zOHw_*rN-JK?q`Vxj&XOgD`Ni;=nSn}5dYaJd8=(9bM%Q%%vtOmWhdS?YHeW7Sv>8T zT{`NBI!@We+6#v(x@51Jy(!VMdS6PkG*~_9Xq9dWuc)SwHHD!B61LX3+LDknx#^PP z{OKx$xICz~{{=iG*aqk<2mucZssddE{|7uG_&4yVU|ZlZK{eoUL3N<3pa#%QP!s4b zs0H*8WC1+|+X4B4+CVSCe}LYCY@k3;2k0Z%9w-#-0Q42OKtI8bz!QRHAaeEz&b|u}Ir|M9wY* zh@5>NAaZsQK;-NP0Fkqc0U~EV1c;pd2q1FyV}QunPXHokKLv=K{R|*-_H%&9*)ISh zXTJo9oc#(Qa&`$oB*CxFP= zp8+Cge*uV`T?P<2yBr{L_E!J}S^=OyD*+VfHvk3t9YBFr0VvQP01EUcfC8-s0viL_ z(0y6c4uNC@1jr77WQ0mI1ge^|{p3Lu#LV7i?@-tJZoL|zu0>h(YV4v~PqIWeq=x5} z?f}m#Jpi6pdICJJKn014U)K!WxLkf414B&a@s1T_GVpoRbv)CfR=_63lj#sCtO3m`#F z03>KX014V3K!Oeckf1yO2|5r!f(`8>*S__~+>i`t!FMvtg^}yePzkv;c z4ZuHwe+0!vFC8a)M2kip_Oi9Vu}wO_10_F-%1Lg@%2j&Y}0}BLa01E|e zfcFJw0*eG^0Urp?26z-Z2e9KH86l(NAY1qnzq8{Y8NrT&N)+frT*8inWCWhX{=gbM zi7^gRpiQIQb}AcgqnY0I-VnC*rW1zp8@si$T>sHPv7^0WoksYx%9Ky2$ZtTTT5%0@OS3gD@2*gh}id@WPLDDH86tjMe<^UJ>*Pl*HZWFM!T 
z{3aDN+c1$Ao0HTdl*e!7#eOny%vHE73DOM`D;%*)j2bPq$ zr51M!(0ff&W&PM=@?`5?c$!UA2+J0AGloi$(pyTvN2dZQ>gMA(em5zbx2UnIr=Xi# z`u*m&#qf6N%CwCfr^f1%zTJ~c4ck=yqQ;!e=%}=8ta@Z(PzqGTzS4*>XZy1 zJWHtyZ_p;RIgFbuQ5>|%Q*5u2<}hv<4qB4O_~s=^&Pc}cg-9Ju?~JR83N1)u0k>GE zSMJZ8q7K)&Mp={9Qg89L8uzcAcExx93E-l|J*$!$ZfCYBbC5u1sXPYR>T9^A;;{-D zU+3;RyUaz1d)?{nQ! z&5G({t5$pcSL)qYgW@AoC3W)qiN6fIXUJ} z_^Y~ZumjIM*X%A4kt_PM0IsnTBHvqp1ZUj~cZUR;cZU#1UCZx>(;3C*j zkc?nQK{A3J1<45HAZ2V6)HwX!=gIcH&$bwZ+^#8dY==NH0x3x6VGUBT&j&scv;{sD zTmVqL?n2;mfgJ+L2zO%*QnBv=NX7mi@VDSzV1wX3fK=@J1-5>st%&uaeBni$!VZCC z1Um$h5i&Xivh_1%ah}2K5J*O_Lm(O9T;#aY4}q%YgqDu0x9sFlkNOnN>rs@|fwG!C zWuTrIC|{5d^b+&}dJB331%d*gkDw1wC@2K_3i<;51pR;~1Wy3{1^s~mf&svjf+vB2 zf`NdYERhlHNI^!hBLx{Dqa%d|s;*mHHcJ0KpgXb(cjSYDe*q5(wgEZ|LcqgL}&*95pn@UXh#4M+6h2}asWiAE`SLA7eIt|2JC1-MnHt@XhBA>qXik^Htq|4 zx4s>qX3HJGYC%U}jo?mTt>7+To#1Z3=Y)x?`Y%`2j0#Ik-(sk+q1)Y4=*4Hww|+AgI z)vW+>4)vCdQ;Xl3t)E%*lyjJp6%@~yj+z*;&NzYOq zGDgmNZz4#>4r>lnUYB)CFO-Uxd$L;15mKa0Ijl7If-KrjPIFW2P2AlgGZLw#<(68+ z?3AYLE^`1MRGgLAyjStpmX96cZj#}g#Hne_Nvupo;9_0Rl7z%jt;^)XrO9P|7928t zRdm%7H^wbxw(v`lZtHiiG}lYcsYrOFk@7QunP}b+*k*Z(^cc zcUG*n)Y%2IXcH98{lEj6XUSh0 zcvVVYpioSRo~Y2qUu{gR^jOIrj&WhZV`9?isnVm_*T!F?xA%6tGEck4B+3JC*4A8( z$dN!foYWj47>b;Diq)nhHMydKrjZIZOKLi{B~m4=)t2~DIg2|S?v{SvC*C${ZD7t> zJR_Q&S}&qHm5D`7ENv72xjYj+Nd6d3aP`>|h<5V*irT{#X50Adc9daKTnyJIa=Ehz z7uu|~){;5{xoewnx5IlS{dMmxCs^4Fvo%|utB@1@w|p{1S=RO1KRiD~y_R3Krb?Ba ze=S+DLryqhdIcdWBuuXaNSOW&;MK9e0~w9LYtnFJ$eGe~WEC5JhwW3U*25y~&h8bl1gjo8=Ke?b!d0{6mL!B}9NU>q=B zFdmp7m;e+BihzlNiNGYmBw(^&GB8Cj1(+(B3fSB}8DX)ku^4OEhDJx5E|C#n$FH%d>1~euYtXR z;evgD=LGeE5rPK5NI^qjl%NqXTCgwhyr3~KMvx2e7}f;fF>F78$FThY9>Wd*cnr$} zcnmub;4$nVfXA?d0UpB+0eB2M6yP!JFo4If!vUMKC?lXiHfK>rNPo^^$r#Aix#Z@v zW1tcRdI@Xr6gCauDQr5xQ`iiEr?8m-m3ChSsI>bEz_jhFz%0QmV76d3@S5N?;B~?4 zz#D=$fHwti0&@g&fVTv10dEW52Hp|81H3DE7kE$b9xzuh7nmoQ2h11D2Nnnx01E{R zf%gUP1B(QUfDZ&80CplsM%X+*95v=C%*h_nqS1!aRA8rqWQ6jSZm_j3Z3Qd)dzfycN-+#lkW!;_cT6h4DQe?PrWJ$yYGcIpPEBdn82{e6q|T2n%5J;<(|$j_LUPB%M^)1 
zZ+&KVC!1qvxRL;jj$|87oWa|_2I|d-W&=Z^!uoEv>Tanw`79}98gK>&m33~~ z43kqP1;#1L-m`7_Vq25QG}IHSuY|z45eb3wH?>n_u0c7n4x8T>NI@abr4KCUP;Z58 zXWEUruIXD06*kZ#r8#g#_GM2mlPw)!445zhG3mTn&8c|mN1%{S>+Czr*1PZ=KO%1? zWncE#qGF{YruXf}#*T3*)UTmC!Q;_!H%wIGE$&XquZX3|%|}$W)0 zTiq?lw#$qtTyxyiRXD&NhMThaios^s1b%&RQIF?jyqa5gTg@#A4=K)FwAI{_Fj7n9=azo|CberY8>t8~zT~u>;%A6gw3=JSQnqrNTdElusoZ8vA0|z$Hdxa{Kf~(3JEt67 z{gz+v9eTk%1w*!LUA;=4l6TjSEY010tUkHB+dfuw^dZ`wS4lk1@3*{4qAS3wB)S2- zN}@Z^L(l{0Dd-9CDv5l+X2;10Hakv6ut8BW01Bj3bYHi_8ea-7 z2EGzp0xS`<2fh|u3Vb8D4ER=XIq;p}3Sg<=O5l6JRR9unHGl+N10X@y0!Ywx01|XP zfCO~_kf0j?B0-$ z9Vc@jK&cObY>(3$q04~BqN|e_uvvJ(Ek7w=w1K?x(`5s?iYkQ+3Yy%VaGr+ zf*k|N2zCr4BV@Ef0NHUN)$nq1Bue(B#xlaLNYWs|Zopu{?!XYi9>7pRJ>VI^p1?4H zEf63h9D_AT&pj3(J@+_(^xWeC(sNG$NY6bHAU*dafb`rJ0O`3W1ElAk0x%QZ5+FVI zRDkr{R=`}rX#nZDrvs$twgyPgJp&*;w+%pg?wJ7Txn}{S=bjCao_h{p3k1jr8Jz~o zB|Wzjwtqku;Q`=5!Gpj1Ta!C5*Q^I1&kJq2A&r@4~!9v0qk#KG6D+p4%R?{-UU#g_W%@VE`S2f15lv( z019La1jq;#{I{?s;UUlzI|NF78psZTxZ&+Akc_Z}%mP)-+3B;sWGwcMld)(5a*7K` zn|L6zCQOk*T;)s#OZz*!ZE|By@l{)*yt5@6lV}L>Qc6vp!@A_&UtSXTU8hQTSN+WD zCz~_eg~Wdpm$m^Xc1@UG>Zvt0!O{34{%nkDJ}(cYH^$aS&jLws%z!rA zB*%^EBPEKQYLcU#IZ71Shj+x?Boj%FrTP`O7s;5@3&Y-SAK3sx3uH- zjf{G&Na`8ou?$=}J`204$B#0e;dp_~aNMZ3K0K~uq}!`1+a@8+l*Nob2E>^2K-MIm5*Jmb|BNRN49+`ccJG z<0x?}QELN895XF$aaoR))m>Hiw_A0tTb_#krc)6X#=d1U=dcRuZt2F5Bn5JNZW%dl zp2o4`it8^Khb|AL_eo4zyPv0IK~zU0C(5E$Zq`Mc?dX#y`EYt~Z>2-*^S%51d zbGs{2VXffiQ3EvBme`atzHarlZmCyAI%?9z2#f_1PMMm`d{AU9t;9R%D|(J=PUa(D z!vy7IjrHuLiGR;h>d)=zm?%wr_7!PuXmCR}zpgQ4fPE`l{mmSvLU^p-bB&Z&AA4=8 z>YK5HkH*XX-{%qeMj!qLS>xdVn2^YRVRQOJag6%66GubcWU~i2v-A zd|$SZIr_vW<}CJ(vJ-C`wKmZlQq-o27mKGovrF&w#9MD%TYKSfMVIU~v-i?@Rcf!< zml7=v)=WBDrCY))sws0(k#Xh8UTj=#jso|a&M6L@wW0emrw!eg5sn}fjW=sG17--0 z1ZE1F11}4X0(i5Q&0drdF2WjHNZcA-ttb^7Z0J5$!)7na2sV3BMzGn7GD1eP7j5W% zi2&KqeSMENMUL%z{bYnn_FliLIonSjL_y5#ZT1dztM49BBh;-Z>xdeP%jzz_=#F3X z5cB|g3VH(hf_$Ktpcl|v&>JWa6aak$eSktiA<$RQ7w9MG2RtEo0_ZR34-60t0G<>) z2@Dhr1ng)*MzEs=8NrSgWQ2^478UGnP{FGm 
zQ1sGqvPZONbk<(sk}LL?BVDwle>t*+FYQl9@*Dfpk&JLR(q{`_+Q}gK!bE&wl3)@r zSuh!xBA5b96-)(Q6ub!7$sifQP6o*cJ|=?{UeoBXoi+y9Yl8J=%e)CZ?ukU5S9p<}SS1;5EY%ZOl3`>9Ls>tL)yY7U4OP16scksLyp;2` z(orxY-Ac@_|FLXae6b6?Ukc%mD#)J-nr)cKi;d+}JlC6|y4b7PW-A|*C_Xo3FLp~u z0j3A!AO2qJUguWjAMVs?8b1>c%o^RrY!D_=54WoxYya@-NL@_2C|1(Dw})0A@)1FP}ov2>-c@fM8#MzvDbojoQ`J)nXkvc)gVzWSo?>)B(gOIb{BMFC^SxD-fjx45k*zoKmVqK2%a zQyj9I!zc7(vzZU__eF0Siu7Rfd0g658l#4*sT8#cE{!qrfh+ELRE`x&Wn6i_TRLzG zEJHkK>)BLBo5~m-3ff%T@{}7aNoRcFl9I{O^4zA8I-1_cebp9Pkj6T0ZJ0(fHW_6OnLMTUNlW_r?YVBLSVeU&a{eo76+E_2d6-9&uH4SeNImJPxw2!Zt0>~ zQQ?U4ZKk6<^O&U+J4^YtayOFHu@O9TM5aj1Qrw29hGjN$R1M3sq-t1ZDQ*+7RxqqI zyHdW*EXCU1epYhNBsN62e4DXkq@z*{%dAV>P}EvxlTzF@x}~|f?fkYRp_^m2tw;_V zbyDSAJgh5k7Am0hOQ(hkY`M^JoK6!d{?f^;c|dEbRIzV7d|H8EPZJ2nJ68t*ymNIh zz&lrm0K9W`D8M^cp8M-D0!Lz_{!EnI7@lZzKovW{54c@u>I>0+u-vD^$>YD)X zT%7~(&Q+TOAR{cr8s7`P2e#rIfUES<=Sj^w)zqol_$8kd!Sf&+hu``<*}nDJ zmV%J8+ENfQf-MChBizO}Stf=+wt6O(w)L=N1e*dWBiIy38No*iWcVR$4fjrYD~Q|b znKB{*WQRa@2vjUUb_f(P1ge@7S~{-YvXetS>$_cQgnAZbb)l%Ht(+<07$ZxKL55!t zya0?9j0MIC#sT96mcsxlwj2)dxMC+vWCS}} zkP$LAVbYMwb)DpZih*p@zh+bxplC*= z8m43gMKh+OB1WupPV;9@Vcmn{$8{$@xHDL0*gbnr%Z-JW5cQe!y0_j?)XKz*&ivduZfWOYZJV^~nnn@@aht6&M$UR~8c48jda?t9lB#drs$gw4Q>97 z%k!j^mfk0FS!ShDv~zO5ub#`0MXU@>N1?Rb?BlB{1JkwM8i?ns=BrYSj&I}cWCR;`CnFqzCr%eM17--01ZE1F11}4X0$vf= zcQwigmFQiKirZh*IcvXK&Qn<}YK3~$cc<0}^(xAmO7MPfnW8t2Q6MM)`Uv^}g@Qt$ zub?l`PtXr|LhuC8U(g>IAQ%8VDR>eXC>RLXq&^wJCiTe(HmOfW$mn>Xf%{?gkTbFA zht=(X0WG4uoE7Ofo_6sKzBiR zpogFb&{NP8u=$EIg3VWy5#E+dyp2n|BX|dRSMVe!jcba62JEdOPAT+GeHJu%stSDzchFgD@;8XHVGXSP- zX97&yz6>yJ`wH-?;8g$xngyUhvjJP=TSizSYb?PUUkknlz7c!_d@J}C_)hR0uvAb< zihNVxvbbi$ht2F%5EtK01<44u&bN&4kz~M51<45MuJdh+Tp~R7_b?eD`QO88*4_4* zEB6Th4rxb2vLJKPpW{zo2)+Qm6nqJMCHM-c7^5MD@HDzJ+|uZ(sE(E+BaGS^aodeLOp1$6QSPVa5EUjcEHcGLQx`F_el%U-DLC(hvAUjy}CM6*GdLWT9+Zq?mVS5s_7a;L2KVl6Sd zN3~v_#j+_dPEq!rZGD}4+~X#ZX{hNkKbFw6m^TwNj$fG$Mdli$oW7-p%liT;DCC+; zA6U+z-U-{zv}?{i%@itZpr=c7;EL?ao?fOY0Ws;kS$VSOPDUY}*4cNqPH@8o*U+8d@#wg_Br5S1ccF&9ZLmns2?-bASQ?41SKdOFp 
zv*|#EnW2i=)h(oSn=xmsvq=84^gekhJ^ZTp>EY#=UQPZ)liI_UZF5U?L-7=z>A7;} z-In=Eh6|L&S{boDEKI(^B+fdxahYhAs{fy&ceKe9A`_woAV?{52e z(b0!!dm9`sBk&%H5&R^?dn85zyhmabFj_Dg;5`!012#BZMzF!*GQ#(AiSKcV9|S)D zKMH;XeiHly{4Dqx_(kvwuuQNFST3mG!Ql;fm6bB&y4Y9Qq#hn_UuA>06PszH!)1ha zSc7PUivgk$E&+%}Xb%vZc_~0_=4Ak}nU@2^W?lghn|URG1YHFnL01Dv&@})ObS;1c zT?ZgR*8@mU2LK7W0r*vLBd|hn6F^aqn*oY?+yYS4<5qwq-Hb(t+YC8OZo{Hvgyh4b zFrAHuk`cC?c&M-ft>D5{l(IrLLrxb10ZM%cWHaP!Ss^*lMcmdzWZE&1jBpRuSTFb= z@VDSzV1wX3;2**Lf>0-$A%{Ke7)VC2V;~v9mP(QlGFmE$47rdq*;WZieGFu$fe>*! z4J0GjA&`vlmxed~hd?$v&JKZcoEI^I9RkS+b_gURWONABBs>I~VuwI>2&8YWrYMmu z5FjH|jsgKyb9VZyFX@WC%cd)us2tv!IH-n+2Qq8I6dA--&P=ehzp&e8Hs%yxwI#|S z3M#Wv5g&D`&}?3CZ&TcinT>yY^1v#(Kf5MOFZJ{q!W9Z zWH)9&n{9^U#;hBHZp)G3XcXCpcf{RftI2T0b)9<6^Bhh!FYfvJU7;+;j;;~jpxznk z_watVvssQC?m7E*%md%{QDzLj{ ztZS@bZclQs0=e@vjvZHAd?|eS%Kaz3PhxUjH~w3dBPYtreJTMqV6z?F(yy@eWIIwK z?zdW(JGZDrTxj9&dS$A^g|8+vtzVsF`V!zJ4!S94y!Tz%k#D3=@mL8f#--S1;@u)& zO`CFsI5&>-BtCf5+Q_;@!Of#a9j3A^I^*kBZ|j;16x6eN!YR|Fj2u}=EAg)Sik{;+ znUB1Q*>U$&UY56Rq9*=5O+@5&$893kib#izwZ+!_S!i%WSKs!fjuqg!rm2Qja31UT zTqEVx$6lK%`Ie%&E4QcmuzT{GNLI&OXf8+O_kYrLUXnk)-#zRUXc7}#evO)z?VO=? 
z3*tXJC4a~*WR5=Zi8+hCqwK`nMy*X$Eh}o%#EZq#p4q8<)74%$T+tn+4*d;~JG22HcjzBMs8i!=V->jH zbgW}vY=dE*mSJ`U1_^cp1`Bovh6wfmh6?Hd&j|Jeh6(KZ{bU4N!9hl_*^4rQ4c&L1 zkQtsp9{LOV0|NvDfF}h{0s{pD0h_%jBiPV=8Nr6`%Lq1fA5rKfb@&g^TaXPD2}7vB!u%=ya%2l%+QOH*e=ov;Y}F?jfgi8^|8!&vU*b4Eembgg|Jr};b62zQZ`ecF zhu^S=0{n*k48U*L!+>W6&jP~*!-3}n&jI#DC^Ew9vc~IJ;|;+Zz?*_MfjNRXz*~a1 z0Q({o8KJ_z2t{E#jn3G~1n($x#+yB}!5immwhD$q`HkJ$S#BypLB6uWDsrAy&}Xdx z@O;xIZv*!%EmRIvo9s%FUKm+zKw)1VI~)?Qr-kyVy=d$l)qWtUTJqr&;vOODfJrTq z8mp)8yUg3blR6BkvC1{vSh44opzIu($WYdgV|6mnoI};icxoFgnV0fQ>e5j+Bi)`% z4Zm2n#S;4AuGYPASBAIPTz@KP(qSSmHm1_Nj`zEEamx2lqA=Z*y*Ta#Ck=5`t-j7} zN<-YK(=>i99+)+{i#abc5VxxycmGIlO1d_dIZlA&jBoY0=`fiHw#Hi;YP@+&ydq_9 zkGkwzYK^z3(+Fm!3N2MOu=&(@N4A3or3C51F{>NF7b@bG~W|El6Wwti&gp z;##Mu!*y=yH)qmm;Tf*sc4nJ0E)nSTl*ba^dB0ojMUA>%H?#|^ww~TWs{`KZJ0(fI}2rwn-rz@NlQ-q&be->SVi?La^fp$ z!s5REP-5t^1>6+E_2d6-9&uH4S zeU4IU^c`In3NKdRKa_7X9p#zFET!0OJ)oaZ?naV2HiBo4NclFi6t^L&VVTVwRm1Wu zsT!79irYl26$~rQu9R;xOR=`MpZU5;!sXkHB_kb`Z)meFDT^|jl;W;Y$6+?fZ%Y!o zIcD36$_S%njnP=+dBO9* z7{M68<^aeD@5mbOV2yVL?*i`$-UH?e<^uBs^MLt+`M?6f0>I_~$Oym68oy$V6@nGO zO2JCtH^Fbf?}E*d18|yN`aG%WRW)q}09V0g0APh_^2H;7>4Ij!48f7WOhI$tWx-Lv zD*~GVAS2ie02$$KT*77m*m4l^1)B*eBiK=pjIbP4sJNq`#^LuqPqy!Uw#6XithN}0 zj9`mF$OsuL20{9T(+QD$K!<$*cu-&`f@Fk2Sc3|_g8?e|+U!ReVJ6n#A?#&)SaP!$?Nfc^zA6TJ<HvKN+XIy3v-LA&ggmT4 zHJSqfs?i(-P>tqbfRcQN0F>lA6rd#EVE`rh4hJaUYllEGf*k_M2r-60RdYg1$JJYQ zGTBjsYJ~EOvIbFBlN+@c4#bVx8{kGQ0Ju^60NkjBfUTV=BXFY@VGVB7i2yh1B*50r zlo8&SHQvV>iv){+4+I|owrrG)@Vl(>JJwhwSOxqc_yhP;@F%cZuo_q+SOb)Iq|m@! 
zQN6fql>X&$#f}za1pC4f8NvQ$BqN;1P4={{uUaY<2ewHxNFo1+vw>0iGRu}X=U*2RxXG!x@ORS)HMm_1LWCg`DrlS}} ztaDEDXHH?=gX2ebCqB3{SZ3Hgdrr%etOQ(GuhB}yqOlHC)`i_t@4$u4Qmnm`|IB&a zTWTn&{hj%_brN?cY1cK!PZY*&w#pbe>%EB}89S^wPVb*qjg^U|A`DA%23x@s<6nzT0M5?LDC zrc#5$oKToDthxQ;4`Tyy;Ha|;X3-`ny5`o^1DR*ZUmCdcOT;7fa&HrfFA=GVt5xi= zBjw)K^`hKcK~~}DSUZIZ>$?L_F@1}*O=drH#)pqw^wzqh`%p}`H^I(3a913bgB#oEka zDul=SJ=dg*Uu>#Ko3Vr6E`+y>l-HzowHwDS{h}Q3W^G-gL=hu_ayV(w_Kuu%iq-no zGea8d?6#cGoS}6K&T~sWs@xsV^fZ3CI~=P4XK{zaQ?yCIv!U6+U71)X8sKM|RN5x~ zGkJgq3a}~17G;b16yiB#36vuinOfEr4-A~D<@N;WnHfwRHZ?YDx0xVLz9Lb6`$X< zV-*{7NAOfig&M*yNZX6upVh_O8yGIw2Y60UAK(RP4S`GVHK0>K%;LO~mVz~3_g0)Ni}2>d-8 zAn^Ab;6s6p(3cTz!x{ws-VPA>dj~+^Z%2T@-#Znvzo>K8ezlyZvRc#%^{($Os1fR2 zl(m4s{Q_-Jz?OXkeSktiA<$RQ7w9MG2RtEo0_ZR34-60t0G<>)2@Dhr1bn>J&?fX@ zD5|R1ggzPJQ>;OR>}LQGvY!KA2)+Qm6nqJMCHM+hB3J_0ggzPJA6ercStHb`fjhH$ z$eGx5rVY}!RSo17HlE=U6o@2>Z%+E zP*>%6fVwIt0B;CR1l|;!1n^e*765ORKN;Yy@}~e4*lh_=VE0sj0=umM3hbT+%oEhC zzuk+=R~}^NPGpPgG0HMQ2VlA227n54Hv%gJHvuaJHv`Px-U5&`eJeoHbjF54`?~A5 zwFRE+e31N(`YJEsfYSui0IKZTp^%L5G1ees`V+tog=B=l$~H76W7^3c(W24JX7(o} z{EO!?Ted+)IAWs+H3N7aI}+e|tT{lHxuXDE>RU!&Qu-pS@r9rrz@+rW0F%;}08C1^ z2bh$;6rjr7WdK#?E(fSGcLhL|xhnyx%v}ZiAh;U%QE(0Lli*t5XTf#AFM=&@7-Z{Q za_Q_aNJg;3AQ^!lu|LEmJ`#KcP&j5Q90n4Oppa{HEwH>py%n~d zX*ZS?((+lTuz?;g&4Cp!SzGTpsJ-tk}bbzP7dSjw#$b25Aq;qHG$(}nIm2_HX z-`P6B4Hu9vn^P#~JLFA6QMO2NF}-g$_R*&&hZhx9+x#TM{0ErmCcn zE#sE=5)Os(y8|=|F{< zrHXZ$TT1CRW6oG-k^E=reezO@_-XM|#LF?on*50-wTCU+=90O literal 0 HcmV?d00001 diff --git a/pkg/ingester/cortex-wal/00000006 b/pkg/ingester/cortex-wal/00000006 new file mode 100644 index 00000000000..e69de29bb2d diff --git a/pkg/ingester/cortex-wal/checkpoint.001065/00000000 b/pkg/ingester/cortex-wal/checkpoint.001065/00000000 new file mode 100644 index 0000000000000000000000000000000000000000..b227d8213d8ff9b4cf2dd4632a0213f36d4bd371 GIT binary patch literal 32768 zcmeI&SxgmY6bJCRFgn9h=BU&ejJ>tAxixJ%!UYj*qr~{oq%n2Jg@(35$auREMcas?;&r 
ztx5e3j;|z#G$wYPsx~W7spUn<&7s*XGJMw`B|rLCWt0rZG}xrG^T(8eDA`l^=!e_8&NF2Z;4tdDX=*L6$@_?XuG$~Ml zJerV)%Fm-xgM8#si9CAvd2G_*4)WN9Jd*f%^f;jod3=mKI{0~ zgxfwIZsg$>;Gx%^+4haa<4_Uu_+c;}2?Ka|r$;DU_2Uuh<)H@6qfvo!#zSg!YqG?z zqe6o`)KQ__yD4|_>&Vxk8+GKPjzoSPcbsqrb$o<6{^r+F=Yl<`V>arT&97sx3AcPY z;!#JufDWU<`PfsIjsq3=Y#DTTqy*9t2G{&_gn4zu2Cbt_f#Wzl+HiQp@$;zBpa^-? z;P8kn;+m_K?AfwK2M_Ytf;>|BdE9eCJ@Uv#9)0{gF1p|l@>qa87Vz^pY(kHZ#{}dt zL4Zf}-HI1RSv)RpL>_|(fiGMyYN2is33T3g2z1xiuImC)=~ zwPSrM*GF5*KH4%3a*#$D(zwAtIdXOAL>jqBBaWX&mlIAQjSrBPvhM8`jnms7O2a*XhIj1 zOt|CYk%T;wM0h+sEiKjJ(Y6O?$DnVtNr89_g=Rk*7=9kr8Wb=dQnmSn zD$D#l3U%m19)%IljaKEC_?XBcU?nZpU?a|rQpIF*gMaJD(cw12$icZ`^1JAC!bxbFK)1Kh<2rP2RER~MIH9I zgDYMue%+Z#Y40#r8exrCkq=TI6iFAh8eq8r>(Gjh_~7z@){EKH;M3)8JS^s~ZL+G;)@DNp4uOz%745*DT_ zH2f`qe*VVvvI~x2W15bQX*z#ndenqIUt^krjcJNNW2*l-_toi^57EYyN-_V%lzor8 zsXa=u)ory?EghnjDc!~edbrr*Htyw}tBzPaL_1TujSIxX7PoQ6%|#>Lv3Q7frqqcA z;$e$U%t-$|;U|lSXk|(_E`fO1;>Km`<5}*Vq0ZM=ppaduZN`Mle1SkPYfD)htC;>`<5}*Vq0ZM=ppaduZ zN`Mle1SkPYfD)htC;>`<5}*Vq0ZM=ppaduZN`Mle1SkPYfD)htC;>`<5}*Vq0ZQP1 Gn83ek-*=k; literal 0 HcmV?d00001 diff --git a/pkg/ingester/cortex-wal/checkpoint.001066/00000000 b/pkg/ingester/cortex-wal/checkpoint.001066/00000000 new file mode 100644 index 0000000000000000000000000000000000000000..f1c6a6e8ea06392b74d6a7bc37ffa9d74266bd65 GIT binary patch literal 32768 zcmeI&Sxi)C6bJA-aP?l6a*s;A7h|WEHdE84R~Qh%HcE^SO&U{oTxe)(cxV%~3*Br{ z)7EOWwY5!TQ9z7!c_{81n;?=jP1?HPf{L<b`xzg*R#k^UlVUhxSE`cikQ_rK(;=zRsSZ<N`Cb3$|xC*X|PFW=Z`4`QL?A-(GRnK ziRU-#(Bow1H%GAZ$ve2u_d4OMi=F>OW9M&`aGyWtf_)}CKX(K>KX*Ux`N;hybbcLW zWJ>HcNzmEfNI1;(H%g-9un^s7D#c$6_7{hJ5lkvM=y9P*H%(T|7hy>w>+=V-E6|!_Q-% z3AcSb+{nW%z(cP+v+Wy;$DtzR@xx#|5(e<_PLEKy>c=D0%R>#CN23DejEB_d)?|ra zM}-D?sG~x;cT?`<*O9M7H|oeo9f|xp?l|EL>i7tC{LQbU&INl=$86Lwn_tIX6K?r* z#G{UQ0Ubtz^RcHa9S17#*)r(xNC~7P46ga<2=nTQ4O&N=0>^Q9wBhiG~tvUptHh&%=z9`S*AIH19ghXZ-Y9g`=4WI8!k4z{06w6=~b3Y4OZE1}t~ zYRCFiu9voyy|iT-8?8mG5El!ki%4e#0!0$2QKgh*~iBh|sSQ3gbGC%vk$ z*O!HJ73W6JdM+IJ|Gr#vNTK#aS-p9(ApdTyO6sSfX 
z*O13p{>RF34GNLRapa-$KURu#xQjfBkjEr`9v&xLKpvkVj~;#=^)5JwJmw>h`TRT% znQ+I)BMEsViST%OT3V{bqiqk)jzQniCI#X#6q@~b4E6Fbg67epus5YFo)+X`@bjqB z*qc&zz6yCr{5%SD=tUj{$YUZuk3J`yLmtbJM<+jz^DfwrJkpRy8b6N%CUp6DBqEPQ z0Uj|m-)^n7c=T<+=`kpe2?Ka|-?Avs;>Sbr@)#X7k5&b$kVh->7{kw_T7v?{L#j5P zP-U5)N1+aV$fGdgxzVcp62FhTPB@P~K1Cnh{5~$Y-~jrVhd$=<`#5Mqx6j9T^f6w* z$FSa%`enKQ6=I%U5A_k}=VLl6#NuMtsBqa&2X2;0!*J=C_$1#-%63_A5!RrLk zU6h#ckI#jPE=&OzMtx=1v);Mk7A{KZu`;m3#iiI5j}^mnKV_YdhhR}kJ$fJ>_UO^S z+OTA&#Y40xr5-&H4_ox;Mt;w0)fNxYrZgPd{ML?e?^60q&`arc1!{2lxQ?;e+2a2R)wg}8exafQh zT=7~d$I>BMmeL(upvA)$cW^OM$F}b+9-?I_wP6oFS=cDS@nMTL%-Fp;ZH47Sv@NB( zmO$gf7I!Vt@AmzcXYt@HOYzf#-g{cQ!uQ`jaB|q<3g2iQx~0uaBLP+l7Gl(w2AUkU z=u2b9CM{p@<01IDV_H0Hym-jJPT67k5G_opPY*OcY|*F3+%VQOT0BGx(+IfkSC~e4 z3sYCng=xD2C$KPW$HFv~zc8)Upco6&S}aVX`3uvnI`p%`RN87jA1P1eFHG+{;Sv_6 zD>VERKtF$Ddf5d>urW=?#x$M3F+FNRpRX}Z!NxR2pfT0|ocro@%ZF%VN~M_pV#1`<5}*Vq0ZM=ppaduZN`Mle1SkPYfD)htC;>`<5}*Vq0ZM=ppaduZN`Mle1SkPY zfD)htC;>`<5}*Vq0ZM=ppaduZN`Mle1SkPYfD)htC;>`<5}*Vq0ZM=ppaduZN`Mmh HA13fGlPPzb literal 0 HcmV?d00001 diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index a0160b60044..3a56fdd6629 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -477,7 +477,7 @@ func (i *Ingester) QueryStream(req *client.QueryRequest, stream client.Ingester_ } numSeries++ - wireChunks, err := toWireChunks(chunks) + wireChunks, err := toWireChunks(chunks, nil) if err != nil { return err } diff --git a/pkg/ingester/transfer.go b/pkg/ingester/transfer.go index 4a915b00b6a..9ec0580fcf0 100644 --- a/pkg/ingester/transfer.go +++ b/pkg/ingester/transfer.go @@ -135,8 +135,12 @@ func (i *Ingester) TransferChunks(stream client.Ingester_TransferChunksServer) e return nil } -func toWireChunks(descs []*desc) ([]client.Chunk, error) { - wireChunks := make([]client.Chunk, 0, len(descs)) +// The passed wireChunks slice is for re-use. 
+func toWireChunks(descs []*desc, wireChunks []client.Chunk) ([]client.Chunk, error) { + if cap(wireChunks) < len(descs) { + wireChunks = make([]client.Chunk, 0, len(descs)) + } + wireChunks = wireChunks[:0] for _, d := range descs { wireChunk := client.Chunk{ StartTimestampMs: int64(d.FirstTime), @@ -229,6 +233,7 @@ func (i *Ingester) transferOut(ctx context.Context) error { return errors.Wrap(err, "TransferChunks") } + var chunks []client.Chunk for userID, state := range userStatesCopy { for pair := range state.fpToSeries.iter() { state.fpLocker.Lock(pair.fp) @@ -238,7 +243,8 @@ func (i *Ingester) transferOut(ctx context.Context) error { continue } - chunks, err := toWireChunks(pair.series.chunkDescs) + var err error + chunks, err = toWireChunks(pair.series.chunkDescs, chunks) if err != nil { state.fpLocker.Unlock(pair.fp) return errors.Wrap(err, "toWireChunks") diff --git a/pkg/ingester/wal.go b/pkg/ingester/wal.go index a91403c03ef..6a2a967acd8 100644 --- a/pkg/ingester/wal.go +++ b/pkg/ingester/wal.go @@ -61,8 +61,7 @@ type walWrapper struct { quit chan struct{} wait sync.WaitGroup - lastWalSegment int - wal *wal.WAL + wal *wal.WAL // Checkpoint metrics. 
checkpointDeleteFail prometheus.Counter @@ -76,6 +75,14 @@ func newWAL(cfg WALConfig, ingester *Ingester) (WAL, error) { return &noop{}, nil } + level.Info(util.Logger).Log("msg", "recovering from WAL") + start := time.Now() + if err := recoverFromWAL(ingester); err != nil { + return nil, err + } + elapsed := time.Since(start) + level.Info(util.Logger).Log("msg", "recovery from WAL completed", "time", elapsed.String()) + var walRegistry prometheus.Registerer if cfg.metricsRegisterer != nil { walRegistry = prometheus.WrapRegistererWith(prometheus.Labels{"kind": "wal"}, cfg.metricsRegisterer) @@ -86,11 +93,10 @@ func newWAL(cfg WALConfig, ingester *Ingester) (WAL, error) { } w := &walWrapper{ - cfg: cfg, - ingester: ingester, - quit: make(chan struct{}), - wal: tsdbWAL, - lastWalSegment: -1, + cfg: cfg, + ingester: ingester, + quit: make(chan struct{}), + wal: tsdbWAL, } w.checkpointDeleteFail = prometheus.NewCounter(prometheus.CounterOpts{ @@ -118,14 +124,6 @@ func newWAL(cfg WALConfig, ingester *Ingester) (WAL, error) { ) } - level.Info(util.Logger).Log("msg", "recovering from WAL") - start := time.Now() - if err := recoverFromWAL(ingester); err != nil { - return nil, err - } - elapsed := time.Since(start) - level.Info(util.Logger).Log("msg", "recovery from WAL completed", "time", elapsed.String()) - w.wait.Add(1) go w.run() return w, nil @@ -155,13 +153,14 @@ func (w *walWrapper) run() { for !w.isStopped() { select { case <-ticker.C: + start := time.Now() + level.Info(util.Logger).Log("msg", "starting checkpoint") if err := w.checkpoint(); err != nil { level.Error(util.Logger).Log("msg", "Error checkpointing series", "err", err) continue } - if err := w.truncateSamples(); err != nil { - level.Error(util.Logger).Log("msg", "Error truncating wal", "err", err) - } + elapsed := time.Since(start) + level.Info(util.Logger).Log("msg", "checkpoint done", "time", elapsed.String()) case <-w.quit: return } @@ -186,14 +185,21 @@ func (w *walWrapper) checkpoint() (err error) { 
w.checkpointCreationFail.Inc() } }() - _, last, err := lastCheckpoint(w.wal.Dir()) + + _, lastSegment, err := w.wal.Segments() if err != nil { return err } - newIdx := last + 1 + _, lastCh, err := lastCheckpoint(w.wal.Dir()) + if err != nil { + return err + } + + newIdx := lastCh + 1 cpdir := filepath.Join(w.wal.Dir(), fmt.Sprintf(checkpointPrefix+"%06d", newIdx)) + level.Info(util.Logger).Log("msg", "attempting checkpoint for", "dir", cpdir) cpdirtmp := cpdir + ".tmp" if err := os.MkdirAll(cpdirtmp, 0777); err != nil { @@ -208,19 +214,11 @@ func (w *walWrapper) checkpoint() (err error) { os.RemoveAll(cpdirtmp) }() - // Count number of series - we'll use this to rate limit checkpoints. - numSeries := 0 - for _, state := range w.ingester.userStates.cp() { - numSeries += state.fpToSeries.length() - } - if numSeries == 0 { - return nil - } - + var wireChunkBuf []client.Chunk for userID, state := range w.ingester.userStates.cp() { for pair := range state.fpToSeries.iter() { state.fpLocker.Lock(pair.fp) - err := w.checkpointSeries(cp, userID, pair.fp, pair.series) + wireChunkBuf, err = w.checkpointSeries(cp, userID, pair.fp, pair.series, wireChunkBuf) state.fpLocker.Unlock(pair.fp) if err != nil { return err @@ -235,8 +233,16 @@ func (w *walWrapper) checkpoint() (err error) { return errors.Wrap(err, "rename checkpoint directory") } - if last >= 0 { - return w.deleteCheckpoints(last) + // The last segment might still have been active during the checkpointing, + // hence delete only the segments before that. 
+ if err := w.wal.Truncate(lastSegment - 1); err != nil { + return err + } + + if lastCh >= 0 { + if err := w.deleteCheckpoints(lastCh); err != nil { + level.Error(util.Logger).Log("msg", "error deleting old checkpoint", "err", err) + } } return nil @@ -298,10 +304,11 @@ func (w *walWrapper) deleteCheckpoints(maxIndex int) (err error) { return errs.Err() } -func (w *walWrapper) checkpointSeries(cp *wal.WAL, userID string, fp model.Fingerprint, series *memorySeries) error { - wireChunks, err := toWireChunks(series.chunkDescs) +func (w *walWrapper) checkpointSeries(cp *wal.WAL, userID string, fp model.Fingerprint, series *memorySeries, wireChunks []client.Chunk) ([]client.Chunk, error) { + var err error + wireChunks, err = toWireChunks(series.chunkDescs, wireChunks) if err != nil { - return err + return wireChunks, err } buf, err := proto.Marshal(&Series{ @@ -311,27 +318,10 @@ func (w *walWrapper) checkpointSeries(cp *wal.WAL, userID string, fp model.Finge Chunks: wireChunks, }) if err != nil { - return err + return wireChunks, err } - return cp.Log(buf) -} - -// truncateSamples removed the wal from before the checkpoint. -func (w *walWrapper) truncateSamples() error { - _, last, err := w.wal.Segments() - if err != nil { - return err - } - - // The last segment might still have been active after the checpoint, - // hence delete only the segments before that. - if err := w.wal.Truncate(w.lastWalSegment - 1); err != nil { - return err - } - - w.lastWalSegment = last - return nil + return wireChunks, cp.Log(buf) } func recoverFromWAL(ingester *Ingester) (err error) { @@ -347,7 +337,7 @@ func recoverFromWAL(ingester *Ingester) (err error) { } if idx >= 0 { - level.Debug(util.Logger).Log("msg", "recovering from checkpoint", "checkpoint", lastCheckpointDir) + level.Info(util.Logger).Log("msg", "recovering from checkpoint", "checkpoint", lastCheckpointDir) // Checkpoint exists. 
start := time.Now() numSeries := 0 @@ -381,12 +371,17 @@ func recoverFromWAL(ingester *Ingester) (err error) { return err } elapsed := time.Since(start) - level.Debug(util.Logger).Log("msg", "recovered from checkpoint", "time", elapsed.String(), "num_series", numSeries, "num_chunks", numChunks) + level.Info(util.Logger).Log("msg", "recovered from checkpoint", "time", elapsed.String(), "num_series", numSeries, "num_chunks", numChunks) } else { - level.Debug(util.Logger).Log("msg", "no checkpoint found") + level.Info(util.Logger).Log("msg", "no checkpoint found") + } + + if segExists, err := segmentsExist(walDir); err == nil && !segExists { + level.Info(util.Logger).Log("msg", "no segments found, skipping recover from segments") + return nil } - level.Debug(util.Logger).Log("msg", "recovering from segments", "dir", walDir) + level.Info(util.Logger).Log("msg", "recovering from segments", "dir", walDir) numRecords := 0 numSeries := 0 numSamples := 0 @@ -396,7 +391,7 @@ func recoverFromWAL(ingester *Ingester) (err error) { numRecords++ state := userStates.getOrCreate(record.UserId) if numRecords%10000 == 0 { - level.Debug(util.Logger).Log("msg", "records milestone", "num_records", numRecords) + level.Info(util.Logger).Log("msg", "records milestone", "num_records", numRecords) } for _, labels := range record.Labels { _, ok := state.fpToSeries.get(model.Fingerprint(labels.Fingerprint)) @@ -443,7 +438,7 @@ func recoverFromWAL(ingester *Ingester) (err error) { return err } elapsed := time.Since(start) - level.Debug(util.Logger).Log("msg", "recovered from segments", "time", elapsed.String(), "num_new_series", numSeries, "num_records", numRecords, "num_samples", numSamples) + level.Info(util.Logger).Log("msg", "recovered from segments", "time", elapsed.String(), "num_new_series", numSeries, "num_records", numRecords, "num_samples", numSamples) ingester.userStatesMtx.Lock() ingester.userStates = userStates @@ -476,3 +471,16 @@ func recoverRecords(name string, ty proto.Message, 
callback func(proto.Message) return nil } + +func segmentsExist(dir string) (bool, error) { + files, err := fileutil.ReadDir(dir) + if err != nil { + return false, err + } + for _, fn := range files { + if _, err := strconv.Atoi(fn); err == nil { + return true, nil + } + } + return false, nil +} From 2d85a982516ca433803e6c09d8e2015f3d084ec9 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Tue, 17 Sep 2019 22:33:40 +0530 Subject: [PATCH 24/56] Process WAL records in parallel. Also create a checkpoint during graceful shutdown. Signed-off-by: Ganesh Vernekar --- pkg/ingester/wal.go | 241 ++++++++++++++++++++++++++++---------------- 1 file changed, 153 insertions(+), 88 deletions(-) diff --git a/pkg/ingester/wal.go b/pkg/ingester/wal.go index 6a2a967acd8..b5aa1720843 100644 --- a/pkg/ingester/wal.go +++ b/pkg/ingester/wal.go @@ -6,6 +6,7 @@ import ( "io/ioutil" "os" "path/filepath" + "runtime" "strconv" "strings" "sync" @@ -156,12 +157,15 @@ func (w *walWrapper) run() { start := time.Now() level.Info(util.Logger).Log("msg", "starting checkpoint") if err := w.checkpoint(); err != nil { - level.Error(util.Logger).Log("msg", "Error checkpointing series", "err", err) + level.Error(util.Logger).Log("msg", "error checkpointing series", "err", err) continue } elapsed := time.Since(start) level.Info(util.Logger).Log("msg", "checkpoint done", "time", elapsed.String()) case <-w.quit: + if err := w.checkpoint(); err != nil { + level.Error(util.Logger).Log("msg", "error checkpointing series during shutdown", "err", err) + } return } } @@ -329,49 +333,20 @@ func recoverFromWAL(ingester *Ingester) (err error) { // Use a local userStates, so we don't need to worry about locking. 
userStates := newUserStates(ingester.limits, ingester.cfg) - la := []client.LabelAdapter{} - lastCheckpointDir, idx, err := lastCheckpoint(walDir) if err != nil { return err } if idx >= 0 { - level.Info(util.Logger).Log("msg", "recovering from checkpoint", "checkpoint", lastCheckpointDir) // Checkpoint exists. + level.Info(util.Logger).Log("msg", "recovering from checkpoint", "checkpoint", lastCheckpointDir) start := time.Now() - numSeries := 0 - numChunks := 0 - if err := recoverRecords(lastCheckpointDir, &Series{}, func(msg proto.Message) error { - numSeries++ - walSeries := msg.(*Series) - - descs, err := fromWireChunks(walSeries.Chunks) - if err != nil { - return err - } - numChunks += len(descs) - - state := userStates.getOrCreate(walSeries.UserId) - - la = la[:0] - for _, l := range walSeries.Labels { - la = append(la, client.LabelAdapter{ - Name: string(l.Name), - Value: string(l.Value), - }) - } - series, err := state.createSeriesWithFingerprint(model.Fingerprint(walSeries.Fingerprint), la, nil, true) - if err != nil { - return err - } - - return series.setChunks(descs) - }); err != nil { + if err := processCheckpoint(lastCheckpointDir, userStates); err != nil { return err } elapsed := time.Since(start) - level.Info(util.Logger).Log("msg", "recovered from checkpoint", "time", elapsed.String(), "num_series", numSeries, "num_chunks", numChunks) + level.Info(util.Logger).Log("msg", "recovered from checkpoint", "time", elapsed.String()) } else { level.Info(util.Logger).Log("msg", "no checkpoint found") } @@ -382,72 +357,107 @@ func recoverFromWAL(ingester *Ingester) (err error) { } level.Info(util.Logger).Log("msg", "recovering from segments", "dir", walDir) - numRecords := 0 - numSeries := 0 - numSamples := 0 start := time.Now() - if err := recoverRecords(walDir, &Record{}, func(msg proto.Message) error { - record := msg.(*Record) - numRecords++ - state := userStates.getOrCreate(record.UserId) - if numRecords%10000 == 0 { - level.Info(util.Logger).Log("msg", 
"records milestone", "num_records", numRecords) + if err := processWAL(walDir, userStates); err != nil { + return err + } + elapsed := time.Since(start) + level.Info(util.Logger).Log("msg", "recovered from segments", "time", elapsed.String()) + + ingester.userStatesMtx.Lock() + ingester.userStates = userStates + ingester.userStatesMtx.Unlock() + + return nil +} + +func processCheckpoint(name string, userStates *userStates) error { + numSeries := 0 + numChunks := 0 + walSeries := &Series{} + + segmentReader, err := wal.NewSegmentsReader(name) + if err != nil { + return err + } + defer segmentReader.Close() + + la := []client.LabelAdapter{} + reader := wal.NewReader(segmentReader) + for reader.Next() { + walSeries.Reset() + if err := proto.Unmarshal(reader.Record(), walSeries); err != nil { + return err } - for _, labels := range record.Labels { - _, ok := state.fpToSeries.get(model.Fingerprint(labels.Fingerprint)) - if ok { - continue - } - la = la[:0] - for _, l := range labels.Labels { - la = append(la, client.LabelAdapter{ - Name: string(l.Name), - Value: string(l.Value), - }) - } - _, err := state.createSeriesWithFingerprint(model.Fingerprint(labels.Fingerprint), la, nil, true) - if err != nil { - return err - } - numSeries++ + numSeries++ + + descs, err := fromWireChunks(walSeries.Chunks) + if err != nil { + return err } + numChunks += len(descs) - for _, sample := range record.Samples { - series, ok := state.fpToSeries.get(model.Fingerprint(sample.Fingerprint)) - if !ok { - return nil - } + state := userStates.getOrCreate(walSeries.UserId) - err := series.add(model.SamplePair{ - Timestamp: model.Time(sample.Timestamp), - Value: model.SampleValue(sample.Value), + la = la[:0] + for _, l := range walSeries.Labels { + la = append(la, client.LabelAdapter{ + Name: string(l.Name), + Value: string(l.Value), }) - if err != nil { - // We can ignore memorySeriesError because duplicate (or) out-of-order samples are possible - // here because the WAL is not truncated to 
align with the checkpoint. - if _, ok := err.(*memorySeriesError); !ok { - return err - } - } } - numSamples += len(record.Samples) + series, err := state.createSeriesWithFingerprint(model.Fingerprint(walSeries.Fingerprint), la, nil, true) + if err != nil { + return err + } - return nil - }); err != nil { + if err := series.setChunks(descs); err != nil { + return err + } + } + if err := reader.Err(); err != nil { return err } - elapsed := time.Since(start) - level.Info(util.Logger).Log("msg", "recovered from segments", "time", elapsed.String(), "num_new_series", numSeries, "num_records", numRecords, "num_samples", numSamples) - ingester.userStatesMtx.Lock() - ingester.userStates = userStates - ingester.userStatesMtx.Unlock() + level.Info(util.Logger).Log("msg", "checkpoint recovery stats", "num_series", numSeries, "num_chunks", numChunks) return nil } -func recoverRecords(name string, ty proto.Message, callback func(proto.Message) error) error { +func processWAL(name string, userStates *userStates) error { + var ( + chanErr error + numProcs = runtime.GOMAXPROCS(0) + recordChan = make(chan *Record, 1000) + errChan = make(chan error, numProcs) + wg = sync.WaitGroup{} + recordPool = sync.Pool{ + New: func() interface{} { + return &Record{} + }, + } + ) + + wg.Add(numProcs) + for i := 0; i < numProcs; i++ { + go func() { + defer func() { + wg.Done() + }() + var err error + var la []client.LabelAdapter + for record := range recordChan { + la, err = processWALRecord(record, userStates, la) + recordPool.Put(record) + if err != nil { + errChan <- err + return + } + } + }() + } + segmentReader, err := wal.NewSegmentsReader(name) if err != nil { return err @@ -455,23 +465,78 @@ func recoverRecords(name string, ty proto.Message, callback func(proto.Message) defer segmentReader.Close() reader := wal.NewReader(segmentReader) +Loop: for reader.Next() { - ty.Reset() - if err := proto.Unmarshal(reader.Record(), ty); err != nil { + msg := recordPool.Get().(*Record) + if err := 
proto.Unmarshal(reader.Record(), msg); err != nil { return err } - if err := callback(ty); err != nil { - return err + select { + case chanErr = <-errChan: + break Loop + case recordChan <- msg: } } + close(recordChan) + wg.Wait() + + if chanErr != nil { + return chanErr + } if err := reader.Err(); err != nil { return err } - return nil } +func processWALRecord(record *Record, userStates *userStates, la []client.LabelAdapter) ([]client.LabelAdapter, error) { + state := userStates.getOrCreate(record.UserId) + for _, labels := range record.Labels { + _, ok := state.fpToSeries.get(model.Fingerprint(labels.Fingerprint)) + if ok { + continue + } + + la = la[:0] + for _, l := range labels.Labels { + la = append(la, client.LabelAdapter{ + Name: string(l.Name), + Value: string(l.Value), + }) + } + _, err := state.createSeriesWithFingerprint(model.Fingerprint(labels.Fingerprint), la, nil, true) + if err != nil { + return la, err + } + } + + for _, sample := range record.Samples { + state.fpLocker.Lock(model.Fingerprint(sample.Fingerprint)) + series, ok := state.fpToSeries.get(model.Fingerprint(sample.Fingerprint)) + if !ok { + state.fpLocker.Unlock(model.Fingerprint(sample.Fingerprint)) + return la, nil + } + + err := series.add(model.SamplePair{ + Timestamp: model.Time(sample.Timestamp), + Value: model.SampleValue(sample.Value), + }) + if err != nil { + // We can ignore memorySeriesError because duplicate (or) out-of-order samples are possible + // here because the WAL is not truncated to align with the checkpoint. 
+ if _, ok := err.(*memorySeriesError); !ok { + state.fpLocker.Unlock(model.Fingerprint(sample.Fingerprint)) + return la, err + } + } + state.fpLocker.Unlock(model.Fingerprint(sample.Fingerprint)) + } + + return la, nil +} + func segmentsExist(dir string) (bool, error) { files, err := fileutil.ReadDir(dir) if err != nil { From 50d22c82073214d9c5dbab13e8aa5adc834a6774 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Wed, 18 Sep 2019 00:21:16 +0530 Subject: [PATCH 25/56] Added comments and some refactoring and not returning on no-series for samples Signed-off-by: Ganesh Vernekar --- pkg/ingester/wal.go | 65 ++++++++++++++++++++++++++++++++------------- 1 file changed, 47 insertions(+), 18 deletions(-) diff --git a/pkg/ingester/wal.go b/pkg/ingester/wal.go index b5aa1720843..932c9b7feb8 100644 --- a/pkg/ingester/wal.go +++ b/pkg/ingester/wal.go @@ -3,6 +3,7 @@ package ingester import ( "flag" "fmt" + "io" "io/ioutil" "os" "path/filepath" @@ -308,6 +309,7 @@ func (w *walWrapper) deleteCheckpoints(maxIndex int) (err error) { return errs.Err() } +// checkpointSeries write the chunks of the series to the checkpoint. func (w *walWrapper) checkpointSeries(cp *wal.WAL, userID string, fp model.Fingerprint, series *memorySeries, wireChunks []client.Chunk) ([]client.Chunk, error) { var err error wireChunks, err = toWireChunks(series.chunkDescs, wireChunks) @@ -371,19 +373,20 @@ func recoverFromWAL(ingester *Ingester) (err error) { return nil } +// processCheckpoint loads the chunks of the series present in the last checkpoint. 
func processCheckpoint(name string, userStates *userStates) error { numSeries := 0 numChunks := 0 walSeries := &Series{} - segmentReader, err := wal.NewSegmentsReader(name) + la := []client.LabelAdapter{} + + reader, closer, err := walReader(name) if err != nil { return err } - defer segmentReader.Close() + defer closer.Close() - la := []client.LabelAdapter{} - reader := wal.NewReader(segmentReader) for reader.Next() { walSeries.Reset() if err := proto.Unmarshal(reader.Record(), walSeries); err != nil { @@ -425,11 +428,13 @@ func processCheckpoint(name string, userStates *userStates) error { return nil } +// processWAL processes the records in the WAL concurrently. func processWAL(name string, userStates *userStates) error { var ( - chanErr error numProcs = runtime.GOMAXPROCS(0) - recordChan = make(chan *Record, 1000) + recordChan = make(chan *Record, 1024) + // errChan is to capture the errors from goroutine. + // The channel size is numProcs to not block any worker if all of them error out. errChan = make(chan error, numProcs) wg = sync.WaitGroup{} recordPool = sync.Pool{ @@ -439,6 +444,13 @@ func processWAL(name string, userStates *userStates) error { } ) + reader, closer, err := walReader(name) + if err != nil { + return err + } + defer closer.Close() + + // Creating workers. wg.Add(numProcs) for i := 0; i < numProcs; i++ { go func() { @@ -458,13 +470,8 @@ func processWAL(name string, userStates *userStates) error { }() } - segmentReader, err := wal.NewSegmentsReader(name) - if err != nil { - return err - } - defer segmentReader.Close() - - reader := wal.NewReader(segmentReader) + // Iterating the WAL records. + var errFromChan error Loop: for reader.Next() { msg := recordPool.Get().(*Record) @@ -473,7 +480,9 @@ Loop: } select { - case chanErr = <-errChan: + case errFromChan = <-errChan: + // Exit early on an error. + // Only acts upon the first error received. 
break Loop case recordChan <- msg: } @@ -481,8 +490,8 @@ Loop: close(recordChan) wg.Wait() - if chanErr != nil { - return chanErr + if errFromChan != nil { + return errFromChan } if err := reader.Err(); err != nil { return err @@ -492,6 +501,8 @@ Loop: func processWALRecord(record *Record, userStates *userStates, la []client.LabelAdapter) ([]client.LabelAdapter, error) { state := userStates.getOrCreate(record.UserId) + + // Create the series from labels which do not exist. for _, labels := range record.Labels { _, ok := state.fpToSeries.get(model.Fingerprint(labels.Fingerprint)) if ok { @@ -515,8 +526,13 @@ func processWALRecord(record *Record, userStates *userStates, la []client.LabelA state.fpLocker.Lock(model.Fingerprint(sample.Fingerprint)) series, ok := state.fpToSeries.get(model.Fingerprint(sample.Fingerprint)) if !ok { + // This should ideally not happen. + // If the series was not created in recovering checkpoint or + // from the labels of any records previous to this, there + // is no way to get the labels for this fingerprint. state.fpLocker.Unlock(model.Fingerprint(sample.Fingerprint)) - return la, nil + level.Warn(util.Logger).Log("msg", "series not found for sample during wal recovery", "fingerprint", model.Fingerprint(sample.Fingerprint).String()) + continue } err := series.add(model.SamplePair{ @@ -525,7 +541,7 @@ func processWALRecord(record *Record, userStates *userStates, la []client.LabelA }) if err != nil { // We can ignore memorySeriesError because duplicate (or) out-of-order samples are possible - // here because the WAL is not truncated to align with the checkpoint. + // here because the WAL is not truncated to exactly align with the checkpoint. 
if _, ok := err.(*memorySeriesError); !ok { state.fpLocker.Unlock(model.Fingerprint(sample.Fingerprint)) return la, err @@ -537,6 +553,8 @@ func processWALRecord(record *Record, userStates *userStates, la []client.LabelA return la, nil } +// segmentsExist is a stripped down version of +// https://github.com/prometheus/prometheus/blob/4c648eddf47d7e07fbc74d0b18244402200dca9e/tsdb/wal/wal.go#L739-L760. func segmentsExist(dir string) (bool, error) { files, err := fileutil.ReadDir(dir) if err != nil { @@ -544,8 +562,19 @@ func segmentsExist(dir string) (bool, error) { } for _, fn := range files { if _, err := strconv.Atoi(fn); err == nil { + // First filename which is a number. + // This is how Prometheus stores and this + // is how it checks too. return true, nil } } return false, nil } + +func walReader(name string) (*wal.Reader, io.Closer, error) { + segmentReader, err := wal.NewSegmentsReader(name) + if err != nil { + return nil, nil, err + } + return wal.NewReader(segmentReader), segmentReader, nil +} From 7282ee8d1a882ecdb9abdb3e85f39aa1033f7e15 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Wed, 18 Sep 2019 11:16:38 +0530 Subject: [PATCH 26/56] Process checkpoint series in parallel Signed-off-by: Ganesh Vernekar --- pkg/ingester/wal.go | 233 ++++++++++++++++++++++---------------------- 1 file changed, 119 insertions(+), 114 deletions(-) diff --git a/pkg/ingester/wal.go b/pkg/ingester/wal.go index 932c9b7feb8..e4395b849aa 100644 --- a/pkg/ingester/wal.go +++ b/pkg/ingester/wal.go @@ -373,130 +373,92 @@ func recoverFromWAL(ingester *Ingester) (err error) { return nil } -// processCheckpoint loads the chunks of the series present in the last checkpoint. 
-func processCheckpoint(name string, userStates *userStates) error { - numSeries := 0 - numChunks := 0 - walSeries := &Series{} - - la := []client.LabelAdapter{} - - reader, closer, err := walReader(name) +// segmentsExist is a stripped down version of +// https://github.com/prometheus/prometheus/blob/4c648eddf47d7e07fbc74d0b18244402200dca9e/tsdb/wal/wal.go#L739-L760. +func segmentsExist(dir string) (bool, error) { + files, err := fileutil.ReadDir(dir) if err != nil { - return err + return false, err } - defer closer.Close() - - for reader.Next() { - walSeries.Reset() - if err := proto.Unmarshal(reader.Record(), walSeries); err != nil { - return err + for _, fn := range files { + if _, err := strconv.Atoi(fn); err == nil { + // First filename which is a number. + // This is how Prometheus stores and this + // is how it checks too. + return true, nil } + } + return false, nil +} - numSeries++ - - descs, err := fromWireChunks(walSeries.Chunks) - if err != nil { - return err - } - numChunks += len(descs) +// processCheckpoint loads the chunks of the series present in the last checkpoint. 
+func processCheckpoint(name string, userStates *userStates) error { + seriesPool := &sync.Pool{ + New: func() interface{} { + return &Series{} + }, + } + return processRecords( + name, seriesPool, + func(seriesChan chan proto.Message, errChan chan error) { + var la []client.LabelAdapter + for s := range seriesChan { + walSeries := s.(*Series) - state := userStates.getOrCreate(walSeries.UserId) + state := userStates.getOrCreate(walSeries.UserId) - la = la[:0] - for _, l := range walSeries.Labels { - la = append(la, client.LabelAdapter{ - Name: string(l.Name), - Value: string(l.Value), - }) - } - series, err := state.createSeriesWithFingerprint(model.Fingerprint(walSeries.Fingerprint), la, nil, true) - if err != nil { - return err - } + la = la[:0] + for _, l := range walSeries.Labels { + la = append(la, client.LabelAdapter{ + Name: string(l.Name), + Value: string(l.Value), + }) + } + series, err := state.createSeriesWithFingerprint(model.Fingerprint(walSeries.Fingerprint), la, nil, true) + if err != nil { + errChan <- err + return + } - if err := series.setChunks(descs); err != nil { - return err - } - } - if err := reader.Err(); err != nil { - return err - } + descs, err := fromWireChunks(walSeries.Chunks) + if err != nil { + errChan <- err + return + } - level.Info(util.Logger).Log("msg", "checkpoint recovery stats", "num_series", numSeries, "num_chunks", numChunks) + if err := series.setChunks(descs); err != nil { + errChan <- err + return + } - return nil + seriesPool.Put(s) + } + }, + ) } // processWAL processes the records in the WAL concurrently. func processWAL(name string, userStates *userStates) error { - var ( - numProcs = runtime.GOMAXPROCS(0) - recordChan = make(chan *Record, 1024) - // errChan is to capture the errors from goroutine. - // The channel size is numProcs to not block any worker if all of them error out. 
- errChan = make(chan error, numProcs) - wg = sync.WaitGroup{} - recordPool = sync.Pool{ - New: func() interface{} { - return &Record{} - }, - } - ) - - reader, closer, err := walReader(name) - if err != nil { - return err - } - defer closer.Close() - - // Creating workers. - wg.Add(numProcs) - for i := 0; i < numProcs; i++ { - go func() { - defer func() { - wg.Done() - }() + recordPool := &sync.Pool{ + New: func() interface{} { + return &Record{} + }, + } + return processRecords( + name, recordPool, + func(recordChan chan proto.Message, errChan chan error) { var err error var la []client.LabelAdapter for record := range recordChan { - la, err = processWALRecord(record, userStates, la) + la, err = processWALRecord(record.(*Record), userStates, la) recordPool.Put(record) if err != nil { errChan <- err return } } - }() - } - - // Iterating the WAL records. - var errFromChan error -Loop: - for reader.Next() { - msg := recordPool.Get().(*Record) - if err := proto.Unmarshal(reader.Record(), msg); err != nil { - return err - } - - select { - case errFromChan = <-errChan: - // Exit early on an error. - // Only acts upon the first error received. - break Loop - case recordChan <- msg: - } - } - close(recordChan) - wg.Wait() - - if errFromChan != nil { - return errFromChan - } - if err := reader.Err(); err != nil { - return err - } - return nil + }, + ) } func processWALRecord(record *Record, userStates *userStates, la []client.LabelAdapter) ([]client.LabelAdapter, error) { @@ -553,22 +515,65 @@ func processWALRecord(record *Record, userStates *userStates, la []client.LabelA return la, nil } -// segmentsExist is a stripped down version of -// https://github.com/prometheus/prometheus/blob/4c648eddf47d7e07fbc74d0b18244402200dca9e/tsdb/wal/wal.go#L739-L760. 
-func segmentsExist(dir string) (bool, error) { - files, err := fileutil.ReadDir(dir) +func processRecords(name string, msgPool *sync.Pool, workerFunc func(chan proto.Message, chan error)) error { + var ( + numProcs = runtime.GOMAXPROCS(0) + msgChan = make(chan proto.Message, 128*numProcs) + // errChan is to capture the errors from goroutine. + // The channel size is numProcs to not block any worker if all of them error out. + errChan = make(chan error, numProcs) + wg = sync.WaitGroup{} + ) + + reader, closer, err := walReader(name) if err != nil { - return false, err + return err } - for _, fn := range files { - if _, err := strconv.Atoi(fn); err == nil { - // First filename which is a number. - // This is how Prometheus stores and this - // is how it checks too. - return true, nil + defer closer.Close() + + // Creating workers. + wg.Add(numProcs) + for i := 0; i < numProcs; i++ { + go func() { + defer func() { + wg.Done() + }() + workerFunc(msgChan, errChan) + }() + } + + // Iterating the WAL records. + var errFromChan error +Loop: + for reader.Next() { + msg := msgPool.Get().(proto.Message) + if err := proto.Unmarshal(reader.Record(), msg); err != nil { + return err + } + + select { + case errFromChan = <-errChan: + // Exit early on an error. + // Only acts upon the first error received. + break Loop + case msgChan <- msg: } } - return false, nil + close(msgChan) + wg.Wait() + + if errFromChan != nil { + return errFromChan + } + select { + case errFromChan = <-errChan: + return errFromChan + default: + if err := reader.Err(); err != nil { + return err + } + } + return nil } func walReader(name string) (*wal.Reader, io.Closer, error) { From 959cf2009f164da40d1c26bca812473d660b273d Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Tue, 24 Sep 2019 15:04:41 +0530 Subject: [PATCH 27/56] Fix race in processing WAL Adapted Prometheus style concurrency handling for processing the WAL segments. 
Signed-off-by: Ganesh Vernekar --- pkg/ingester/wal.go | 354 ++++++++++++++++++++++++--------------- pkg/ingester/wal_test.go | 24 +-- 2 files changed, 232 insertions(+), 146 deletions(-) diff --git a/pkg/ingester/wal.go b/pkg/ingester/wal.go index e4395b849aa..d3165991556 100644 --- a/pkg/ingester/wal.go +++ b/pkg/ingester/wal.go @@ -29,6 +29,7 @@ import ( // WALConfig is config for the Write Ahead Log. type WALConfig struct { enabled bool + recover bool dir string checkpointDuration time.Duration metricsRegisterer prometheus.Registerer @@ -37,6 +38,7 @@ type WALConfig struct { // RegisterFlags adds the flags required to config this to the given FlagSet func (cfg *WALConfig) RegisterFlags(f *flag.FlagSet) { f.BoolVar(&cfg.enabled, "ingester.wal-enable", false, "Enable the WAL.") + f.BoolVar(&cfg.recover, "ingester.recover-from-wal", false, "Recover data from existing WAL.") f.StringVar(&cfg.dir, "ingester.wal-dir", "", "Directory to store the WAL.") f.DurationVar(&cfg.checkpointDuration, "ingester.checkpoint-duration", 1*time.Hour, "Duration over which to checkpoint.") } @@ -73,17 +75,19 @@ type walWrapper struct { } func newWAL(cfg WALConfig, ingester *Ingester) (WAL, error) { - if !cfg.enabled { - return &noop{}, nil + if cfg.recover { + level.Info(util.Logger).Log("msg", "recovering from WAL") + start := time.Now() + if err := recoverFromWAL(ingester); err != nil { + return nil, err + } + elapsed := time.Since(start) + level.Info(util.Logger).Log("msg", "recovery from WAL completed", "time", elapsed.String()) } - level.Info(util.Logger).Log("msg", "recovering from WAL") - start := time.Now() - if err := recoverFromWAL(ingester); err != nil { - return nil, err + if !cfg.enabled { + return &noop{}, nil } - elapsed := time.Since(start) - level.Info(util.Logger).Log("msg", "recovery from WAL completed", "time", elapsed.String()) var walRegistry prometheus.Registerer if cfg.metricsRegisterer != nil { @@ -393,173 +397,214 @@ func segmentsExist(dir string) (bool, 
error) { // processCheckpoint loads the chunks of the series present in the last checkpoint. func processCheckpoint(name string, userStates *userStates) error { - seriesPool := &sync.Pool{ - New: func() interface{} { - return &Series{} - }, - } - return processRecords( - name, seriesPool, - func(seriesChan chan proto.Message, errChan chan error) { - var la []client.LabelAdapter - for s := range seriesChan { - walSeries := s.(*Series) - - state := userStates.getOrCreate(walSeries.UserId) + var ( + nWorkers = runtime.GOMAXPROCS(0) + seriesChan = make(chan *Series, 128*nWorkers) + // errChan is to capture the errors from goroutine. + // The channel size is nWorkers to not block any worker if all of them error out. + errChan = make(chan error, nWorkers) + wg = sync.WaitGroup{} + seriesPool = &sync.Pool{ + New: func() interface{} { + return &Series{} + }, + } + ) - la = la[:0] - for _, l := range walSeries.Labels { - la = append(la, client.LabelAdapter{ - Name: string(l.Name), - Value: string(l.Value), - }) - } - series, err := state.createSeriesWithFingerprint(model.Fingerprint(walSeries.Fingerprint), la, nil, true) - if err != nil { - errChan <- err - return - } + reader, closer, err := newWalReader(name) + if err != nil { + return err + } + defer closer.Close() - descs, err := fromWireChunks(walSeries.Chunks) - if err != nil { - errChan <- err - return - } + wg.Add(nWorkers) + for i := 0; i < nWorkers; i++ { + go func() { + processCheckpointRecord(userStates, seriesPool, seriesChan, errChan) + wg.Done() + }() + } - if err := series.setChunks(descs); err != nil { - errChan <- err - return - } + var errFromChan error +Loop: + for reader.Next() { + s := seriesPool.Get().(*Series) + if err := proto.Unmarshal(reader.Record(), s); err != nil { + return err + } - seriesPool.Put(s) - } - }, - ) -} + select { + case errFromChan = <-errChan: + // Exit early on an error. + // Only acts upon the first error received. 
+ break Loop + case seriesChan <- s: + } + } + close(seriesChan) + wg.Wait() -// processWAL processes the records in the WAL concurrently. -func processWAL(name string, userStates *userStates) error { - recordPool := &sync.Pool{ - New: func() interface{} { - return &Record{} - }, - } - return processRecords( - name, recordPool, - func(recordChan chan proto.Message, errChan chan error) { - var err error - var la []client.LabelAdapter - for record := range recordChan { - la, err = processWALRecord(record.(*Record), userStates, la) - recordPool.Put(record) - if err != nil { - errChan <- err - return - } - } - }, - ) + if errFromChan != nil { + return errFromChan + } + select { + case errFromChan = <-errChan: + return errFromChan + default: + if err := reader.Err(); err != nil { + return err + } + } + return nil } -func processWALRecord(record *Record, userStates *userStates, la []client.LabelAdapter) ([]client.LabelAdapter, error) { - state := userStates.getOrCreate(record.UserId) - - // Create the series from labels which do not exist. 
- for _, labels := range record.Labels { - _, ok := state.fpToSeries.get(model.Fingerprint(labels.Fingerprint)) - if ok { - continue - } +func processCheckpointRecord(userStates *userStates, seriesPool *sync.Pool, seriesChan chan *Series, errChan chan error) { + var la []client.LabelAdapter + for s := range seriesChan { + state := userStates.getOrCreate(s.UserId) la = la[:0] - for _, l := range labels.Labels { + for _, l := range s.Labels { la = append(la, client.LabelAdapter{ Name: string(l.Name), Value: string(l.Value), }) } - _, err := state.createSeriesWithFingerprint(model.Fingerprint(labels.Fingerprint), la, nil, true) + series, err := state.createSeriesWithFingerprint(model.Fingerprint(s.Fingerprint), la, nil, true) if err != nil { - return la, err + errChan <- err + return } - } - for _, sample := range record.Samples { - state.fpLocker.Lock(model.Fingerprint(sample.Fingerprint)) - series, ok := state.fpToSeries.get(model.Fingerprint(sample.Fingerprint)) - if !ok { - // This should ideally not happen. - // If the series was not created in recovering checkpoint or - // from the labels of any records previous to this, there - // is no way to get the labels for this fingerprint. - state.fpLocker.Unlock(model.Fingerprint(sample.Fingerprint)) - level.Warn(util.Logger).Log("msg", "series not found for sample during wal recovery", "fingerprint", model.Fingerprint(sample.Fingerprint).String()) - continue + descs, err := fromWireChunks(s.Chunks) + if err != nil { + errChan <- err + return } - err := series.add(model.SamplePair{ - Timestamp: model.Time(sample.Timestamp), - Value: model.SampleValue(sample.Value), - }) - if err != nil { - // We can ignore memorySeriesError because duplicate (or) out-of-order samples are possible - // here because the WAL is not truncated to exactly align with the checkpoint. 
- if _, ok := err.(*memorySeriesError); !ok { - state.fpLocker.Unlock(model.Fingerprint(sample.Fingerprint)) - return la, err - } + if err := series.setChunks(descs); err != nil { + errChan <- err + return } - state.fpLocker.Unlock(model.Fingerprint(sample.Fingerprint)) + + seriesPool.Put(s) } +} - return la, nil +type sampleWithUserID struct { + Sample + userID string } -func processRecords(name string, msgPool *sync.Pool, workerFunc func(chan proto.Message, chan error)) error { +// processWAL processes the records in the WAL concurrently. +func processWAL(name string, userStates *userStates) error { var ( - numProcs = runtime.GOMAXPROCS(0) - msgChan = make(chan proto.Message, 128*numProcs) + wg sync.WaitGroup + nWorkers = runtime.GOMAXPROCS(0) + inputs = make([]chan []sampleWithUserID, nWorkers) + outputs = make([]chan []sampleWithUserID, nWorkers) // errChan is to capture the errors from goroutine. - // The channel size is numProcs to not block any worker if all of them error out. - errChan = make(chan error, numProcs) - wg = sync.WaitGroup{} + // The channel size is nWorkers to not block any worker if all of them error out. + errChan = make(chan error, nWorkers) ) - reader, closer, err := walReader(name) + wg.Add(nWorkers) + for i := 0; i < nWorkers; i++ { + outputs[i] = make(chan []sampleWithUserID, 300) + inputs[i] = make(chan []sampleWithUserID, 300) + + go func(input <-chan []sampleWithUserID, output chan<- []sampleWithUserID) { + processWALSamples(userStates, input, output, errChan) + wg.Done() + }(inputs[i], outputs[i]) + } + + reader, closer, err := newWalReader(name) if err != nil { return err } defer closer.Close() - // Creating workers. - wg.Add(numProcs) - for i := 0; i < numProcs; i++ { - go func() { - defer func() { - wg.Done() - }() - workerFunc(msgChan, errChan) - }() - } - - // Iterating the WAL records. 
- var errFromChan error + var ( + la []client.LabelAdapter + errFromChan error + record = &Record{} + shards = make([][]sampleWithUserID, nWorkers) + ) Loop: for reader.Next() { - msg := msgPool.Get().(proto.Message) - if err := proto.Unmarshal(reader.Record(), msg); err != nil { - return err - } - select { case errFromChan = <-errChan: // Exit early on an error. // Only acts upon the first error received. break Loop - case msgChan <- msg: + default: + } + if err := proto.Unmarshal(reader.Record(), record); err != nil { + return err + } + + if len(record.Labels) > 0 { + state := userStates.getOrCreate(record.UserId) + // Create the series from labels which do not exist. + for _, labels := range record.Labels { + _, ok := state.fpToSeries.get(model.Fingerprint(labels.Fingerprint)) + if ok { + continue + } + + la = la[:0] + for _, l := range labels.Labels { + la = append(la, client.LabelAdapter{ + Name: string(l.Name), + Value: string(l.Value), + }) + } + _, err := state.createSeriesWithFingerprint(model.Fingerprint(labels.Fingerprint), la, nil, true) + if err != nil { + return err + } + } + } + + // We split up the samples into chunks of 5000 samples or less. + // With O(300 * #cores) in-flight sample batches, large scrapes could otherwise + // cause thousands of very large in flight buffers occupying large amounts + // of unused memory. 
+ for len(record.Samples) > 0 { + m := 5000 + if len(record.Samples) < m { + m = len(record.Samples) + } + for i := 0; i < nWorkers; i++ { + var buf []sampleWithUserID + select { + case buf = <-outputs[i]: + default: + } + shards[i] = buf[:0] + } + for _, sam := range record.Samples[:m] { + mod := sam.Fingerprint % uint64(nWorkers) + shards[mod] = append(shards[mod], sampleWithUserID{ + Sample: sam, + userID: record.UserId, + }) + } + for i := 0; i < nWorkers; i++ { + if len(shards[i]) > 0 { + inputs[i] <- shards[i] + } + } + record.Samples = record.Samples[m:] + } + } + + for i := 0; i < nWorkers; i++ { + close(inputs[i]) + for range outputs[i] { } } - close(msgChan) wg.Wait() if errFromChan != nil { @@ -573,10 +618,51 @@ Loop: return err } } + + if err != nil { + return err + } + return nil } -func walReader(name string) (*wal.Reader, io.Closer, error) { +func processWALSamples(userStates *userStates, input <-chan []sampleWithUserID, output chan<- []sampleWithUserID, errChan chan error) { + defer close(output) + + sp := model.SamplePair{} + for samples := range input { + for _, sample := range samples { + state := userStates.getOrCreate(sample.userID) + state.fpLocker.Lock(model.Fingerprint(sample.Fingerprint)) + series, ok := state.fpToSeries.get(model.Fingerprint(sample.Fingerprint)) + if !ok { + // This should ideally not happen. + // If the series was not created in recovering checkpoint or + // from the labels of any records previous to this, there + // is no way to get the labels for this fingerprint. 
+ state.fpLocker.Unlock(model.Fingerprint(sample.Fingerprint)) + level.Warn(util.Logger).Log("msg", "series not found for sample during wal recovery", "userid", sample.userID, "fingerprint", model.Fingerprint(sample.Fingerprint).String()) + continue + } + + sp.Timestamp = model.Time(sample.Timestamp) + sp.Value = model.SampleValue(sample.Value) + err := series.add(sp) + if err != nil { + // We can ignore memorySeriesError because duplicate (or) out-of-order samples are possible + // here because the WAL is not truncated to exactly align with the checkpoint. + if _, ok := err.(*memorySeriesError); !ok { + state.fpLocker.Unlock(model.Fingerprint(sample.Fingerprint)) + errChan <- err + return + } + } + state.fpLocker.Unlock(model.Fingerprint(sample.Fingerprint)) + } + } +} + +func newWalReader(name string) (*wal.Reader, io.Closer, error) { segmentReader, err := wal.NewSegmentsReader(name) if err != nil { return nil, nil, err diff --git a/pkg/ingester/wal_test.go b/pkg/ingester/wal_test.go index b8593f06155..f01838bb55d 100644 --- a/pkg/ingester/wal_test.go +++ b/pkg/ingester/wal_test.go @@ -21,12 +21,13 @@ func TestWAL(t *testing.T) { cfg := defaultIngesterTestConfig() cfg.WALConfig.enabled = true + cfg.WALConfig.recover = true cfg.WALConfig.dir = dirname - cfg.WALConfig.checkpointDuration = 10 * time.Millisecond + cfg.WALConfig.checkpointDuration = 100 * time.Millisecond - numSeries := 10 - numSamplesPerSeriesPerPush := 1000 - numRestarts := 5 + numSeries := 100 + numSamplesPerSeriesPerPush := 10 + numRestarts := 3 // Build an ingester, add some samples, then shut it down. _, ing := newTestStore(t, cfg, defaultClientTestConfig(), defaultLimitsTestConfig()) @@ -34,6 +35,9 @@ func TestWAL(t *testing.T) { ing.Shutdown() for r := 0; r < numRestarts; r++ { + if r == numRestarts-1 { + cfg.WALConfig.enabled = false + } // Start a new ingester and recover the WAL. 
_, ing = newTestStore(t, cfg, defaultClientTestConfig(), defaultLimitsTestConfig()) @@ -42,15 +46,11 @@ func TestWAL(t *testing.T) { } // Check the samples are still there! retrieveTestSamples(t, ing, userIDs, testData) - userIDs, testData = pushTestSamples(t, ing, numSeries, numSamplesPerSeriesPerPush, (r+1)*numSamplesPerSeriesPerPush) - ing.Shutdown() - } - _, ing = newTestStore(t, cfg, defaultClientTestConfig(), defaultLimitsTestConfig()) - defer ing.Shutdown() + if r != numRestarts-1 { + userIDs, testData = pushTestSamples(t, ing, numSeries, numSamplesPerSeriesPerPush, (r+1)*numSamplesPerSeriesPerPush) + } - for i, userID := range userIDs { - testData[userID] = buildTestMatrix(numSeries, (numRestarts+1)*numSamplesPerSeriesPerPush, i) + ing.Shutdown() } - retrieveTestSamples(t, ing, userIDs, testData) } From d954865de7d30a1a97a6dc3209c5b10345d75851 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Fri, 27 Sep 2019 21:01:27 +0530 Subject: [PATCH 28/56] Small enhancements Signed-off-by: Ganesh Vernekar --- pkg/ingester/ingester.go | 2 +- pkg/ingester/wal.go | 30 ++++++++++++++++++------------ pkg/ingester/wal_test.go | 6 ++++-- 3 files changed, 23 insertions(+), 15 deletions(-) diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index c8c6ba3e845..5b3bdefee54 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -272,7 +272,7 @@ func (i *Ingester) Push(ctx old_ctx.Context, req *client.WriteRequest) (*client. } var record *Record - if i.cfg.WALConfig.enabled { + if i.cfg.WALConfig.walEnabled { record = &Record{ UserId: userID, Samples: make([]Sample, 0, len(req.Timeseries)), diff --git a/pkg/ingester/wal.go b/pkg/ingester/wal.go index d3165991556..f4f96c803ea 100644 --- a/pkg/ingester/wal.go +++ b/pkg/ingester/wal.go @@ -28,7 +28,8 @@ import ( // WALConfig is config for the Write Ahead Log. 
type WALConfig struct { - enabled bool + walEnabled bool + checkpointEnabled bool recover bool dir string checkpointDuration time.Duration @@ -37,7 +38,8 @@ type WALConfig struct { // RegisterFlags adds the flags required to config this to the given FlagSet func (cfg *WALConfig) RegisterFlags(f *flag.FlagSet) { - f.BoolVar(&cfg.enabled, "ingester.wal-enable", false, "Enable the WAL.") + f.BoolVar(&cfg.walEnabled, "ingester.wal-enable", false, "Enable the WAL.") + f.BoolVar(&cfg.walEnabled, "ingester.checkpoint-enable", false, "Enable checkpointing.") f.BoolVar(&cfg.recover, "ingester.recover-from-wal", false, "Recover data from existing WAL.") f.StringVar(&cfg.dir, "ingester.wal-dir", "", "Directory to store the WAL.") f.DurationVar(&cfg.checkpointDuration, "ingester.checkpoint-duration", 1*time.Hour, "Duration over which to checkpoint.") @@ -85,7 +87,7 @@ func newWAL(cfg WALConfig, ingester *Ingester) (WAL, error) { level.Info(util.Logger).Log("msg", "recovery from WAL completed", "time", elapsed.String()) } - if !cfg.enabled { + if !cfg.walEnabled { return &noop{}, nil } @@ -188,6 +190,9 @@ func (w *walWrapper) isStopped() bool { const checkpointPrefix = "checkpoint." func (w *walWrapper) checkpoint() (err error) { + if !w.cfg.checkpointEnabled { + return nil + } w.checkpointCreationTotal.Inc() defer func() { if err != nil { @@ -606,6 +611,12 @@ Loop: } } wg.Wait() + // If any worker errored out, some input channels might not be empty. + // Hence drain them. + for i := 0; i < nWorkers; i++ { + for range inputs[i] { + } + } if errFromChan != nil { return errFromChan @@ -633,31 +644,26 @@ func processWALSamples(userStates *userStates, input <-chan []sampleWithUserID, for samples := range input { for _, sample := range samples { state := userStates.getOrCreate(sample.userID) - state.fpLocker.Lock(model.Fingerprint(sample.Fingerprint)) series, ok := state.fpToSeries.get(model.Fingerprint(sample.Fingerprint)) if !ok { // This should ideally not happen. 
// If the series was not created in recovering checkpoint or // from the labels of any records previous to this, there // is no way to get the labels for this fingerprint. - state.fpLocker.Unlock(model.Fingerprint(sample.Fingerprint)) level.Warn(util.Logger).Log("msg", "series not found for sample during wal recovery", "userid", sample.userID, "fingerprint", model.Fingerprint(sample.Fingerprint).String()) continue } sp.Timestamp = model.Time(sample.Timestamp) sp.Value = model.SampleValue(sample.Value) - err := series.add(sp) - if err != nil { - // We can ignore memorySeriesError because duplicate (or) out-of-order samples are possible - // here because the WAL is not truncated to exactly align with the checkpoint. - if _, ok := err.(*memorySeriesError); !ok { - state.fpLocker.Unlock(model.Fingerprint(sample.Fingerprint)) + // There can be many out of order samples because of checkpoint and WAL overlap. + // Checking this beforehand avoids the allocation of lots of error messages. + if sp.Timestamp.After(series.lastTime) { + if err := series.add(sp); err != nil { errChan <- err return } } - state.fpLocker.Unlock(model.Fingerprint(sample.Fingerprint)) } } } diff --git a/pkg/ingester/wal_test.go b/pkg/ingester/wal_test.go index f01838bb55d..c0baa0989a9 100644 --- a/pkg/ingester/wal_test.go +++ b/pkg/ingester/wal_test.go @@ -20,7 +20,8 @@ func TestWAL(t *testing.T) { require.NoError(t, err) cfg := defaultIngesterTestConfig() - cfg.WALConfig.enabled = true + cfg.WALConfig.walEnabled = true + cfg.WALConfig.checkpointEnabled = true cfg.WALConfig.recover = true cfg.WALConfig.dir = dirname cfg.WALConfig.checkpointDuration = 100 * time.Millisecond @@ -36,7 +37,8 @@ func TestWAL(t *testing.T) { for r := 0; r < numRestarts; r++ { if r == numRestarts-1 { - cfg.WALConfig.enabled = false + cfg.WALConfig.walEnabled = false + cfg.WALConfig.checkpointEnabled = false } // Start a new ingester and recover the WAL. 
_, ing = newTestStore(t, cfg, defaultClientTestConfig(), defaultLimitsTestConfig()) From fb75c9ccca878fa771475b776d7e45a3037250b3 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Mon, 30 Sep 2019 14:40:08 +0530 Subject: [PATCH 29/56] Cache the user states and series when processing samples Signed-off-by: Ganesh Vernekar --- pkg/ingester/wal.go | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/pkg/ingester/wal.go b/pkg/ingester/wal.go index f4f96c803ea..28315c17d10 100644 --- a/pkg/ingester/wal.go +++ b/pkg/ingester/wal.go @@ -640,18 +640,28 @@ Loop: func processWALSamples(userStates *userStates, input <-chan []sampleWithUserID, output chan<- []sampleWithUserID, errChan chan error) { defer close(output) + stateCache := make(map[string]*userState) + seriesCache := make(map[string]map[uint64]*memorySeries) sp := model.SamplePair{} for samples := range input { for _, sample := range samples { - state := userStates.getOrCreate(sample.userID) - series, ok := state.fpToSeries.get(model.Fingerprint(sample.Fingerprint)) + state, ok := stateCache[sample.userID] if !ok { - // This should ideally not happen. - // If the series was not created in recovering checkpoint or - // from the labels of any records previous to this, there - // is no way to get the labels for this fingerprint. - level.Warn(util.Logger).Log("msg", "series not found for sample during wal recovery", "userid", sample.userID, "fingerprint", model.Fingerprint(sample.Fingerprint).String()) - continue + state = userStates.getOrCreate(sample.userID) + stateCache[sample.userID] = state + seriesCache[sample.userID] = make(map[uint64]*memorySeries) + } + series, ok := seriesCache[sample.userID][sample.Fingerprint] + if !ok { + series, ok = state.fpToSeries.get(model.Fingerprint(sample.Fingerprint)) + if !ok { + // This should ideally not happen. 
+ // If the series was not created in recovering checkpoint or + // from the labels of any records previous to this, there + // is no way to get the labels for this fingerprint. + level.Warn(util.Logger).Log("msg", "series not found for sample during wal recovery", "userid", sample.userID, "fingerprint", model.Fingerprint(sample.Fingerprint).String()) + continue + } } sp.Timestamp = model.Time(sample.Timestamp) From 51beb93c0e6e9586d912b66c32c6a761e96a827e Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Mon, 30 Sep 2019 15:22:49 +0530 Subject: [PATCH 30/56] Enhancement in the user state cache and fix in samples buffer Signed-off-by: Ganesh Vernekar --- pkg/ingester/wal.go | 59 ++++++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 28 deletions(-) diff --git a/pkg/ingester/wal.go b/pkg/ingester/wal.go index 28315c17d10..f8297413875 100644 --- a/pkg/ingester/wal.go +++ b/pkg/ingester/wal.go @@ -321,7 +321,7 @@ func (w *walWrapper) deleteCheckpoints(maxIndex int) (err error) { // checkpointSeries write the chunks of the series to the checkpoint. func (w *walWrapper) checkpointSeries(cp *wal.WAL, userID string, fp model.Fingerprint, series *memorySeries, wireChunks []client.Chunk) ([]client.Chunk, error) { var err error - wireChunks, err = toWireChunks(series.chunkDescs, wireChunks) + wireChunks, err = toWireChunks(series.chunkDescs, wireChunks[:0]) if err != nil { return wireChunks, err } @@ -496,9 +496,9 @@ func processCheckpointRecord(userStates *userStates, seriesPool *sync.Pool, seri } } -type sampleWithUserID struct { - Sample - userID string +type samplesWithUserID struct { + samples []Sample + userID string } // processWAL processes the records in the WAL concurrently. 
@@ -506,8 +506,8 @@ func processWAL(name string, userStates *userStates) error { var ( wg sync.WaitGroup nWorkers = runtime.GOMAXPROCS(0) - inputs = make([]chan []sampleWithUserID, nWorkers) - outputs = make([]chan []sampleWithUserID, nWorkers) + inputs = make([]chan *samplesWithUserID, nWorkers) + outputs = make([]chan *samplesWithUserID, nWorkers) // errChan is to capture the errors from goroutine. // The channel size is nWorkers to not block any worker if all of them error out. errChan = make(chan error, nWorkers) @@ -515,10 +515,10 @@ func processWAL(name string, userStates *userStates) error { wg.Add(nWorkers) for i := 0; i < nWorkers; i++ { - outputs[i] = make(chan []sampleWithUserID, 300) - inputs[i] = make(chan []sampleWithUserID, 300) + outputs[i] = make(chan *samplesWithUserID, 300) + inputs[i] = make(chan *samplesWithUserID, 300) - go func(input <-chan []sampleWithUserID, output chan<- []sampleWithUserID) { + go func(input <-chan *samplesWithUserID, output chan<- *samplesWithUserID) { processWALSamples(userStates, input, output, errChan) wg.Done() }(inputs[i], outputs[i]) @@ -534,7 +534,7 @@ func processWAL(name string, userStates *userStates) error { la []client.LabelAdapter errFromChan error record = &Record{} - shards = make([][]sampleWithUserID, nWorkers) + shards = make([]*samplesWithUserID, nWorkers) ) Loop: for reader.Next() { @@ -582,22 +582,23 @@ Loop: m = len(record.Samples) } for i := 0; i < nWorkers; i++ { - var buf []sampleWithUserID select { - case buf = <-outputs[i]: + case buf := <-outputs[i]: + buf.samples = buf.samples[:0] + buf.userID = record.UserId + shards[i] = buf default: + shards[i] = &samplesWithUserID{ + userID: record.UserId, + } } - shards[i] = buf[:0] } for _, sam := range record.Samples[:m] { mod := sam.Fingerprint % uint64(nWorkers) - shards[mod] = append(shards[mod], sampleWithUserID{ - Sample: sam, - userID: record.UserId, - }) + shards[mod].samples = append(shards[mod].samples, sam) } for i := 0; i < nWorkers; i++ { - 
if len(shards[i]) > 0 { + if len(shards[i].samples) > 0 { inputs[i] <- shards[i] } } @@ -637,21 +638,22 @@ Loop: return nil } -func processWALSamples(userStates *userStates, input <-chan []sampleWithUserID, output chan<- []sampleWithUserID, errChan chan error) { +func processWALSamples(userStates *userStates, input <-chan *samplesWithUserID, output chan<- *samplesWithUserID, errChan chan error) { defer close(output) stateCache := make(map[string]*userState) seriesCache := make(map[string]map[uint64]*memorySeries) sp := model.SamplePair{} for samples := range input { - for _, sample := range samples { - state, ok := stateCache[sample.userID] - if !ok { - state = userStates.getOrCreate(sample.userID) - stateCache[sample.userID] = state - seriesCache[sample.userID] = make(map[uint64]*memorySeries) - } - series, ok := seriesCache[sample.userID][sample.Fingerprint] + state, ok := stateCache[samples.userID] + if !ok { + state = userStates.getOrCreate(samples.userID) + stateCache[samples.userID] = state + seriesCache[samples.userID] = make(map[uint64]*memorySeries) + } + sc := seriesCache[samples.userID] + for _, sample := range samples.samples { + series, ok := sc[sample.Fingerprint] if !ok { series, ok = state.fpToSeries.get(model.Fingerprint(sample.Fingerprint)) if !ok { @@ -659,7 +661,7 @@ func processWALSamples(userStates *userStates, input <-chan []sampleWithUserID, // If the series was not created in recovering checkpoint or // from the labels of any records previous to this, there // is no way to get the labels for this fingerprint. 
- level.Warn(util.Logger).Log("msg", "series not found for sample during wal recovery", "userid", sample.userID, "fingerprint", model.Fingerprint(sample.Fingerprint).String()) + level.Warn(util.Logger).Log("msg", "series not found for sample during wal recovery", "userid", samples.userID, "fingerprint", model.Fingerprint(sample.Fingerprint).String()) continue } } @@ -675,6 +677,7 @@ func processWALSamples(userStates *userStates, input <-chan []sampleWithUserID, } } } + output <- samples } } From 47a84345fe3af92cf04c41f3c0437300acc82723 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Tue, 1 Oct 2019 13:53:19 +0530 Subject: [PATCH 31/56] Cache user states and series right from the checkpoint Signed-off-by: Ganesh Vernekar --- pkg/ingester/wal.go | 75 ++++++++++++++++++++++++++++++--------------- 1 file changed, 51 insertions(+), 24 deletions(-) diff --git a/pkg/ingester/wal.go b/pkg/ingester/wal.go index f8297413875..99fe15dbb12 100644 --- a/pkg/ingester/wal.go +++ b/pkg/ingester/wal.go @@ -349,11 +349,19 @@ func recoverFromWAL(ingester *Ingester) (err error) { return err } + nWorkers := runtime.GOMAXPROCS(0) + stateCache := make([]map[string]*userState, nWorkers) + seriesCache := make([]map[string]map[uint64]*memorySeries, nWorkers) + for i := 0; i < nWorkers; i++ { + stateCache[i] = make(map[string]*userState) + seriesCache[i] = make(map[string]map[uint64]*memorySeries) + } + if idx >= 0 { // Checkpoint exists. 
level.Info(util.Logger).Log("msg", "recovering from checkpoint", "checkpoint", lastCheckpointDir) start := time.Now() - if err := processCheckpoint(lastCheckpointDir, userStates); err != nil { + if err := processCheckpoint(lastCheckpointDir, userStates, nWorkers, stateCache, seriesCache); err != nil { return err } elapsed := time.Since(start) @@ -369,7 +377,7 @@ func recoverFromWAL(ingester *Ingester) (err error) { level.Info(util.Logger).Log("msg", "recovering from segments", "dir", walDir) start := time.Now() - if err := processWAL(walDir, userStates); err != nil { + if err := processWAL(walDir, userStates, nWorkers, stateCache, seriesCache); err != nil { return err } elapsed := time.Since(start) @@ -401,10 +409,10 @@ func segmentsExist(dir string) (bool, error) { } // processCheckpoint loads the chunks of the series present in the last checkpoint. -func processCheckpoint(name string, userStates *userStates) error { +func processCheckpoint(name string, userStates *userStates, nWorkers int, + stateCache []map[string]*userState, seriesCache []map[string]map[uint64]*memorySeries) error { var ( - nWorkers = runtime.GOMAXPROCS(0) - seriesChan = make(chan *Series, 128*nWorkers) + inputs = make([]chan *Series, nWorkers) // errChan is to capture the errors from goroutine. // The channel size is nWorkers to not block any worker if all of them error out. 
errChan = make(chan error, nWorkers) @@ -424,10 +432,11 @@ func processCheckpoint(name string, userStates *userStates) error { wg.Add(nWorkers) for i := 0; i < nWorkers; i++ { - go func() { - processCheckpointRecord(userStates, seriesPool, seriesChan, errChan) + inputs[i] = make(chan *Series, 300) + go func(input <-chan *Series, stateCache map[string]*userState, seriesCache map[string]map[uint64]*memorySeries) { + processCheckpointRecord(userStates, seriesPool, stateCache, seriesCache, input, errChan) wg.Done() - }() + }(inputs[i], stateCache[i], seriesCache[i]) } var errFromChan error @@ -443,10 +452,14 @@ Loop: // Exit early on an error. // Only acts upon the first error received. break Loop - case seriesChan <- s: + default: + mod := s.Fingerprint % uint64(nWorkers) + inputs[mod] <- s } } - close(seriesChan) + for i := 0; i < nWorkers; i++ { + close(inputs[i]) + } wg.Wait() if errFromChan != nil { @@ -463,10 +476,16 @@ Loop: return nil } -func processCheckpointRecord(userStates *userStates, seriesPool *sync.Pool, seriesChan chan *Series, errChan chan error) { +func processCheckpointRecord(userStates *userStates, seriesPool *sync.Pool, stateCache map[string]*userState, + seriesCache map[string]map[uint64]*memorySeries, seriesChan <-chan *Series, errChan chan error) { var la []client.LabelAdapter for s := range seriesChan { - state := userStates.getOrCreate(s.UserId) + state, ok := stateCache[s.UserId] + if !ok { + state = userStates.getOrCreate(s.UserId) + stateCache[s.UserId] = state + seriesCache[s.UserId] = make(map[uint64]*memorySeries) + } la = la[:0] for _, l := range s.Labels { @@ -492,6 +511,7 @@ func processCheckpointRecord(userStates *userStates, seriesPool *sync.Pool, seri return } + seriesCache[s.UserId][s.Fingerprint] = series seriesPool.Put(s) } } @@ -502,26 +522,29 @@ type samplesWithUserID struct { } // processWAL processes the records in the WAL concurrently. 
-func processWAL(name string, userStates *userStates) error { +func processWAL(name string, userStates *userStates, nWorkers int, + stateCache []map[string]*userState, seriesCache []map[string]map[uint64]*memorySeries) error { var ( - wg sync.WaitGroup - nWorkers = runtime.GOMAXPROCS(0) - inputs = make([]chan *samplesWithUserID, nWorkers) - outputs = make([]chan *samplesWithUserID, nWorkers) + wg sync.WaitGroup + inputs = make([]chan *samplesWithUserID, nWorkers) + outputs = make([]chan *samplesWithUserID, nWorkers) // errChan is to capture the errors from goroutine. // The channel size is nWorkers to not block any worker if all of them error out. errChan = make(chan error, nWorkers) + shards = make([]*samplesWithUserID, nWorkers) ) wg.Add(nWorkers) for i := 0; i < nWorkers; i++ { outputs[i] = make(chan *samplesWithUserID, 300) inputs[i] = make(chan *samplesWithUserID, 300) + shards[i] = &samplesWithUserID{} - go func(input <-chan *samplesWithUserID, output chan<- *samplesWithUserID) { - processWALSamples(userStates, input, output, errChan) + go func(input <-chan *samplesWithUserID, output chan<- *samplesWithUserID, + stateCache map[string]*userState, seriesCache map[string]map[uint64]*memorySeries) { + processWALSamples(userStates, stateCache, seriesCache, input, output, errChan) wg.Done() - }(inputs[i], outputs[i]) + }(inputs[i], outputs[i], stateCache[i], seriesCache[i]) } reader, closer, err := newWalReader(name) @@ -534,7 +557,6 @@ func processWAL(name string, userStates *userStates) error { la []client.LabelAdapter errFromChan error record = &Record{} - shards = make([]*samplesWithUserID, nWorkers) ) Loop: for reader.Next() { @@ -582,6 +604,12 @@ Loop: m = len(record.Samples) } for i := 0; i < nWorkers; i++ { + if len(shards[i].samples) == 0 { + // It is possible that the previous iteration did not put + // anything in this shard. In that case no need to get a new buffer. 
+ shards[i].userID = record.UserId + continue + } select { case buf := <-outputs[i]: buf.samples = buf.samples[:0] @@ -638,11 +666,10 @@ Loop: return nil } -func processWALSamples(userStates *userStates, input <-chan *samplesWithUserID, output chan<- *samplesWithUserID, errChan chan error) { +func processWALSamples(userStates *userStates, stateCache map[string]*userState, seriesCache map[string]map[uint64]*memorySeries, + input <-chan *samplesWithUserID, output chan<- *samplesWithUserID, errChan chan error) { defer close(output) - stateCache := make(map[string]*userState) - seriesCache := make(map[string]map[uint64]*memorySeries) sp := model.SamplePair{} for samples := range input { state, ok := stateCache[samples.userID] From 248566aeaceed89c1c8297ae97760d06ad648047 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Wed, 2 Oct 2019 20:25:59 +0530 Subject: [PATCH 32/56] Small enhancements Signed-off-by: Ganesh Vernekar --- pkg/ingester/wal.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pkg/ingester/wal.go b/pkg/ingester/wal.go index 99fe15dbb12..f1967a8753e 100644 --- a/pkg/ingester/wal.go +++ b/pkg/ingester/wal.go @@ -679,22 +679,22 @@ func processWALSamples(userStates *userStates, stateCache map[string]*userState, seriesCache[samples.userID] = make(map[uint64]*memorySeries) } sc := seriesCache[samples.userID] - for _, sample := range samples.samples { - series, ok := sc[sample.Fingerprint] + for i := range samples.samples { + series, ok := sc[samples.samples[i].Fingerprint] if !ok { - series, ok = state.fpToSeries.get(model.Fingerprint(sample.Fingerprint)) + series, ok = state.fpToSeries.get(model.Fingerprint(samples.samples[i].Fingerprint)) if !ok { // This should ideally not happen. // If the series was not created in recovering checkpoint or // from the labels of any records previous to this, there // is no way to get the labels for this fingerprint. 
- level.Warn(util.Logger).Log("msg", "series not found for sample during wal recovery", "userid", samples.userID, "fingerprint", model.Fingerprint(sample.Fingerprint).String()) + level.Warn(util.Logger).Log("msg", "series not found for sample during wal recovery", "userid", samples.userID, "fingerprint", model.Fingerprint(samples.samples[i].Fingerprint).String()) continue } } - sp.Timestamp = model.Time(sample.Timestamp) - sp.Value = model.SampleValue(sample.Value) + sp.Timestamp = model.Time(samples.samples[i].Timestamp) + sp.Value = model.SampleValue(samples.samples[i].Value) // There can be many out of order samples because of checkpoint and WAL overlap. // Checking this beforehand avoids the allocation of lots of error messages. if sp.Timestamp.After(series.lastTime) { From f1475704fa46289bd7cfae920567fcf15274a50d Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Thu, 3 Oct 2019 15:29:10 +0530 Subject: [PATCH 33/56] Fix the flag Signed-off-by: Ganesh Vernekar --- pkg/ingester/wal.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pkg/ingester/wal.go b/pkg/ingester/wal.go index f1967a8753e..58289eeb5da 100644 --- a/pkg/ingester/wal.go +++ b/pkg/ingester/wal.go @@ -39,7 +39,7 @@ type WALConfig struct { // RegisterFlags adds the flags required to config this to the given FlagSet func (cfg *WALConfig) RegisterFlags(f *flag.FlagSet) { f.BoolVar(&cfg.walEnabled, "ingester.wal-enable", false, "Enable the WAL.") - f.BoolVar(&cfg.walEnabled, "ingester.checkpoint-enable", false, "Enable checkpointing.") + f.BoolVar(&cfg.checkpointEnabled, "ingester.checkpoint-enable", false, "Enable checkpointing.") f.BoolVar(&cfg.recover, "ingester.recover-from-wal", false, "Recover data from existing WAL.") f.StringVar(&cfg.dir, "ingester.wal-dir", "", "Directory to store the WAL.") f.DurationVar(&cfg.checkpointDuration, "ingester.checkpoint-duration", 1*time.Hour, "Duration over which to checkpoint.") @@ -170,9 +170,9 @@ func (w *walWrapper) run() { 
elapsed := time.Since(start) level.Info(util.Logger).Log("msg", "checkpoint done", "time", elapsed.String()) case <-w.quit: - if err := w.checkpoint(); err != nil { - level.Error(util.Logger).Log("msg", "error checkpointing series during shutdown", "err", err) - } + // if err := w.checkpoint(); err != nil { + // level.Error(util.Logger).Log("msg", "error checkpointing series during shutdown", "err", err) + // } return } } From bc8a1942762304ac064759437db0c60b005f962d Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Thu, 10 Oct 2019 17:25:54 +0530 Subject: [PATCH 34/56] Remove test files Signed-off-by: Ganesh Vernekar --- pkg/ingester/cortex-wal/00000005 | Bin 229376 -> 0 bytes pkg/ingester/cortex-wal/00000006 | 0 .../cortex-wal/checkpoint.001065/00000000 | Bin 32768 -> 0 bytes .../cortex-wal/checkpoint.001066/00000000 | Bin 32768 -> 0 bytes pkg/ingester/wal.go | 6 +++--- 5 files changed, 3 insertions(+), 3 deletions(-) delete mode 100644 pkg/ingester/cortex-wal/00000005 delete mode 100644 pkg/ingester/cortex-wal/00000006 delete mode 100644 pkg/ingester/cortex-wal/checkpoint.001065/00000000 delete mode 100644 pkg/ingester/cortex-wal/checkpoint.001066/00000000 diff --git a/pkg/ingester/cortex-wal/00000005 b/pkg/ingester/cortex-wal/00000005 deleted file mode 100644 index f9735cb19995de241d7f4cd6810e8123228c6ada..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 229376 zcmeI52YeLO_Q!|k@_*7IFX(7^RxA_=YV3&E6-5-WqoUXwcE$2kRK!9Hgn%GQ5s@NN zLQ{$g7Hl*Td%=Pg1Vt$#{(s+1$erEnF0-52C6MdqWBAU_%$+-D&iwYy-E+=Vzkltg z``>bP!I14*SFdtF-5s}GKl1C@(>D|>omZvyqxEx~Dpj_9wCLzVavbNeS~<>R%>|DO z9tXM#x&qw<-GJ_b?m!Pg51^-@Cy+152YLy50lfvifdWAR&_~b*C=?U|eFc4ieu93$ z6M`pz{(}C%0Kov@Nx_rAK*2!ZDZx{~(}JggL4rZRV8LKuh+qgXR4^2HM(_+UOfU?1 zR`4t^TreDXPVgKsLNEdtDHsWi5{v>y3q}La3!VqY2*v;}2wnij3dREC1ml45g7Lrv z!33a4Py|dAOavwgCIOQLlYuFMDZo_0RNzIyi@-~Qmw;)4X~1;BbYO;H1~5}F6L?wh zGVqGv72s9DtH3P5EMT@^Ht?F@HQ;r@>%bd=H-I+9>NFTr2Hdck_&Z^7Td2Ehj4AHhF@ 
zP^Sj&)EXgYV$-QLoCow_71ll|_!sbyU>l&bAOt)ts0wru{2%a$;NQTbf^7jLs2YF- zRR@ru8UPYh6F`D$0Z335fCOy^AVIYOB3AVG%!NYJ4G5_A}V1RV|_K}`W9=m-D_Y6c)dM*>Jta{viC3P6I629Tg*03_&G z00}w{K!T13kf0L)BU4nTsg z2auo+01|WqfCSwLAVD_)NYKpy5_AiI1lI=D^51>E;02Js+00kNdpg>OnDA3aY3N#2nfd&I8&=3Fx8VaC5 z&j2XUFaQO57C?c911QjQ017k$K!HXAD9|VX1sV;YK+gjx&=>#(dI3Oz#sVnNH~F302HVQK!GL#D9|JT1)2iF#8-N1+4xm7*02JsC00sIJK!H{R zD9{=J1zHQBK7C?Zi0SHia00F82AV4(%1gI8(0A&FP&~^X60XlDQc+66#>b_Eci-2en= zcK`v}13-Z40SM5Zg3z!=*L1dHAQb2r?1BOv3!p&90VvS%019*hfC8Nepg<=9C{POk z1v(i(fldKXpq2m%bSi)XwE|F}(*P9cbN~fv4WK}004Pu!00lY|K!MHzP@uB`6zCiP z1v(c%fzAU^pz{F~s4aj3T>zj!7Xm2IMF0xa4nTn}22h|&02HV_fC60#pg@-aDA45q z3Umd40$mB9Kvw}M(A5A6bPa$4T??Q<*8wQd^#BUg0YHIn08pSC0Tk#a00p`kK!I)n zP@r1@6zDbp1-czTf$jiMppF0vbSHoU-36dPcLONUJpc;yKL7=~7eImT15lv*1))y+ zy6d;KV<05xDGY-IJq;j1g8(FGFn|ON0g#}f021^JFibEEcvkQ%FkCPkcuw#fFhVc_ z7%3PDj1r6jMhiv*&kLRh#t6m$F9==$#tOy);{@Y?@q+Qd1i=KLNKgb!6ifsr2_^xP z1(Sg(f+@gM!BpTy!Hd94f|r14f@#2X!E|7TU3P6FD04UJc01EUCfC7CBpg`XND9};>1^OO9fqno`pdSGg=qCUL`WZlhegROR zWdI7a96*771yG5m0tiqe00G(;K!6$p2v9D705t&+p#1;@Xnz0!Isia` z@&E+rKmY+c2ta@i1`wb_00ihz00BA-K!6Sh5TK?20(1m`05t;;pd$eUs5yWD9R(mj zM*|4ZF#rN|EPwzV2OvPl0|?Lw00ML(fB>BYAV4hu1n6V{0XhXhfLa0w(5V0d)Cxd= zP6H62(*XpiHGlw}0U$ta00ihv00BA+K!DB$5TJ7a1n67<0Xh#rfX)XHptb-4bOC?> zT?imR7Xb)RI{*Q?7(jq70T7_}00ML=fB;Pp2LJ)O0YHFm1Q4K`00ih}00FuMK!9!q5TM%t1n71E0lEV~fI0#Q z(47DRbQgdC-3=f>_W%gc{{RH&UH}2Q4?uwK7lbz)N`2Y&E06>8j0w~b?01C7SK!H90P@u&C3iKg>0(}Ia zKpz7r&?f*2^eKP>eFmUFp93h+7XS+MC4d5b1)xAn02JtJ00sI6K!LsmP@wMs6lf`c z0(}plKtBK|(2oEL^b>#r{S2T$zW^xEG5`fy4xm840w~Z500mkJpg_L?DA4Z!3bYD9 zf&Ktcpg#c=Xf=QWtpQM=wEzmV4nTqa0#Km!01EUsfC6m*P@sPVp-xT0L!c>k2($}! 
zL4bAz5TM-v1ZZ~v0onsVfa(DV(4GJSv=@K??F}G6`v3?~eEn@*AbP)M8bE-~01%)y00ML-fB>BZAV6mW z2+%nI0(35b0G$UQK<5JpP+I^2x&T0cE(8#uivR?u9e@B`3?M+400>Zf00FubK!7d- z5TMHe1n3F?0lE@EfUW`%psN7{=o$b4x)wlyt^*LD>j4C)1AqYC03bj&R?XSzv%ZR< znb|vQMEgT}w%;Wu@t;tBeRog|H{OPtg}v9T)C_E%SxVe+c&%=&C1wZRR>|$2_z2GU z+>V~ci)!ULMcI3{-P9RxqS04!PcV%}Z7b*YTH{S$ry~a*Yg&}^Zo&p~)V`kWKibsx z;XcY4y$4w$TP$gQUx;;iazg#zJkJR)g*W&6$620RN8fIBsuVZ0!dG8>H12H3ZRf`G zIUoC{xJjt6f%|t&bKr{Z&z_!6O4;dc-aA^}?6l6lvvq=- zE+Bt4r+`13rC6Pe`x*Y6;;!-A+>o0ebenMx7HdVQUqg3-$E#y)6aGv?AFoIZ_L?yR z+T46}HM)_fA&dS`O^*`!a@_NdwZfU}RiI>5IaZ5NpGlFuH;NS5Efu^HMK00js2vk8 zMk;dRZ4<9`)_X0Gc-xA!))`#4#d*%|*>hT!hz_o+(RwQ1Z?Vp+?6AT$7cO-fj&-Q= zeCEt=^^ECQtd=5$H`W$$KXaCNb+R0S8}|rX{fe-gy2f+TJ@)k+s#sqy$il{Xo7K>f zPO8T=j|iHM$x$yi-8Y~6puilpL|1!N<)GyhJ(r=H`MI+>3> zlG?->nma2ktvf}zBTd5$2+Wd?9%_!l^WExWhZHw)7Ur&U+GLL^JG@Pk%W8HYNlPrx zj3rVuVAO7v*j+WJ->o|4$kTLMa@RP@h=~{FvN3Pj%sH&4v0Az@BuP_O9+lU$Ri0)R z#uZOdBt9pHK>=Erl3y0U+)O0H|tzVsFPRg8Yg(GKt zhr>-r^`k@i1xvF~@#j_3VgP4xt9?8tk?K6Er-`8Eop2k|Y+6pfay06UuUoyXv1Fv9 zCarYFW(lWE&1OC*a+)de4*H6oenehPEvN_-_t};4x6~&Ti@Ey;D&B~U1P`q z&rq{V-Y1_3X{`_*>-StE<<-Yto0E%MqBY25qy**`-*m zOHw_*rN-JK?q`Vxj&XOgD`Ni;=nSn}5dYaJd8=(9bM%Q%%vtOmWhdS?YHeW7Sv>8T zT{`NBI!@We+6#v(x@51Jy(!VMdS6PkG*~_9Xq9dWuc)SwHHD!B61LX3+LDknx#^PP z{OKx$xICz~{{=iG*aqk<2mucZssddE{|7uG_&4yVU|ZlZK{eoUL3N<3pa#%QP!s4b zs0H*8WC1+|+X4B4+CVSCe}LYCY@k3;2k0Z%9w-#-0Q42OKtI8bz!QRHAaeEz&b|u}Ir|M9wY* zh@5>NAaZsQK;-NP0Fkqc0U~EV1c;pd2q1FyV}QunPXHokKLv=K{R|*-_H%&9*)ISh zXTJo9oc#(Qa&`$oB*CxFP= zp8+Cge*uV`T?P<2yBr{L_E!J}S^=OyD*+VfHvk3t9YBFr0VvQP01EUcfC8-s0viL_ z(0y6c4uNC@1jr77WQ0mI1ge^|{p3Lu#LV7i?@-tJZoL|zu0>h(YV4v~PqIWeq=x5} z?f}m#Jpi6pdICJJKn014U)K!WxLkf414B&a@s1T_GVpoRbv)CfR=_63lj#sCtO3m`#F z03>KX014V3K!Oeckf1yO2|5r!f(`8>*S__~+>i`t!FMvtg^}yePzkv;c z4ZuHwe+0!vFC8a)M2kip_Oi9Vu}wO_10_F-%1Lg@%2j&Y}0}BLa01E|e zfcFJw0*eG^0Urp?26z-Z2e9KH86l(NAY1qnzq8{Y8NrT&N)+frT*8inWCWhX{=gbM zi7^gRpiQIQb}AcgqnY0I-VnC*rW1zp8@si$T>sHPv7^0WoksYx%9Ky2$ZtTTT5%0@OS3gD@2*gh}id@WPLDDH86tjMe<^UJ>*Pl*HZWFM!T 
z{3aDN+c1$Ao0HTdl*e!7#eOny%vHE73DOM`D;%*)j2bPq$ zr51M!(0ff&W&PM=@?`5?c$!UA2+J0AGloi$(pyTvN2dZQ>gMA(em5zbx2UnIr=Xi# z`u*m&#qf6N%CwCfr^f1%zTJ~c4ck=yqQ;!e=%}=8ta@Z(PzqGTzS4*>XZy1 zJWHtyZ_p;RIgFbuQ5>|%Q*5u2<}hv<4qB4O_~s=^&Pc}cg-9Ju?~JR83N1)u0k>GE zSMJZ8q7K)&Mp={9Qg89L8uzcAcExx93E-l|J*$!$ZfCYBbC5u1sXPYR>T9^A;;{-D zU+3;RyUaz1d)?{nQ! z&5G({t5$pcSL)qYgW@AoC3W)qiN6fIXUJ} z_^Y~ZumjIM*X%A4kt_PM0IsnTBHvqp1ZUj~cZUR;cZU#1UCZx>(;3C*j zkc?nQK{A3J1<45HAZ2V6)HwX!=gIcH&$bwZ+^#8dY==NH0x3x6VGUBT&j&scv;{sD zTmVqL?n2;mfgJ+L2zO%*QnBv=NX7mi@VDSzV1wX3fK=@J1-5>st%&uaeBni$!VZCC z1Um$h5i&Xivh_1%ah}2K5J*O_Lm(O9T;#aY4}q%YgqDu0x9sFlkNOnN>rs@|fwG!C zWuTrIC|{5d^b+&}dJB331%d*gkDw1wC@2K_3i<;51pR;~1Wy3{1^s~mf&svjf+vB2 zf`NdYERhlHNI^!hBLx{Dqa%d|s;*mHHcJ0KpgXb(cjSYDe*q5(wgEZ|LcqgL}&*95pn@UXh#4M+6h2}asWiAE`SLA7eIt|2JC1-MnHt@XhBA>qXik^Htq|4 zx4s>qX3HJGYC%U}jo?mTt>7+To#1Z3=Y)x?`Y%`2j0#Ik-(sk+q1)Y4=*4Hww|+AgI z)vW+>4)vCdQ;Xl3t)E%*lyjJp6%@~yj+z*;&NzYOq zGDgmNZz4#>4r>lnUYB)CFO-Uxd$L;15mKa0Ijl7If-KrjPIFW2P2AlgGZLw#<(68+ z?3AYLE^`1MRGgLAyjStpmX96cZj#}g#Hne_Nvupo;9_0Rl7z%jt;^)XrO9P|7928t zRdm%7H^wbxw(v`lZtHiiG}lYcsYrOFk@7QunP}b+*k*Z(^cc zcUG*n)Y%2IXcH98{lEj6XUSh0 zcvVVYpioSRo~Y2qUu{gR^jOIrj&WhZV`9?isnVm_*T!F?xA%6tGEck4B+3JC*4A8( z$dN!foYWj47>b;Diq)nhHMydKrjZIZOKLi{B~m4=)t2~DIg2|S?v{SvC*C${ZD7t> zJR_Q&S}&qHm5D`7ENv72xjYj+Nd6d3aP`>|h<5V*irT{#X50Adc9daKTnyJIa=Ehz z7uu|~){;5{xoewnx5IlS{dMmxCs^4Fvo%|utB@1@w|p{1S=RO1KRiD~y_R3Krb?Ba ze=S+DLryqhdIcdWBuuXaNSOW&;MK9e0~w9LYtnFJ$eGe~WEC5JhwW3U*25y~&h8bl1gjo8=Ke?b!d0{6mL!B}9NU>q=B zFdmp7m;e+BihzlNiNGYmBw(^&GB8Cj1(+(B3fSB}8DX)ku^4OEhDJx5E|C#n$FH%d>1~euYtXR z;evgD=LGeE5rPK5NI^qjl%NqXTCgwhyr3~KMvx2e7}f;fF>F78$FThY9>Wd*cnr$} zcnmub;4$nVfXA?d0UpB+0eB2M6yP!JFo4If!vUMKC?lXiHfK>rNPo^^$r#Aix#Z@v zW1tcRdI@Xr6gCauDQr5xQ`iiEr?8m-m3ChSsI>bEz_jhFz%0QmV76d3@S5N?;B~?4 zz#D=$fHwti0&@g&fVTv10dEW52Hp|81H3DE7kE$b9xzuh7nmoQ2h11D2Nnnx01E{R zf%gUP1B(QUfDZ&80CplsM%X+*95v=C%*h_nqS1!aRA8rqWQ6jSZm_j3Z3Qd)dzfycN-+#lkW!;_cT6h4DQe?PrWJ$yYGcIpPEBdn82{e6q|T2n%5J;<(|$j_LUPB%M^)1 
zZ+&KVC!1qvxRL;jj$|87oWa|_2I|d-W&=Z^!uoEv>Tanw`79}98gK>&m33~~ z43kqP1;#1L-m`7_Vq25QG}IHSuY|z45eb3wH?>n_u0c7n4x8T>NI@abr4KCUP;Z58 zXWEUruIXD06*kZ#r8#g#_GM2mlPw)!445zhG3mTn&8c|mN1%{S>+Czr*1PZ=KO%1? zWncE#qGF{YruXf}#*T3*)UTmC!Q;_!H%wIGE$&XquZX3|%|}$W)0 zTiq?lw#$qtTyxyiRXD&NhMThaios^s1b%&RQIF?jyqa5gTg@#A4=K)FwAI{_Fj7n9=azo|CberY8>t8~zT~u>;%A6gw3=JSQnqrNTdElusoZ8vA0|z$Hdxa{Kf~(3JEt67 z{gz+v9eTk%1w*!LUA;=4l6TjSEY010tUkHB+dfuw^dZ`wS4lk1@3*{4qAS3wB)S2- zN}@Z^L(l{0Dd-9CDv5l+X2;10Hakv6ut8BW01Bj3bYHi_8ea-7 z2EGzp0xS`<2fh|u3Vb8D4ER=XIq;p}3Sg<=O5l6JRR9unHGl+N10X@y0!Ywx01|XP zfCO~_kf0j?B0-$ z9Vc@jK&cObY>(3$q04~BqN|e_uvvJ(Ek7w=w1K?x(`5s?iYkQ+3Yy%VaGr+ zf*k|N2zCr4BV@Ef0NHUN)$nq1Bue(B#xlaLNYWs|Zopu{?!XYi9>7pRJ>VI^p1?4H zEf63h9D_AT&pj3(J@+_(^xWeC(sNG$NY6bHAU*dafb`rJ0O`3W1ElAk0x%QZ5+FVI zRDkr{R=`}rX#nZDrvs$twgyPgJp&*;w+%pg?wJ7Txn}{S=bjCao_h{p3k1jr8Jz~o zB|Wzjwtqku;Q`=5!Gpj1Ta!C5*Q^I1&kJq2A&r@4~!9v0qk#KG6D+p4%R?{-UU#g_W%@VE`S2f15lv( z019La1jq;#{I{?s;UUlzI|NF78psZTxZ&+Akc_Z}%mP)-+3B;sWGwcMld)(5a*7K` zn|L6zCQOk*T;)s#OZz*!ZE|By@l{)*yt5@6lV}L>Qc6vp!@A_&UtSXTU8hQTSN+WD zCz~_eg~Wdpm$m^Xc1@UG>Zvt0!O{34{%nkDJ}(cYH^$aS&jLws%z!rA zB*%^EBPEKQYLcU#IZ71Shj+x?Boj%FrTP`O7s;5@3&Y-SAK3sx3uH- zjf{G&Na`8ou?$=}J`204$B#0e;dp_~aNMZ3K0K~uq}!`1+a@8+l*Nob2E>^2K-MIm5*Jmb|BNRN49+`ccJG z<0x?}QELN895XF$aaoR))m>Hiw_A0tTb_#krc)6X#=d1U=dcRuZt2F5Bn5JNZW%dl zp2o4`it8^Khb|AL_eo4zyPv0IK~zU0C(5E$Zq`Mc?dX#y`EYt~Z>2-*^S%51d zbGs{2VXffiQ3EvBme`atzHarlZmCyAI%?9z2#f_1PMMm`d{AU9t;9R%D|(J=PUa(D z!vy7IjrHuLiGR;h>d)=zm?%wr_7!PuXmCR}zpgQ4fPE`l{mmSvLU^p-bB&Z&AA4=8 z>YK5HkH*XX-{%qeMj!qLS>xdVn2^YRVRQOJag6%66GubcWU~i2v-A zd|$SZIr_vW<}CJ(vJ-C`wKmZlQq-o27mKGovrF&w#9MD%TYKSfMVIU~v-i?@Rcf!< zml7=v)=WBDrCY))sws0(k#Xh8UTj=#jso|a&M6L@wW0emrw!eg5sn}fjW=sG17--0 z1ZE1F11}4X0(i5Q&0drdF2WjHNZcA-ttb^7Z0J5$!)7na2sV3BMzGn7GD1eP7j5W% zi2&KqeSMENMUL%z{bYnn_FliLIonSjL_y5#ZT1dztM49BBh;-Z>xdeP%jzz_=#F3X z5cB|g3VH(hf_$Ktpcl|v&>JWa6aak$eSktiA<$RQ7w9MG2RtEo0_ZR34-60t0G<>) z2@Dhr1ng)*MzEs=8NrSgWQ2^478UGnP{FGm 
zQ1sGqvPZONbk<(sk}LL?BVDwle>t*+FYQl9@*Dfpk&JLR(q{`_+Q}gK!bE&wl3)@r zSuh!xBA5b96-)(Q6ub!7$sifQP6o*cJ|=?{UeoBXoi+y9Yl8J=%e)CZ?ukU5S9p<}SS1;5EY%ZOl3`>9Ls>tL)yY7U4OP16scksLyp;2` z(orxY-Ac@_|FLXae6b6?Ukc%mD#)J-nr)cKi;d+}JlC6|y4b7PW-A|*C_Xo3FLp~u z0j3A!AO2qJUguWjAMVs?8b1>c%o^RrY!D_=54WoxYya@-NL@_2C|1(Dw})0A@)1FP}ov2>-c@fM8#MzvDbojoQ`J)nXkvc)gVzWSo?>)B(gOIb{BMFC^SxD-fjx45k*zoKmVqK2%a zQyj9I!zc7(vzZU__eF0Siu7Rfd0g658l#4*sT8#cE{!qrfh+ELRE`x&Wn6i_TRLzG zEJHkK>)BLBo5~m-3ff%T@{}7aNoRcFl9I{O^4zA8I-1_cebp9Pkj6T0ZJ0(fHW_6OnLMTUNlW_r?YVBLSVeU&a{eo76+E_2d6-9&uH4SeNImJPxw2!Zt0>~ zQQ?U4ZKk6<^O&U+J4^YtayOFHu@O9TM5aj1Qrw29hGjN$R1M3sq-t1ZDQ*+7RxqqI zyHdW*EXCU1epYhNBsN62e4DXkq@z*{%dAV>P}EvxlTzF@x}~|f?fkYRp_^m2tw;_V zbyDSAJgh5k7Am0hOQ(hkY`M^JoK6!d{?f^;c|dEbRIzV7d|H8EPZJ2nJ68t*ymNIh zz&lrm0K9W`D8M^cp8M-D0!Lz_{!EnI7@lZzKovW{54c@u>I>0+u-vD^$>YD)X zT%7~(&Q+TOAR{cr8s7`P2e#rIfUES<=Sj^w)zqol_$8kd!Sf&+hu``<*}nDJ zmV%J8+ENfQf-MChBizO}Stf=+wt6O(w)L=N1e*dWBiIy38No*iWcVR$4fjrYD~Q|b znKB{*WQRa@2vjUUb_f(P1ge@7S~{-YvXetS>$_cQgnAZbb)l%Ht(+<07$ZxKL55!t zya0?9j0MIC#sT96mcsxlwj2)dxMC+vWCS}} zkP$LAVbYMwb)DpZih*p@zh+bxplC*= z8m43gMKh+OB1WupPV;9@Vcmn{$8{$@xHDL0*gbnr%Z-JW5cQe!y0_j?)XKz*&ivduZfWOYZJV^~nnn@@aht6&M$UR~8c48jda?t9lB#drs$gw4Q>97 z%k!j^mfk0FS!ShDv~zO5ub#`0MXU@>N1?Rb?BlB{1JkwM8i?ns=BrYSj&I}cWCR;`CnFqzCr%eM17--01ZE1F11}4X0$vf= zcQwigmFQiKirZh*IcvXK&Qn<}YK3~$cc<0}^(xAmO7MPfnW8t2Q6MM)`Uv^}g@Qt$ zub?l`PtXr|LhuC8U(g>IAQ%8VDR>eXC>RLXq&^wJCiTe(HmOfW$mn>Xf%{?gkTbFA zht=(X0WG4uoE7Ofo_6sKzBiR zpogFb&{NP8u=$EIg3VWy5#E+dyp2n|BX|dRSMVe!jcba62JEdOPAT+GeHJu%stSDzchFgD@;8XHVGXSP- zX97&yz6>yJ`wH-?;8g$xngyUhvjJP=TSizSYb?PUUkknlz7c!_d@J}C_)hR0uvAb< zihNVxvbbi$ht2F%5EtK01<44u&bN&4kz~M51<45MuJdh+Tp~R7_b?eD`QO88*4_4* zEB6Th4rxb2vLJKPpW{zo2)+Qm6nqJMCHM-c7^5MD@HDzJ+|uZ(sE(E+BaGS^aodeLOp1$6QSPVa5EUjcEHcGLQx`F_el%U-DLC(hvAUjy}CM6*GdLWT9+Zq?mVS5s_7a;L2KVl6Sd zN3~v_#j+_dPEq!rZGD}4+~X#ZX{hNkKbFw6m^TwNj$fG$Mdli$oW7-p%liT;DCC+; zA6U+z-U-{zv}?{i%@itZpr=c7;EL?ao?fOY0Ws;kS$VSOPDUY}*4cNqPH@8o*U+8d@#wg_Br5S1ccF&9ZLmns2?-bASQ?41SKdOFp 
zv*|#EnW2i=)h(oSn=xmsvq=84^gekhJ^ZTp>EY#=UQPZ)liI_UZF5U?L-7=z>A7;} z-In=Eh6|L&S{boDEKI(^B+fdxahYhAs{fy&ceKe9A`_woAV?{52e z(b0!!dm9`sBk&%H5&R^?dn85zyhmabFj_Dg;5`!012#BZMzF!*GQ#(AiSKcV9|S)D zKMH;XeiHly{4Dqx_(kvwuuQNFST3mG!Ql;fm6bB&y4Y9Qq#hn_UuA>06PszH!)1ha zSc7PUivgk$E&+%}Xb%vZc_~0_=4Ak}nU@2^W?lghn|URG1YHFnL01Dv&@})ObS;1c zT?ZgR*8@mU2LK7W0r*vLBd|hn6F^aqn*oY?+yYS4<5qwq-Hb(t+YC8OZo{Hvgyh4b zFrAHuk`cC?c&M-ft>D5{l(IrLLrxb10ZM%cWHaP!Ss^*lMcmdzWZE&1jBpRuSTFb= z@VDSzV1wX3;2**Lf>0-$A%{Ke7)VC2V;~v9mP(QlGFmE$47rdq*;WZieGFu$fe>*! z4J0GjA&`vlmxed~hd?$v&JKZcoEI^I9RkS+b_gURWONABBs>I~VuwI>2&8YWrYMmu z5FjH|jsgKyb9VZyFX@WC%cd)us2tv!IH-n+2Qq8I6dA--&P=ehzp&e8Hs%yxwI#|S z3M#Wv5g&D`&}?3CZ&TcinT>yY^1v#(Kf5MOFZJ{q!W9Z zWH)9&n{9^U#;hBHZp)G3XcXCpcf{RftI2T0b)9<6^Bhh!FYfvJU7;+;j;;~jpxznk z_watVvssQC?m7E*%md%{QDzLj{ ztZS@bZclQs0=e@vjvZHAd?|eS%Kaz3PhxUjH~w3dBPYtreJTMqV6z?F(yy@eWIIwK z?zdW(JGZDrTxj9&dS$A^g|8+vtzVsF`V!zJ4!S94y!Tz%k#D3=@mL8f#--S1;@u)& zO`CFsI5&>-BtCf5+Q_;@!Of#a9j3A^I^*kBZ|j;16x6eN!YR|Fj2u}=EAg)Sik{;+ znUB1Q*>U$&UY56Rq9*=5O+@5&$893kib#izwZ+!_S!i%WSKs!fjuqg!rm2Qja31UT zTqEVx$6lK%`Ie%&E4QcmuzT{GNLI&OXf8+O_kYrLUXnk)-#zRUXc7}#evO)z?VO=? 
z3*tXJC4a~*WR5=Zi8+hCqwK`nMy*X$Eh}o%#EZq#p4q8<)74%$T+tn+4*d;~JG22HcjzBMs8i!=V->jH zbgW}vY=dE*mSJ`U1_^cp1`Bovh6wfmh6?Hd&j|Jeh6(KZ{bU4N!9hl_*^4rQ4c&L1 zkQtsp9{LOV0|NvDfF}h{0s{pD0h_%jBiPV=8Nr6`%Lq1fA5rKfb@&g^TaXPD2}7vB!u%=ya%2l%+QOH*e=ov;Y}F?jfgi8^|8!&vU*b4Eembgg|Jr};b62zQZ`ecF zhu^S=0{n*k48U*L!+>W6&jP~*!-3}n&jI#DC^Ew9vc~IJ;|;+Zz?*_MfjNRXz*~a1 z0Q({o8KJ_z2t{E#jn3G~1n($x#+yB}!5immwhD$q`HkJ$S#BypLB6uWDsrAy&}Xdx z@O;xIZv*!%EmRIvo9s%FUKm+zKw)1VI~)?Qr-kyVy=d$l)qWtUTJqr&;vOODfJrTq z8mp)8yUg3blR6BkvC1{vSh44opzIu($WYdgV|6mnoI};icxoFgnV0fQ>e5j+Bi)`% z4Zm2n#S;4AuGYPASBAIPTz@KP(qSSmHm1_Nj`zEEamx2lqA=Z*y*Ta#Ck=5`t-j7} zN<-YK(=>i99+)+{i#abc5VxxycmGIlO1d_dIZlA&jBoY0=`fiHw#Hi;YP@+&ydq_9 zkGkwzYK^z3(+Fm!3N2MOu=&(@N4A3or3C51F{>NF7b@bG~W|El6Wwti&gp z;##Mu!*y=yH)qmm;Tf*sc4nJ0E)nSTl*ba^dB0ojMUA>%H?#|^ww~TWs{`KZJ0(fI}2rwn-rz@NlQ-q&be->SVi?La^fp$ z!s5REP-5t^1>6+E_2d6-9&uH4S zeU4IU^c`In3NKdRKa_7X9p#zFET!0OJ)oaZ?naV2HiBo4NclFi6t^L&VVTVwRm1Wu zsT!79irYl26$~rQu9R;xOR=`MpZU5;!sXkHB_kb`Z)meFDT^|jl;W;Y$6+?fZ%Y!o zIcD36$_S%njnP=+dBO9* z7{M68<^aeD@5mbOV2yVL?*i`$-UH?e<^uBs^MLt+`M?6f0>I_~$Oym68oy$V6@nGO zO2JCtH^Fbf?}E*d18|yN`aG%WRW)q}09V0g0APh_^2H;7>4Ij!48f7WOhI$tWx-Lv zD*~GVAS2ie02$$KT*77m*m4l^1)B*eBiK=pjIbP4sJNq`#^LuqPqy!Uw#6XithN}0 zj9`mF$OsuL20{9T(+QD$K!<$*cu-&`f@Fk2Sc3|_g8?e|+U!ReVJ6n#A?#&)SaP!$?Nfc^zA6TJ<HvKN+XIy3v-LA&ggmT4 zHJSqfs?i(-P>tqbfRcQN0F>lA6rd#EVE`rh4hJaUYllEGf*k_M2r-60RdYg1$JJYQ zGTBjsYJ~EOvIbFBlN+@c4#bVx8{kGQ0Ju^60NkjBfUTV=BXFY@VGVB7i2yh1B*50r zlo8&SHQvV>iv){+4+I|owrrG)@Vl(>JJwhwSOxqc_yhP;@F%cZuo_q+SOb)Iq|m@! 
zQN6fql>X&$#f}za1pC4f8NvQ$BqN;1P4={{uUaY<2ewHxNFo1+vw>0iGRu}X=U*2RxXG!x@ORS)HMm_1LWCg`DrlS}} ztaDEDXHH?=gX2ebCqB3{SZ3Hgdrr%etOQ(GuhB}yqOlHC)`i_t@4$u4Qmnm`|IB&a zTWTn&{hj%_brN?cY1cK!PZY*&w#pbe>%EB}89S^wPVb*qjg^U|A`DA%23x@s<6nzT0M5?LDC zrc#5$oKToDthxQ;4`Tyy;Ha|;X3-`ny5`o^1DR*ZUmCdcOT;7fa&HrfFA=GVt5xi= zBjw)K^`hKcK~~}DSUZIZ>$?L_F@1}*O=drH#)pqw^wzqh`%p}`H^I(3a913bgB#oEka zDul=SJ=dg*Uu>#Ko3Vr6E`+y>l-HzowHwDS{h}Q3W^G-gL=hu_ayV(w_Kuu%iq-no zGea8d?6#cGoS}6K&T~sWs@xsV^fZ3CI~=P4XK{zaQ?yCIv!U6+U71)X8sKM|RN5x~ zGkJgq3a}~17G;b16yiB#36vuinOfEr4-A~D<@N;WnHfwRHZ?YDx0xVLz9Lb6`$X< zV-*{7NAOfig&M*yNZX6upVh_O8yGIw2Y60UAK(RP4S`GVHK0>K%;LO~mVz~3_g0)Ni}2>d-8 zAn^Ab;6s6p(3cTz!x{ws-VPA>dj~+^Z%2T@-#Znvzo>K8ezlyZvRc#%^{($Os1fR2 zl(m4s{Q_-Jz?OXkeSktiA<$RQ7w9MG2RtEo0_ZR34-60t0G<>)2@Dhr1bn>J&?fX@ zD5|R1ggzPJQ>;OR>}LQGvY!KA2)+Qm6nqJMCHM+hB3J_0ggzPJA6ercStHb`fjhH$ z$eGx5rVY}!RSo17HlE=U6o@2>Z%+E zP*>%6fVwIt0B;CR1l|;!1n^e*765ORKN;Yy@}~e4*lh_=VE0sj0=umM3hbT+%oEhC zzuk+=R~}^NPGpPgG0HMQ2VlA227n54Hv%gJHvuaJHv`Px-U5&`eJeoHbjF54`?~A5 zwFRE+e31N(`YJEsfYSui0IKZTp^%L5G1ees`V+tog=B=l$~H76W7^3c(W24JX7(o} z{EO!?Ted+)IAWs+H3N7aI}+e|tT{lHxuXDE>RU!&Qu-pS@r9rrz@+rW0F%;}08C1^ z2bh$;6rjr7WdK#?E(fSGcLhL|xhnyx%v}ZiAh;U%QE(0Lli*t5XTf#AFM=&@7-Z{Q za_Q_aNJg;3AQ^!lu|LEmJ`#KcP&j5Q90n4Oppa{HEwH>py%n~d zX*ZS?((+lTuz?;g&4Cp!SzGTpsJ-tk}bbzP7dSjw#$b25Aq;qHG$(}nIm2_HX z-`P6B4Hu9vn^P#~JLFA6QMO2NF}-g$_R*&&hZhx9+x#TM{0ErmCcn zE#sE=5)Os(y8|=|F{< zrHXZ$TT1CRW6oG-k^E=reezO@_-XM|#LF?on*50-wTCU+=90O diff --git a/pkg/ingester/cortex-wal/00000006 b/pkg/ingester/cortex-wal/00000006 deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/pkg/ingester/cortex-wal/checkpoint.001065/00000000 b/pkg/ingester/cortex-wal/checkpoint.001065/00000000 deleted file mode 100644 index b227d8213d8ff9b4cf2dd4632a0213f36d4bd371..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 32768 zcmeI&SxgmY6bJCRFgn9h=BU&ejJ>tAxixJ%!UYj*qr~{oq%n2Jg@(35$auREMcas?;&r 
ztx5e3j;|z#G$wYPsx~W7spUn<&7s*XGJMw`B|rLCWt0rZG}xrG^T(8eDA`l^=!e_8&NF2Z;4tdDX=*L6$@_?XuG$~Ml zJerV)%Fm-xgM8#si9CAvd2G_*4)WN9Jd*f%^f;jod3=mKI{0~ zgxfwIZsg$>;Gx%^+4haa<4_Uu_+c;}2?Ka|r$;DU_2Uuh<)H@6qfvo!#zSg!YqG?z zqe6o`)KQ__yD4|_>&Vxk8+GKPjzoSPcbsqrb$o<6{^r+F=Yl<`V>arT&97sx3AcPY z;!#JufDWU<`PfsIjsq3=Y#DTTqy*9t2G{&_gn4zu2Cbt_f#Wzl+HiQp@$;zBpa^-? z;P8kn;+m_K?AfwK2M_Ytf;>|BdE9eCJ@Uv#9)0{gF1p|l@>qa87Vz^pY(kHZ#{}dt zL4Zf}-HI1RSv)RpL>_|(fiGMyYN2is33T3g2z1xiuImC)=~ zwPSrM*GF5*KH4%3a*#$D(zwAtIdXOAL>jqBBaWX&mlIAQjSrBPvhM8`jnms7O2a*XhIj1 zOt|CYk%T;wM0h+sEiKjJ(Y6O?$DnVtNr89_g=Rk*7=9kr8Wb=dQnmSn zD$D#l3U%m19)%IljaKEC_?XBcU?nZpU?a|rQpIF*gMaJD(cw12$icZ`^1JAC!bxbFK)1Kh<2rP2RER~MIH9I zgDYMue%+Z#Y40#r8exrCkq=TI6iFAh8eq8r>(Gjh_~7z@){EKH;M3)8JS^s~ZL+G;)@DNp4uOz%745*DT_ zH2f`qe*VVvvI~x2W15bQX*z#ndenqIUt^krjcJNNW2*l-_toi^57EYyN-_V%lzor8 zsXa=u)ory?EghnjDc!~edbrr*Htyw}tBzPaL_1TujSIxX7PoQ6%|#>Lv3Q7frqqcA z;$e$U%t-$|;U|lSXk|(_E`fO1;>Km`<5}*Vq0ZM=ppaduZN`Mle1SkPYfD)htC;>`<5}*Vq0ZM=ppaduZ zN`Mle1SkPYfD)htC;>`<5}*Vq0ZM=ppaduZN`Mle1SkPYfD)htC;>`<5}*Vq0ZQP1 Gn83ek-*=k; diff --git a/pkg/ingester/cortex-wal/checkpoint.001066/00000000 b/pkg/ingester/cortex-wal/checkpoint.001066/00000000 deleted file mode 100644 index f1c6a6e8ea06392b74d6a7bc37ffa9d74266bd65..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 32768 zcmeI&Sxi)C6bJA-aP?l6a*s;A7h|WEHdE84R~Qh%HcE^SO&U{oTxe)(cxV%~3*Br{ z)7EOWwY5!TQ9z7!c_{81n;?=jP1?HPf{L<b`xzg*R#k^UlVUhxSE`cikQ_rK(;=zRsSZ<N`Cb3$|xC*X|PFW=Z`4`QL?A-(GRnK ziRU-#(Bow1H%GAZ$ve2u_d4OMi=F>OW9M&`aGyWtf_)}CKX(K>KX*Ux`N;hybbcLW zWJ>HcNzmEfNI1;(H%g-9un^s7D#c$6_7{hJ5lkvM=y9P*H%(T|7hy>w>+=V-E6|!_Q-% z3AcSb+{nW%z(cP+v+Wy;$DtzR@xx#|5(e<_PLEKy>c=D0%R>#CN23DejEB_d)?|ra zM}-D?sG~x;cT?`<*O9M7H|oeo9f|xp?l|EL>i7tC{LQbU&INl=$86Lwn_tIX6K?r* z#G{UQ0Ubtz^RcHa9S17#*)r(xNC~7P46ga<2=nTQ4O&N=0>^Q9wBhiG~tvUptHh&%=z9`S*AIH19ghXZ-Y9g`=4WI8!k4z{06w6=~b3Y4OZE1}t~ zYRCFiu9voyy|iT-8?8mG5El!ki%4e#0!0$2QKgh*~iBh|sSQ3gbGC%vk$ z*O!HJ73W6JdM+IJ|Gr#vNTK#aS-p9(ApdTyO6sSfX 
z*O13p{>RF34GNLRapa-$KURu#xQjfBkjEr`9v&xLKpvkVj~;#=^)5JwJmw>h`TRT% znQ+I)BMEsViST%OT3V{bqiqk)jzQniCI#X#6q@~b4E6Fbg67epus5YFo)+X`@bjqB z*qc&zz6yCr{5%SD=tUj{$YUZuk3J`yLmtbJM<+jz^DfwrJkpRy8b6N%CUp6DBqEPQ z0Uj|m-)^n7c=T<+=`kpe2?Ka|-?Avs;>Sbr@)#X7k5&b$kVh->7{kw_T7v?{L#j5P zP-U5)N1+aV$fGdgxzVcp62FhTPB@P~K1Cnh{5~$Y-~jrVhd$=<`#5Mqx6j9T^f6w* z$FSa%`enKQ6=I%U5A_k}=VLl6#NuMtsBqa&2X2;0!*J=C_$1#-%63_A5!RrLk zU6h#ckI#jPE=&OzMtx=1v);Mk7A{KZu`;m3#iiI5j}^mnKV_YdhhR}kJ$fJ>_UO^S z+OTA&#Y40xr5-&H4_ox;Mt;w0)fNxYrZgPd{ML?e?^60q&`arc1!{2lxQ?;e+2a2R)wg}8exafQh zT=7~d$I>BMmeL(upvA)$cW^OM$F}b+9-?I_wP6oFS=cDS@nMTL%-Fp;ZH47Sv@NB( zmO$gf7I!Vt@AmzcXYt@HOYzf#-g{cQ!uQ`jaB|q<3g2iQx~0uaBLP+l7Gl(w2AUkU z=u2b9CM{p@<01IDV_H0Hym-jJPT67k5G_opPY*OcY|*F3+%VQOT0BGx(+IfkSC~e4 z3sYCng=xD2C$KPW$HFv~zc8)Upco6&S}aVX`3uvnI`p%`RN87jA1P1eFHG+{;Sv_6 zD>VERKtF$Ddf5d>urW=?#x$M3F+FNRpRX}Z!NxR2pfT0|ocro@%ZF%VN~M_pV#1`<5}*Vq0ZM=ppaduZN`Mle1SkPYfD)htC;>`<5}*Vq0ZM=ppaduZN`Mle1SkPY zfD)htC;>`<5}*Vq0ZM=ppaduZN`Mle1SkPYfD)htC;>`<5}*Vq0ZM=ppaduZN`Mmh HA13fGlPPzb diff --git a/pkg/ingester/wal.go b/pkg/ingester/wal.go index 58289eeb5da..6ed719193c0 100644 --- a/pkg/ingester/wal.go +++ b/pkg/ingester/wal.go @@ -170,9 +170,9 @@ func (w *walWrapper) run() { elapsed := time.Since(start) level.Info(util.Logger).Log("msg", "checkpoint done", "time", elapsed.String()) case <-w.quit: - // if err := w.checkpoint(); err != nil { - // level.Error(util.Logger).Log("msg", "error checkpointing series during shutdown", "err", err) - // } + if err := w.checkpoint(); err != nil { + level.Error(util.Logger).Log("msg", "error checkpointing series during shutdown", "err", err) + } return } } From f6ddbdf1259925e1ac33cd048218027d985330ad Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Thu, 10 Oct 2019 17:38:24 +0530 Subject: [PATCH 35/56] Use tsdb from prometheus/prometheus repo Signed-off-by: Ganesh Vernekar --- go.mod | 1 - go.sum | 3 - pkg/ingester/wal.go | 6 +- vendor/github.com/prometheus/tsdb/LICENSE | 201 ----- 
.../prometheus/tsdb/errors/errors.go | 62 -- .../prometheus/tsdb/fileutil/dir_unix.go | 22 - .../prometheus/tsdb/fileutil/dir_windows.go | 46 - .../prometheus/tsdb/fileutil/fileutil.go | 159 ---- .../prometheus/tsdb/fileutil/flock.go | 41 - .../prometheus/tsdb/fileutil/flock_plan9.go | 32 - .../prometheus/tsdb/fileutil/flock_solaris.go | 59 -- .../prometheus/tsdb/fileutil/flock_unix.go | 54 -- .../prometheus/tsdb/fileutil/flock_windows.go | 36 - .../prometheus/tsdb/fileutil/mmap.go | 61 -- .../prometheus/tsdb/fileutil/mmap_386.go | 18 - .../prometheus/tsdb/fileutil/mmap_amd64.go | 18 - .../prometheus/tsdb/fileutil/mmap_unix.go | 30 - .../prometheus/tsdb/fileutil/mmap_windows.go | 46 - .../prometheus/tsdb/fileutil/preallocate.go | 54 -- .../tsdb/fileutil/preallocate_darwin.go | 41 - .../tsdb/fileutil/preallocate_linux.go | 47 - .../tsdb/fileutil/preallocate_other.go | 25 - .../prometheus/tsdb/fileutil/sync.go | 24 - .../prometheus/tsdb/fileutil/sync_darwin.go | 27 - .../prometheus/tsdb/fileutil/sync_linux.go | 29 - .../prometheus/tsdb/wal/live_reader.go | 322 ------- .../github.com/prometheus/tsdb/wal/reader.go | 200 ---- vendor/github.com/prometheus/tsdb/wal/wal.go | 852 ------------------ vendor/modules.txt | 4 - 29 files changed, 3 insertions(+), 2517 deletions(-) delete mode 100644 vendor/github.com/prometheus/tsdb/LICENSE delete mode 100644 vendor/github.com/prometheus/tsdb/errors/errors.go delete mode 100644 vendor/github.com/prometheus/tsdb/fileutil/dir_unix.go delete mode 100644 vendor/github.com/prometheus/tsdb/fileutil/dir_windows.go delete mode 100644 vendor/github.com/prometheus/tsdb/fileutil/fileutil.go delete mode 100644 vendor/github.com/prometheus/tsdb/fileutil/flock.go delete mode 100644 vendor/github.com/prometheus/tsdb/fileutil/flock_plan9.go delete mode 100644 vendor/github.com/prometheus/tsdb/fileutil/flock_solaris.go delete mode 100644 vendor/github.com/prometheus/tsdb/fileutil/flock_unix.go delete mode 100644 
vendor/github.com/prometheus/tsdb/fileutil/flock_windows.go delete mode 100644 vendor/github.com/prometheus/tsdb/fileutil/mmap.go delete mode 100644 vendor/github.com/prometheus/tsdb/fileutil/mmap_386.go delete mode 100644 vendor/github.com/prometheus/tsdb/fileutil/mmap_amd64.go delete mode 100644 vendor/github.com/prometheus/tsdb/fileutil/mmap_unix.go delete mode 100644 vendor/github.com/prometheus/tsdb/fileutil/mmap_windows.go delete mode 100644 vendor/github.com/prometheus/tsdb/fileutil/preallocate.go delete mode 100644 vendor/github.com/prometheus/tsdb/fileutil/preallocate_darwin.go delete mode 100644 vendor/github.com/prometheus/tsdb/fileutil/preallocate_linux.go delete mode 100644 vendor/github.com/prometheus/tsdb/fileutil/preallocate_other.go delete mode 100644 vendor/github.com/prometheus/tsdb/fileutil/sync.go delete mode 100644 vendor/github.com/prometheus/tsdb/fileutil/sync_darwin.go delete mode 100644 vendor/github.com/prometheus/tsdb/fileutil/sync_linux.go delete mode 100644 vendor/github.com/prometheus/tsdb/wal/live_reader.go delete mode 100644 vendor/github.com/prometheus/tsdb/wal/reader.go delete mode 100644 vendor/github.com/prometheus/tsdb/wal/wal.go diff --git a/go.mod b/go.mod index a3f74c2542c..24962c178c0 100644 --- a/go.mod +++ b/go.mod @@ -56,7 +56,6 @@ require ( github.com/prometheus/client_golang v1.1.0 github.com/prometheus/common v0.7.0 github.com/prometheus/prometheus v1.8.2-0.20190918104050-8744afdd1ea0 - github.com/prometheus/tsdb v0.10.0 github.com/satori/go.uuid v1.2.0 // indirect github.com/segmentio/fasthash v0.0.0-20180216231524-a72b379d632e github.com/sercand/kuberesolver v2.1.0+incompatible // indirect diff --git a/go.sum b/go.sum index 01580c527f0..b7caf0a226b 100644 --- a/go.sum +++ b/go.sum @@ -114,7 +114,6 @@ github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSs github.com/dgrijalva/jwt-go v0.0.0-20160705203006-01aeca54ebda/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ= 
github.com/dgrijalva/jwt-go v3.2.0+incompatible h1:7qlOGliEKZXTDg6OTjfoBKDXWrumCAMpl/TFQ4/5kLM= github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ= -github.com/dgryski/go-sip13 v0.0.0-20181026042036-e10d5fee7954/go.mod h1:vAd38F8PWV+bWy6jNmig1y/TA+kYO4g3RSRF0IAv0no= github.com/dgryski/go-sip13 v0.0.0-20190329191031-25c5027a8c7b/go.mod h1:vAd38F8PWV+bWy6jNmig1y/TA+kYO4g3RSRF0IAv0no= github.com/docker/go-units v0.3.3/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= github.com/docker/go-units v0.4.0 h1:3uh0PgVws3nIA0Q+MwDC8yjEPf9zjRfZZWXZYDct3Tw= @@ -496,8 +495,6 @@ github.com/prometheus/prometheus v0.0.0-20190818123050-43acd0e2e93f h1:7C9G4yUog github.com/prometheus/prometheus v0.0.0-20190818123050-43acd0e2e93f/go.mod h1:rMTlmxGCvukf2KMu3fClMDKLLoJ5hl61MhcJ7xKakf0= github.com/prometheus/prometheus v1.8.2-0.20190918104050-8744afdd1ea0 h1:W4dTblzSVIBNfDimJhh70OpZQQMwLVpwK50scXdH94w= github.com/prometheus/prometheus v1.8.2-0.20190918104050-8744afdd1ea0/go.mod h1:elNqjVbwD3sCZJqKzyN7uEuwGcCpeJvv67D6BrHsDbw= -github.com/prometheus/tsdb v0.10.0 h1:If5rVCMTp6W2SiRAQFlbpJNgVlgMEd+U2GZckwK38ic= -github.com/prometheus/tsdb v0.10.0/go.mod h1:oi49uRhEe9dPUTlS3JRZOwJuVi6tmh10QSgwXEyGCt4= github.com/rogpeppe/fastuuid v0.0.0-20150106093220-6724a57986af/go.mod h1:XWv6SoW27p1b0cqNHllgS5HIMJraePCO15w5zCzIWYg= github.com/rs/cors v1.6.0 h1:G9tHG9lebljV9mfp9SNPDL36nCDxmo3zTlAf1YgvzmI= github.com/rs/cors v1.6.0/go.mod h1:gFx+x8UowdsKA9AchylcLynDq+nNFfI8FkUZdN/jGCU= diff --git a/pkg/ingester/wal.go b/pkg/ingester/wal.go index 6ed719193c0..7f710823de3 100644 --- a/pkg/ingester/wal.go +++ b/pkg/ingester/wal.go @@ -18,9 +18,9 @@ import ( "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/model" - tsdb_errors "github.com/prometheus/tsdb/errors" - "github.com/prometheus/tsdb/fileutil" - "github.com/prometheus/tsdb/wal" + tsdb_errors "github.com/prometheus/prometheus/tsdb/errors" + 
"github.com/prometheus/prometheus/tsdb/fileutil" + "github.com/prometheus/prometheus/tsdb/wal" "github.com/cortexproject/cortex/pkg/ingester/client" "github.com/cortexproject/cortex/pkg/util" diff --git a/vendor/github.com/prometheus/tsdb/LICENSE b/vendor/github.com/prometheus/tsdb/LICENSE deleted file mode 100644 index 261eeb9e9f8..00000000000 --- a/vendor/github.com/prometheus/tsdb/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. 
- - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. 
- - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. 
- - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/vendor/github.com/prometheus/tsdb/errors/errors.go b/vendor/github.com/prometheus/tsdb/errors/errors.go deleted file mode 100644 index 69d36624800..00000000000 --- a/vendor/github.com/prometheus/tsdb/errors/errors.go +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright 2016 The etcd Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package errors - -import ( - "bytes" - "fmt" -) - -// The MultiError type implements the error interface, and contains the -// Errors used to construct it. 
-type MultiError []error - -// Returns a concatenated string of the contained errors -func (es MultiError) Error() string { - var buf bytes.Buffer - - if len(es) > 1 { - fmt.Fprintf(&buf, "%d errors: ", len(es)) - } - - for i, err := range es { - if i != 0 { - buf.WriteString("; ") - } - buf.WriteString(err.Error()) - } - - return buf.String() -} - -// Add adds the error to the error list if it is not nil. -func (es *MultiError) Add(err error) { - if err == nil { - return - } - if merr, ok := err.(MultiError); ok { - *es = append(*es, merr...) - } else { - *es = append(*es, err) - } -} - -// Err returns the error list as an error or nil if it is empty. -func (es MultiError) Err() error { - if len(es) == 0 { - return nil - } - return es -} diff --git a/vendor/github.com/prometheus/tsdb/fileutil/dir_unix.go b/vendor/github.com/prometheus/tsdb/fileutil/dir_unix.go deleted file mode 100644 index 58a77dfc1a9..00000000000 --- a/vendor/github.com/prometheus/tsdb/fileutil/dir_unix.go +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright 2016 The etcd Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build !windows - -package fileutil - -import "os" - -// OpenDir opens a directory for syncing. 
-func OpenDir(path string) (*os.File, error) { return os.Open(path) } diff --git a/vendor/github.com/prometheus/tsdb/fileutil/dir_windows.go b/vendor/github.com/prometheus/tsdb/fileutil/dir_windows.go deleted file mode 100644 index c123395c004..00000000000 --- a/vendor/github.com/prometheus/tsdb/fileutil/dir_windows.go +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2016 The etcd Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build windows - -package fileutil - -import ( - "os" - "syscall" -) - -// OpenDir opens a directory in windows with write access for syncing. 
-func OpenDir(path string) (*os.File, error) { - fd, err := openDir(path) - if err != nil { - return nil, err - } - return os.NewFile(uintptr(fd), path), nil -} - -func openDir(path string) (fd syscall.Handle, err error) { - if len(path) == 0 { - return syscall.InvalidHandle, syscall.ERROR_FILE_NOT_FOUND - } - pathp, err := syscall.UTF16PtrFromString(path) - if err != nil { - return syscall.InvalidHandle, err - } - access := uint32(syscall.GENERIC_READ | syscall.GENERIC_WRITE) - sharemode := uint32(syscall.FILE_SHARE_READ | syscall.FILE_SHARE_WRITE) - createmode := uint32(syscall.OPEN_EXISTING) - fl := uint32(syscall.FILE_FLAG_BACKUP_SEMANTICS) - return syscall.CreateFile(pathp, access, sharemode, nil, createmode, fl, 0) -} diff --git a/vendor/github.com/prometheus/tsdb/fileutil/fileutil.go b/vendor/github.com/prometheus/tsdb/fileutil/fileutil.go deleted file mode 100644 index 4088f522aeb..00000000000 --- a/vendor/github.com/prometheus/tsdb/fileutil/fileutil.go +++ /dev/null @@ -1,159 +0,0 @@ -// Copyright 2018 The Prometheus Authors -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package fileutil provides utility methods used when dealing with the filesystem in tsdb. -// It is largely copied from github.com/coreos/etcd/pkg/fileutil to avoid the -// dependency chain it brings with it. -// Please check github.com/coreos/etcd for licensing information. 
-package fileutil - -import ( - "io/ioutil" - "os" - "path/filepath" - "sort" - "strings" -) - -// CopyDirs copies all directories, subdirectories and files recursively including the empty folders. -// Source and destination must be full paths. -func CopyDirs(src, dest string) error { - if err := os.MkdirAll(dest, 0777); err != nil { - return err - } - files, err := readDirs(src) - if err != nil { - return err - } - - for _, f := range files { - dp := filepath.Join(dest, f) - sp := filepath.Join(src, f) - - stat, err := os.Stat(sp) - if err != nil { - return err - } - - // Empty directories are also created. - if stat.IsDir() { - if err := os.MkdirAll(dp, 0777); err != nil { - return err - } - continue - } - - if err := copyFile(sp, dp); err != nil { - return err - } - } - return nil -} - -func copyFile(src, dest string) error { - data, err := ioutil.ReadFile(src) - if err != nil { - return err - } - - err = ioutil.WriteFile(dest, data, 0644) - if err != nil { - return err - } - return nil -} - -// readDirs reads the source directory recursively and -// returns relative paths to all files and empty directories. -func readDirs(src string) ([]string, error) { - var files []string - - err := filepath.Walk(src, func(path string, f os.FileInfo, err error) error { - relativePath := strings.TrimPrefix(path, src) - if len(relativePath) > 0 { - files = append(files, relativePath) - } - return nil - }) - if err != nil { - return nil, err - } - return files, nil -} - -// ReadDir returns the filenames in the given directory in sorted order. -func ReadDir(dirpath string) ([]string, error) { - dir, err := os.Open(dirpath) - if err != nil { - return nil, err - } - defer dir.Close() - names, err := dir.Readdirnames(-1) - if err != nil { - return nil, err - } - sort.Strings(names) - return names, nil -} - -// Rename safely renames a file. 
-func Rename(from, to string) error { - if err := os.Rename(from, to); err != nil { - return err - } - - // Directory was renamed; sync parent dir to persist rename. - pdir, err := OpenDir(filepath.Dir(to)) - if err != nil { - return err - } - - if err = pdir.Sync(); err != nil { - pdir.Close() - return err - } - return pdir.Close() -} - -// Replace moves a file or directory to a new location and deletes any previous data. -// It is not atomic. -func Replace(from, to string) error { - // Remove destination only if it is a dir otherwise leave it to os.Rename - // as it replaces the destination file and is atomic. - { - f, err := os.Stat(to) - if !os.IsNotExist(err) { - if err == nil && f.IsDir() { - if err := os.RemoveAll(to); err != nil { - return err - } - } - } - } - - if err := os.Rename(from, to); err != nil { - return err - } - - // Directory was renamed; sync parent dir to persist rename. - pdir, err := OpenDir(filepath.Dir(to)) - if err != nil { - return err - } - - if err = pdir.Sync(); err != nil { - pdir.Close() - return err - } - return pdir.Close() -} diff --git a/vendor/github.com/prometheus/tsdb/fileutil/flock.go b/vendor/github.com/prometheus/tsdb/fileutil/flock.go deleted file mode 100644 index d5eaa7ca2ad..00000000000 --- a/vendor/github.com/prometheus/tsdb/fileutil/flock.go +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2016 The Prometheus Authors -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package fileutil - -import ( - "os" - "path/filepath" -) - -// Releaser provides the Release method to release a file lock. -type Releaser interface { - Release() error -} - -// Flock locks the file with the provided name. If the file does not exist, it is -// created. The returned Releaser is used to release the lock. existed is true -// if the file to lock already existed. A non-nil error is returned if the -// locking has failed. Neither this function nor the returned Releaser is -// goroutine-safe. -func Flock(fileName string) (r Releaser, existed bool, err error) { - if err = os.MkdirAll(filepath.Dir(fileName), 0755); err != nil { - return nil, false, err - } - - _, err = os.Stat(fileName) - existed = err == nil - - r, err = newLock(fileName) - return r, existed, err -} diff --git a/vendor/github.com/prometheus/tsdb/fileutil/flock_plan9.go b/vendor/github.com/prometheus/tsdb/fileutil/flock_plan9.go deleted file mode 100644 index 8a3d44c5e14..00000000000 --- a/vendor/github.com/prometheus/tsdb/fileutil/flock_plan9.go +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright 2016 The Prometheus Authors -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package fileutil - -import "os" - -type plan9Lock struct { - f *os.File -} - -func (l *plan9Lock) Release() error { - return l.f.Close() -} - -func newLock(fileName string) (Releaser, error) { - f, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE, os.ModeExclusive|0644) - if err != nil { - return nil, err - } - return &plan9Lock{f}, nil -} diff --git a/vendor/github.com/prometheus/tsdb/fileutil/flock_solaris.go b/vendor/github.com/prometheus/tsdb/fileutil/flock_solaris.go deleted file mode 100644 index 7f527ae6c49..00000000000 --- a/vendor/github.com/prometheus/tsdb/fileutil/flock_solaris.go +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright 2016 The Prometheus Authors -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -// +build solaris - -package fileutil - -import ( - "os" - "syscall" -) - -type unixLock struct { - f *os.File -} - -func (l *unixLock) Release() error { - if err := l.set(false); err != nil { - return err - } - return l.f.Close() -} - -func (l *unixLock) set(lock bool) error { - flock := syscall.Flock_t{ - Type: syscall.F_UNLCK, - Start: 0, - Len: 0, - Whence: 1, - } - if lock { - flock.Type = syscall.F_WRLCK - } - return syscall.FcntlFlock(l.f.Fd(), syscall.F_SETLK, &flock) -} - -func newLock(fileName string) (Releaser, error) { - f, err := os.OpenFile(fileName, os.O_RDWR|os.O_CREATE, 0644) - if err != nil { - return nil, err - } - l := &unixLock{f} - err = l.set(true) - if err != nil { - f.Close() - return nil, err - } - return l, nil -} diff --git a/vendor/github.com/prometheus/tsdb/fileutil/flock_unix.go b/vendor/github.com/prometheus/tsdb/fileutil/flock_unix.go deleted file mode 100644 index f493fbd831d..00000000000 --- a/vendor/github.com/prometheus/tsdb/fileutil/flock_unix.go +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright 2016 The Prometheus Authors -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -// +build darwin dragonfly freebsd linux netbsd openbsd - -package fileutil - -import ( - "os" - "syscall" -) - -type unixLock struct { - f *os.File -} - -func (l *unixLock) Release() error { - if err := l.set(false); err != nil { - return err - } - return l.f.Close() -} - -func (l *unixLock) set(lock bool) error { - how := syscall.LOCK_UN - if lock { - how = syscall.LOCK_EX - } - return syscall.Flock(int(l.f.Fd()), how|syscall.LOCK_NB) -} - -func newLock(fileName string) (Releaser, error) { - f, err := os.OpenFile(fileName, os.O_RDWR|os.O_CREATE, 0644) - if err != nil { - return nil, err - } - l := &unixLock{f} - err = l.set(true) - if err != nil { - f.Close() - return nil, err - } - return l, nil -} diff --git a/vendor/github.com/prometheus/tsdb/fileutil/flock_windows.go b/vendor/github.com/prometheus/tsdb/fileutil/flock_windows.go deleted file mode 100644 index 1c17ff4ea30..00000000000 --- a/vendor/github.com/prometheus/tsdb/fileutil/flock_windows.go +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2016 The Prometheus Authors -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package fileutil - -import "syscall" - -type windowsLock struct { - fd syscall.Handle -} - -func (fl *windowsLock) Release() error { - return syscall.Close(fl.fd) -} - -func newLock(fileName string) (Releaser, error) { - pathp, err := syscall.UTF16PtrFromString(fileName) - if err != nil { - return nil, err - } - fd, err := syscall.CreateFile(pathp, syscall.GENERIC_READ|syscall.GENERIC_WRITE, 0, nil, syscall.CREATE_ALWAYS, syscall.FILE_ATTRIBUTE_NORMAL, 0) - if err != nil { - return nil, err - } - return &windowsLock{fd}, nil -} diff --git a/vendor/github.com/prometheus/tsdb/fileutil/mmap.go b/vendor/github.com/prometheus/tsdb/fileutil/mmap.go deleted file mode 100644 index 26fc80c5850..00000000000 --- a/vendor/github.com/prometheus/tsdb/fileutil/mmap.go +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright 2018 The Prometheus Authors -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package fileutil - -import ( - "os" - - "github.com/pkg/errors" -) - -type MmapFile struct { - f *os.File - b []byte -} - -func OpenMmapFile(path string) (*MmapFile, error) { - f, err := os.Open(path) - if err != nil { - return nil, errors.Wrap(err, "try lock file") - } - info, err := f.Stat() - if err != nil { - return nil, errors.Wrap(err, "stat") - } - - b, err := mmap(f, int(info.Size())) - if err != nil { - return nil, errors.Wrap(err, "mmap") - } - - return &MmapFile{f: f, b: b}, nil -} - -func (f *MmapFile) Close() error { - err0 := munmap(f.b) - err1 := f.f.Close() - - if err0 != nil { - return err0 - } - return err1 -} - -func (f *MmapFile) File() *os.File { - return f.f -} - -func (f *MmapFile) Bytes() []byte { - return f.b -} diff --git a/vendor/github.com/prometheus/tsdb/fileutil/mmap_386.go b/vendor/github.com/prometheus/tsdb/fileutil/mmap_386.go deleted file mode 100644 index 66b9d368034..00000000000 --- a/vendor/github.com/prometheus/tsdb/fileutil/mmap_386.go +++ /dev/null @@ -1,18 +0,0 @@ -// Copyright 2018 The Prometheus Authors -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -// +build windows - -package fileutil - -const maxMapSize = 0x7FFFFFFF // 2GB diff --git a/vendor/github.com/prometheus/tsdb/fileutil/mmap_amd64.go b/vendor/github.com/prometheus/tsdb/fileutil/mmap_amd64.go deleted file mode 100644 index 4b523bc67c2..00000000000 --- a/vendor/github.com/prometheus/tsdb/fileutil/mmap_amd64.go +++ /dev/null @@ -1,18 +0,0 @@ -// Copyright 2018 The Prometheus Authors -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build windows - -package fileutil - -const maxMapSize = 0xFFFFFFFFFFFF // 256TB diff --git a/vendor/github.com/prometheus/tsdb/fileutil/mmap_unix.go b/vendor/github.com/prometheus/tsdb/fileutil/mmap_unix.go deleted file mode 100644 index 043f4d408cc..00000000000 --- a/vendor/github.com/prometheus/tsdb/fileutil/mmap_unix.go +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2017 The Prometheus Authors -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -// +build !windows,!plan9 - -package fileutil - -import ( - "os" - - "golang.org/x/sys/unix" -) - -func mmap(f *os.File, length int) ([]byte, error) { - return unix.Mmap(int(f.Fd()), 0, length, unix.PROT_READ, unix.MAP_SHARED) -} - -func munmap(b []byte) (err error) { - return unix.Munmap(b) -} diff --git a/vendor/github.com/prometheus/tsdb/fileutil/mmap_windows.go b/vendor/github.com/prometheus/tsdb/fileutil/mmap_windows.go deleted file mode 100644 index b9422641239..00000000000 --- a/vendor/github.com/prometheus/tsdb/fileutil/mmap_windows.go +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2017 The Prometheus Authors -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package fileutil - -import ( - "os" - "syscall" - "unsafe" -) - -func mmap(f *os.File, size int) ([]byte, error) { - low, high := uint32(size), uint32(size>>32) - h, errno := syscall.CreateFileMapping(syscall.Handle(f.Fd()), nil, syscall.PAGE_READONLY, high, low, nil) - if h == 0 { - return nil, os.NewSyscallError("CreateFileMapping", errno) - } - - addr, errno := syscall.MapViewOfFile(h, syscall.FILE_MAP_READ, 0, 0, uintptr(size)) - if addr == 0 { - return nil, os.NewSyscallError("MapViewOfFile", errno) - } - - if err := syscall.CloseHandle(syscall.Handle(h)); err != nil { - return nil, os.NewSyscallError("CloseHandle", err) - } - - return (*[maxMapSize]byte)(unsafe.Pointer(addr))[:size], nil -} - -func munmap(b []byte) error { - if err := syscall.UnmapViewOfFile((uintptr)(unsafe.Pointer(&b[0]))); err != nil { - return os.NewSyscallError("UnmapViewOfFile", err) - } - return nil -} diff --git a/vendor/github.com/prometheus/tsdb/fileutil/preallocate.go b/vendor/github.com/prometheus/tsdb/fileutil/preallocate.go deleted file mode 100644 index c747b7cf81f..00000000000 --- a/vendor/github.com/prometheus/tsdb/fileutil/preallocate.go +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright 2015 The etcd Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package fileutil - -import ( - "io" - "os" -) - -// Preallocate tries to allocate the space for given -// file. This operation is only supported on linux by a -// few filesystems (btrfs, ext4, etc.). 
-// If the operation is unsupported, no error will be returned. -// Otherwise, the error encountered will be returned. -func Preallocate(f *os.File, sizeInBytes int64, extendFile bool) error { - if sizeInBytes == 0 { - // fallocate will return EINVAL if length is 0; skip - return nil - } - if extendFile { - return preallocExtend(f, sizeInBytes) - } - return preallocFixed(f, sizeInBytes) -} - -func preallocExtendTrunc(f *os.File, sizeInBytes int64) error { - curOff, err := f.Seek(0, io.SeekCurrent) - if err != nil { - return err - } - size, err := f.Seek(sizeInBytes, io.SeekEnd) - if err != nil { - return err - } - if _, err = f.Seek(curOff, io.SeekStart); err != nil { - return err - } - if sizeInBytes > size { - return nil - } - return f.Truncate(sizeInBytes) -} diff --git a/vendor/github.com/prometheus/tsdb/fileutil/preallocate_darwin.go b/vendor/github.com/prometheus/tsdb/fileutil/preallocate_darwin.go deleted file mode 100644 index c9fa1a6c226..00000000000 --- a/vendor/github.com/prometheus/tsdb/fileutil/preallocate_darwin.go +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2015 The etcd Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package fileutil - -import ( - "os" - "syscall" - "unsafe" -) - -func preallocExtend(f *os.File, sizeInBytes int64) error { - if err := preallocFixed(f, sizeInBytes); err != nil { - return err - } - return preallocExtendTrunc(f, sizeInBytes) -} - -func preallocFixed(f *os.File, sizeInBytes int64) error { - fstore := &syscall.Fstore_t{ - Flags: syscall.F_ALLOCATEALL, - Posmode: syscall.F_PEOFPOSMODE, - Length: sizeInBytes} - p := unsafe.Pointer(fstore) - _, _, errno := syscall.Syscall(syscall.SYS_FCNTL, f.Fd(), uintptr(syscall.F_PREALLOCATE), uintptr(p)) - if errno == 0 || errno == syscall.ENOTSUP { - return nil - } - return errno -} diff --git a/vendor/github.com/prometheus/tsdb/fileutil/preallocate_linux.go b/vendor/github.com/prometheus/tsdb/fileutil/preallocate_linux.go deleted file mode 100644 index ada0462213e..00000000000 --- a/vendor/github.com/prometheus/tsdb/fileutil/preallocate_linux.go +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright 2015 The etcd Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package fileutil - -import ( - "os" - "syscall" -) - -func preallocExtend(f *os.File, sizeInBytes int64) error { - // use mode = 0 to change size - err := syscall.Fallocate(int(f.Fd()), 0, 0, sizeInBytes) - if err != nil { - errno, ok := err.(syscall.Errno) - // not supported; fallback - // fallocate EINTRs frequently in some environments; fallback - if ok && (errno == syscall.ENOTSUP || errno == syscall.EINTR) { - return preallocExtendTrunc(f, sizeInBytes) - } - } - return err -} - -func preallocFixed(f *os.File, sizeInBytes int64) error { - // use mode = 1 to keep size; see FALLOC_FL_KEEP_SIZE - err := syscall.Fallocate(int(f.Fd()), 1, 0, sizeInBytes) - if err != nil { - errno, ok := err.(syscall.Errno) - // treat not supported as nil error - if ok && errno == syscall.ENOTSUP { - return nil - } - } - return err -} diff --git a/vendor/github.com/prometheus/tsdb/fileutil/preallocate_other.go b/vendor/github.com/prometheus/tsdb/fileutil/preallocate_other.go deleted file mode 100644 index 162fbc5f782..00000000000 --- a/vendor/github.com/prometheus/tsdb/fileutil/preallocate_other.go +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright 2015 The etcd Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -// +build !linux,!darwin - -package fileutil - -import "os" - -func preallocExtend(f *os.File, sizeInBytes int64) error { - return preallocExtendTrunc(f, sizeInBytes) -} - -func preallocFixed(f *os.File, sizeInBytes int64) error { return nil } diff --git a/vendor/github.com/prometheus/tsdb/fileutil/sync.go b/vendor/github.com/prometheus/tsdb/fileutil/sync.go deleted file mode 100644 index 2e64a40880a..00000000000 --- a/vendor/github.com/prometheus/tsdb/fileutil/sync.go +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright 2016 The etcd Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build !linux,!darwin - -package fileutil - -import "os" - -// Fdatasync is a wrapper around file.Sync(). Special handling is needed on linux platform. -func Fdatasync(f *os.File) error { - return f.Sync() -} diff --git a/vendor/github.com/prometheus/tsdb/fileutil/sync_darwin.go b/vendor/github.com/prometheus/tsdb/fileutil/sync_darwin.go deleted file mode 100644 index 2af1b0f4119..00000000000 --- a/vendor/github.com/prometheus/tsdb/fileutil/sync_darwin.go +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright 2016 The etcd Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build darwin - -package fileutil - -import ( - "os" -) - -// Fdatasync on darwin platform invokes fcntl(F_FULLFSYNC) for actual persistence -// on physical drive media. -func Fdatasync(f *os.File) error { - return f.Sync() -} diff --git a/vendor/github.com/prometheus/tsdb/fileutil/sync_linux.go b/vendor/github.com/prometheus/tsdb/fileutil/sync_linux.go deleted file mode 100644 index 8b4fc8268e5..00000000000 --- a/vendor/github.com/prometheus/tsdb/fileutil/sync_linux.go +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2016 The etcd Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build linux - -package fileutil - -import ( - "os" - "syscall" -) - -// Fdatasync is similar to fsync(), but does not flush modified metadata -// unless that metadata is needed in order to allow a subsequent data retrieval -// to be correctly handled. 
-func Fdatasync(f *os.File) error { - return syscall.Fdatasync(int(f.Fd())) -} diff --git a/vendor/github.com/prometheus/tsdb/wal/live_reader.go b/vendor/github.com/prometheus/tsdb/wal/live_reader.go deleted file mode 100644 index 94175e79178..00000000000 --- a/vendor/github.com/prometheus/tsdb/wal/live_reader.go +++ /dev/null @@ -1,322 +0,0 @@ -// Copyright 2019 The Prometheus Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package wal - -import ( - "encoding/binary" - "fmt" - "hash/crc32" - "io" - - "github.com/go-kit/kit/log" - "github.com/go-kit/kit/log/level" - "github.com/golang/snappy" - "github.com/pkg/errors" - "github.com/prometheus/client_golang/prometheus" -) - -// liveReaderMetrics holds all metrics exposed by the LiveReader. -type liveReaderMetrics struct { - readerCorruptionErrors *prometheus.CounterVec -} - -// LiveReaderMetrics instatiates, registers and returns metrics to be injected -// at LiveReader instantiation. -func NewLiveReaderMetrics(reg prometheus.Registerer) *liveReaderMetrics { - m := &liveReaderMetrics{ - readerCorruptionErrors: prometheus.NewCounterVec(prometheus.CounterOpts{ - Name: "prometheus_tsdb_wal_reader_corruption_errors_total", - Help: "Errors encountered when reading the WAL.", - }, []string{"error"}), - } - - if reg != nil { - reg.Register(m.readerCorruptionErrors) - } - - return m -} - -// NewLiveReader returns a new live reader. 
-func NewLiveReader(logger log.Logger, metrics *liveReaderMetrics, r io.Reader) *LiveReader { - lr := &LiveReader{ - logger: logger, - rdr: r, - metrics: metrics, - - // Until we understand how they come about, make readers permissive - // to records spanning pages. - permissive: true, - } - - return lr -} - -// LiveReader reads WAL records from an io.Reader. It allows reading of WALs -// that are still in the process of being written, and returns records as soon -// as they can be read. -type LiveReader struct { - logger log.Logger - rdr io.Reader - err error - rec []byte - snappyBuf []byte - hdr [recordHeaderSize]byte - buf [pageSize]byte - readIndex int // Index in buf to start at for next read. - writeIndex int // Index in buf to start at for next write. - total int64 // Total bytes processed during reading in calls to Next(). - index int // Used to track partial records, should be 0 at the start of every new record. - - // For testing, we can treat EOF as a non-error. - eofNonErr bool - - // We sometime see records span page boundaries. Should never happen, but it - // does. Until we track down why, set permissive to true to tolerate it. - // NB the non-ive Reader implementation allows for this. - permissive bool - - metrics *liveReaderMetrics -} - -// Err returns any errors encountered reading the WAL. io.EOFs are not terminal -// and Next can be tried again. Non-EOFs are terminal, and the reader should -// not be used again. It is up to the user to decide when to stop trying should -// io.EOF be returned. -func (r *LiveReader) Err() error { - if r.eofNonErr && r.err == io.EOF { - return nil - } - return r.err -} - -// Offset returns the number of bytes consumed from this segment. -func (r *LiveReader) Offset() int64 { - return r.total -} - -func (r *LiveReader) fillBuffer() (int, error) { - n, err := r.rdr.Read(r.buf[r.writeIndex:len(r.buf)]) - r.writeIndex += n - return n, err -} - -// Next returns true if Record() will contain a full record. 
-// If Next returns false, you should always checked the contents of Error(). -// Return false guarantees there are no more records if the segment is closed -// and not corrupt, otherwise if Err() == io.EOF you should try again when more -// data has been written. -func (r *LiveReader) Next() bool { - for { - // If buildRecord returns a non-EOF error, its game up - the segment is - // corrupt. If buildRecord returns an EOF, we try and read more in - // fillBuffer later on. If that fails to read anything (n=0 && err=EOF), - // we return EOF and the user can try again later. If we have a full - // page, buildRecord is guaranteed to return a record or a non-EOF; it - // has checks the records fit in pages. - if ok, err := r.buildRecord(); ok { - return true - } else if err != nil && err != io.EOF { - r.err = err - return false - } - - // If we've filled the page and not found a record, this - // means records have started to span pages. Shouldn't happen - // but does and until we found out why, we need to deal with this. - if r.permissive && r.writeIndex == pageSize && r.readIndex > 0 { - copy(r.buf[:], r.buf[r.readIndex:]) - r.writeIndex -= r.readIndex - r.readIndex = 0 - continue - } - - if r.readIndex == pageSize { - r.writeIndex = 0 - r.readIndex = 0 - } - - if r.writeIndex != pageSize { - n, err := r.fillBuffer() - if n == 0 || (err != nil && err != io.EOF) { - r.err = err - return false - } - } - } -} - -// Record returns the current record. -// The returned byte slice is only valid until the next call to Next. -func (r *LiveReader) Record() []byte { - return r.rec -} - -// Rebuild a full record from potentially partial records. Returns false -// if there was an error or if we weren't able to read a record for any reason. -// Returns true if we read a full record. Any record data is appended to -// LiveReader.rec -func (r *LiveReader) buildRecord() (bool, error) { - for { - // Check that we have data in the internal buffer to read. 
- if r.writeIndex <= r.readIndex { - return false, nil - } - - // Attempt to read a record, partial or otherwise. - temp, n, err := r.readRecord() - if err != nil { - return false, err - } - - r.readIndex += n - r.total += int64(n) - if temp == nil { - return false, nil - } - - rt := recTypeFromHeader(r.hdr[0]) - if rt == recFirst || rt == recFull { - r.rec = r.rec[:0] - r.snappyBuf = r.snappyBuf[:0] - } - - compressed := r.hdr[0]&snappyMask != 0 - if compressed { - r.snappyBuf = append(r.snappyBuf, temp...) - } else { - r.rec = append(r.rec, temp...) - } - - if err := validateRecord(rt, r.index); err != nil { - r.index = 0 - return false, err - } - if rt == recLast || rt == recFull { - r.index = 0 - if compressed && len(r.snappyBuf) > 0 { - // The snappy library uses `len` to calculate if we need a new buffer. - // In order to allocate as few buffers as possible make the length - // equal to the capacity. - r.rec = r.rec[:cap(r.rec)] - r.rec, err = snappy.Decode(r.rec, r.snappyBuf) - if err != nil { - return false, err - } - } - return true, nil - } - // Only increment i for non-zero records since we use it - // to determine valid content record sequences. - r.index++ - } -} - -// Returns an error if the recType and i indicate an invalid record sequence. -// As an example, if i is > 0 because we've read some amount of a partial record -// (recFirst, recMiddle, etc. but not recLast) and then we get another recFirst or recFull -// instead of a recLast or recMiddle we would have an invalid record. 
-func validateRecord(typ recType, i int) error { - switch typ { - case recFull: - if i != 0 { - return errors.New("unexpected full record") - } - return nil - case recFirst: - if i != 0 { - return errors.New("unexpected first record, dropping buffer") - } - return nil - case recMiddle: - if i == 0 { - return errors.New("unexpected middle record, dropping buffer") - } - return nil - case recLast: - if i == 0 { - return errors.New("unexpected last record, dropping buffer") - } - return nil - default: - return errors.Errorf("unexpected record type %d", typ) - } -} - -// Read a sub-record (see recType) from the buffer. It could potentially -// be a full record (recFull) if the record fits within the bounds of a single page. -// Returns a byte slice of the record data read, the number of bytes read, and an error -// if there's a non-zero byte in a page term record or the record checksum fails. -// This is a non-method function to make it clear it does not mutate the reader. -func (r *LiveReader) readRecord() ([]byte, int, error) { - // Special case: for recPageTerm, check that are all zeros to end of page, - // consume them but don't return them. - if r.buf[r.readIndex] == byte(recPageTerm) { - // End of page won't necessarily be end of buffer, as we may have - // got misaligned by records spanning page boundaries. - // r.total % pageSize is the offset into the current page - // that r.readIndex points to in buf. Therefore - // pageSize - (r.total % pageSize) is the amount left to read of - // the current page. - remaining := int(pageSize - (r.total % pageSize)) - if r.readIndex+remaining > r.writeIndex { - return nil, 0, io.EOF - } - - for i := r.readIndex; i < r.readIndex+remaining; i++ { - if r.buf[i] != 0 { - return nil, 0, errors.New("unexpected non-zero byte in page term bytes") - } - } - - return nil, remaining, nil - } - - // Not a recPageTerm; read the record and check the checksum. 
- if r.writeIndex-r.readIndex < recordHeaderSize { - return nil, 0, io.EOF - } - - copy(r.hdr[:], r.buf[r.readIndex:r.readIndex+recordHeaderSize]) - length := int(binary.BigEndian.Uint16(r.hdr[1:])) - crc := binary.BigEndian.Uint32(r.hdr[3:]) - if r.readIndex+recordHeaderSize+length > pageSize { - if !r.permissive { - return nil, 0, fmt.Errorf("record would overflow current page: %d > %d", r.readIndex+recordHeaderSize+length, pageSize) - } - r.metrics.readerCorruptionErrors.WithLabelValues("record_span_page").Inc() - level.Warn(r.logger).Log("msg", "record spans page boundaries", "start", r.readIndex, "end", recordHeaderSize+length, "pageSize", pageSize) - } - if recordHeaderSize+length > pageSize { - return nil, 0, fmt.Errorf("record length greater than a single page: %d > %d", recordHeaderSize+length, pageSize) - } - if r.readIndex+recordHeaderSize+length > r.writeIndex { - return nil, 0, io.EOF - } - - rec := r.buf[r.readIndex+recordHeaderSize : r.readIndex+recordHeaderSize+length] - if c := crc32.Checksum(rec, castagnoliTable); c != crc { - return nil, 0, errors.Errorf("unexpected checksum %x, expected %x", c, crc) - } - - return rec, length + recordHeaderSize, nil -} - -func min(i, j int) int { - if i < j { - return i - } - return j -} diff --git a/vendor/github.com/prometheus/tsdb/wal/reader.go b/vendor/github.com/prometheus/tsdb/wal/reader.go deleted file mode 100644 index 7612f8775fa..00000000000 --- a/vendor/github.com/prometheus/tsdb/wal/reader.go +++ /dev/null @@ -1,200 +0,0 @@ -// Copyright 2019 The Prometheus Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package wal - -import ( - "encoding/binary" - "hash/crc32" - "io" - - "github.com/golang/snappy" - "github.com/pkg/errors" -) - -// Reader reads WAL records from an io.Reader. -type Reader struct { - rdr io.Reader - err error - rec []byte - snappyBuf []byte - buf [pageSize]byte - total int64 // Total bytes processed. - curRecTyp recType // Used for checking that the last record is not torn. -} - -// NewReader returns a new reader. -func NewReader(r io.Reader) *Reader { - return &Reader{rdr: r} -} - -// Next advances the reader to the next records and returns true if it exists. -// It must not be called again after it returned false. -func (r *Reader) Next() bool { - err := r.next() - if errors.Cause(err) == io.EOF { - // The last WAL segment record shouldn't be torn(should be full or last). - // The last record would be torn after a crash just before - // the last record part could be persisted to disk. - if r.curRecTyp == recFirst || r.curRecTyp == recMiddle { - r.err = errors.New("last record is torn") - } - return false - } - r.err = err - return r.err == nil -} - -func (r *Reader) next() (err error) { - // We have to use r.buf since allocating byte arrays here fails escape - // analysis and ends up on the heap, even though it seemingly should not. 
- hdr := r.buf[:recordHeaderSize] - buf := r.buf[recordHeaderSize:] - - r.rec = r.rec[:0] - r.snappyBuf = r.snappyBuf[:0] - - i := 0 - for { - if _, err = io.ReadFull(r.rdr, hdr[:1]); err != nil { - return errors.Wrap(err, "read first header byte") - } - r.total++ - r.curRecTyp = recTypeFromHeader(hdr[0]) - compressed := hdr[0]&snappyMask != 0 - - // Gobble up zero bytes. - if r.curRecTyp == recPageTerm { - // recPageTerm is a single byte that indicates the rest of the page is padded. - // If it's the first byte in a page, buf is too small and - // needs to be resized to fit pageSize-1 bytes. - buf = r.buf[1:] - - // We are pedantic and check whether the zeros are actually up - // to a page boundary. - // It's not strictly necessary but may catch sketchy state early. - k := pageSize - (r.total % pageSize) - if k == pageSize { - continue // Initial 0 byte was last page byte. - } - n, err := io.ReadFull(r.rdr, buf[:k]) - if err != nil { - return errors.Wrap(err, "read remaining zeros") - } - r.total += int64(n) - - for _, c := range buf[:k] { - if c != 0 { - return errors.New("unexpected non-zero byte in padded page") - } - } - continue - } - n, err := io.ReadFull(r.rdr, hdr[1:]) - if err != nil { - return errors.Wrap(err, "read remaining header") - } - r.total += int64(n) - - var ( - length = binary.BigEndian.Uint16(hdr[1:]) - crc = binary.BigEndian.Uint32(hdr[3:]) - ) - - if length > pageSize-recordHeaderSize { - return errors.Errorf("invalid record size %d", length) - } - n, err = io.ReadFull(r.rdr, buf[:length]) - if err != nil { - return err - } - r.total += int64(n) - - if n != int(length) { - return errors.Errorf("invalid size: expected %d, got %d", length, n) - } - if c := crc32.Checksum(buf[:length], castagnoliTable); c != crc { - return errors.Errorf("unexpected checksum %x, expected %x", c, crc) - } - - if compressed { - r.snappyBuf = append(r.snappyBuf, buf[:length]...) - } else { - r.rec = append(r.rec, buf[:length]...) 
- } - - if err := validateRecord(r.curRecTyp, i); err != nil { - return err - } - if r.curRecTyp == recLast || r.curRecTyp == recFull { - if compressed && len(r.snappyBuf) > 0 { - // The snappy library uses `len` to calculate if we need a new buffer. - // In order to allocate as few buffers as possible make the length - // equal to the capacity. - r.rec = r.rec[:cap(r.rec)] - r.rec, err = snappy.Decode(r.rec, r.snappyBuf) - return err - } - return nil - } - - // Only increment i for non-zero records since we use it - // to determine valid content record sequences. - i++ - } -} - -// Err returns the last encountered error wrapped in a corruption error. -// If the reader does not allow to infer a segment index and offset, a total -// offset in the reader stream will be provided. -func (r *Reader) Err() error { - if r.err == nil { - return nil - } - if b, ok := r.rdr.(*segmentBufReader); ok { - return &CorruptionErr{ - Err: r.err, - Dir: b.segs[b.cur].Dir(), - Segment: b.segs[b.cur].Index(), - Offset: int64(b.off), - } - } - return &CorruptionErr{ - Err: r.err, - Segment: -1, - Offset: r.total, - } -} - -// Record returns the current record. The returned byte slice is only -// valid until the next call to Next. -func (r *Reader) Record() []byte { - return r.rec -} - -// Segment returns the current segment being read. -func (r *Reader) Segment() int { - if b, ok := r.rdr.(*segmentBufReader); ok { - return b.segs[b.cur].Index() - } - return -1 -} - -// Offset returns the current position of the segment being read. 
-func (r *Reader) Offset() int64 { - if b, ok := r.rdr.(*segmentBufReader); ok { - return int64(b.off) - } - return r.total -} diff --git a/vendor/github.com/prometheus/tsdb/wal/wal.go b/vendor/github.com/prometheus/tsdb/wal/wal.go deleted file mode 100644 index 878aae6ba18..00000000000 --- a/vendor/github.com/prometheus/tsdb/wal/wal.go +++ /dev/null @@ -1,852 +0,0 @@ -// Copyright 2017 The Prometheus Authors - -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package wal - -import ( - "bufio" - "encoding/binary" - "fmt" - "hash/crc32" - "io" - "os" - "path/filepath" - "sort" - "strconv" - "sync" - "time" - - "github.com/go-kit/kit/log" - "github.com/go-kit/kit/log/level" - "github.com/golang/snappy" - "github.com/pkg/errors" - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/tsdb/fileutil" -) - -const ( - DefaultSegmentSize = 128 * 1024 * 1024 // 128 MB - pageSize = 32 * 1024 // 32KB - recordHeaderSize = 7 -) - -// The table gets initialized with sync.Once but may still cause a race -// with any other use of the crc32 package anywhere. Thus we initialize it -// before. -var castagnoliTable = crc32.MakeTable(crc32.Castagnoli) - -// page is an in memory buffer used to batch disk writes. -// Records bigger than the page size are split and flushed separately. -// A flush is triggered when a single records doesn't fit the page size or -// when the next record can't fit in the remaining free page space. 
-type page struct { - alloc int - flushed int - buf [pageSize]byte -} - -func (p *page) remaining() int { - return pageSize - p.alloc -} - -func (p *page) full() bool { - return pageSize-p.alloc < recordHeaderSize -} - -// Segment represents a segment file. -type Segment struct { - *os.File - dir string - i int -} - -// Index returns the index of the segment. -func (s *Segment) Index() int { - return s.i -} - -// Dir returns the directory of the segment. -func (s *Segment) Dir() string { - return s.dir -} - -// CorruptionErr is an error that's returned when corruption is encountered. -type CorruptionErr struct { - Dir string - Segment int - Offset int64 - Err error -} - -func (e *CorruptionErr) Error() string { - if e.Segment < 0 { - return fmt.Sprintf("corruption after %d bytes: %s", e.Offset, e.Err) - } - return fmt.Sprintf("corruption in segment %s at %d: %s", SegmentName(e.Dir, e.Segment), e.Offset, e.Err) -} - -// OpenWriteSegment opens segment k in dir. The returned segment is ready for new appends. -func OpenWriteSegment(logger log.Logger, dir string, k int) (*Segment, error) { - segName := SegmentName(dir, k) - f, err := os.OpenFile(segName, os.O_WRONLY|os.O_APPEND, 0666) - if err != nil { - return nil, err - } - stat, err := f.Stat() - if err != nil { - f.Close() - return nil, err - } - // If the last page is torn, fill it with zeros. - // In case it was torn after all records were written successfully, this - // will just pad the page and everything will be fine. - // If it was torn mid-record, a full read (which the caller should do anyway - // to ensure integrity) will detect it as a corruption by the end. 
- if d := stat.Size() % pageSize; d != 0 { - level.Warn(logger).Log("msg", "last page of the wal is torn, filling it with zeros", "segment", segName) - if _, err := f.Write(make([]byte, pageSize-d)); err != nil { - f.Close() - return nil, errors.Wrap(err, "zero-pad torn page") - } - } - return &Segment{File: f, i: k, dir: dir}, nil -} - -// CreateSegment creates a new segment k in dir. -func CreateSegment(dir string, k int) (*Segment, error) { - f, err := os.OpenFile(SegmentName(dir, k), os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0666) - if err != nil { - return nil, err - } - return &Segment{File: f, i: k, dir: dir}, nil -} - -// OpenReadSegment opens the segment with the given filename. -func OpenReadSegment(fn string) (*Segment, error) { - k, err := strconv.Atoi(filepath.Base(fn)) - if err != nil { - return nil, errors.New("not a valid filename") - } - f, err := os.Open(fn) - if err != nil { - return nil, err - } - return &Segment{File: f, i: k, dir: filepath.Dir(fn)}, nil -} - -// WAL is a write ahead log that stores records in segment files. -// It must be read from start to end once before logging new data. -// If an error occurs during read, the repair procedure must be called -// before it's safe to do further writes. -// -// Segments are written to in pages of 32KB, with records possibly split -// across page boundaries. -// Records are never split across segments to allow full segments to be -// safely truncated. It also ensures that torn writes never corrupt records -// beyond the most recent segment. -type WAL struct { - dir string - logger log.Logger - segmentSize int - mtx sync.RWMutex - segment *Segment // Active segment. - donePages int // Pages written to the segment. - page *page // Active page. - stopc chan chan struct{} - actorc chan func() - closed bool // To allow calling Close() more than once without blocking. 
- compress bool - snappyBuf []byte - - fsyncDuration prometheus.Summary - pageFlushes prometheus.Counter - pageCompletions prometheus.Counter - truncateFail prometheus.Counter - truncateTotal prometheus.Counter - currentSegment prometheus.Gauge -} - -// New returns a new WAL over the given directory. -func New(logger log.Logger, reg prometheus.Registerer, dir string, compress bool) (*WAL, error) { - return NewSize(logger, reg, dir, DefaultSegmentSize, compress) -} - -// NewSize returns a new WAL over the given directory. -// New segments are created with the specified size. -func NewSize(logger log.Logger, reg prometheus.Registerer, dir string, segmentSize int, compress bool) (*WAL, error) { - if segmentSize%pageSize != 0 { - return nil, errors.New("invalid segment size") - } - if err := os.MkdirAll(dir, 0777); err != nil { - return nil, errors.Wrap(err, "create dir") - } - if logger == nil { - logger = log.NewNopLogger() - } - w := &WAL{ - dir: dir, - logger: logger, - segmentSize: segmentSize, - page: &page{}, - actorc: make(chan func(), 100), - stopc: make(chan chan struct{}), - compress: compress, - } - registerMetrics(reg, w) - - _, j, err := w.Segments() - // Index of the Segment we want to open and write to. - writeSegmentIndex := 0 - if err != nil { - return nil, errors.Wrap(err, "get segment range") - } - // If some segments already exist create one with a higher index than the last segment. - if j != -1 { - writeSegmentIndex = j + 1 - } - - segment, err := CreateSegment(w.dir, writeSegmentIndex) - if err != nil { - return nil, err - } - - if err := w.setSegment(segment); err != nil { - return nil, err - } - - go w.run() - - return w, nil -} - -// Open an existing WAL. 
-func Open(logger log.Logger, reg prometheus.Registerer, dir string) (*WAL, error) { - if logger == nil { - logger = log.NewNopLogger() - } - w := &WAL{ - dir: dir, - logger: logger, - } - - registerMetrics(reg, w) - return w, nil -} - -func registerMetrics(reg prometheus.Registerer, w *WAL) { - w.fsyncDuration = prometheus.NewSummary(prometheus.SummaryOpts{ - Name: "prometheus_tsdb_wal_fsync_duration_seconds", - Help: "Duration of WAL fsync.", - Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001}, - }) - w.pageFlushes = prometheus.NewCounter(prometheus.CounterOpts{ - Name: "prometheus_tsdb_wal_page_flushes_total", - Help: "Total number of page flushes.", - }) - w.pageCompletions = prometheus.NewCounter(prometheus.CounterOpts{ - Name: "prometheus_tsdb_wal_completed_pages_total", - Help: "Total number of completed pages.", - }) - w.truncateFail = prometheus.NewCounter(prometheus.CounterOpts{ - Name: "prometheus_tsdb_wal_truncations_failed_total", - Help: "Total number of WAL truncations that failed.", - }) - w.truncateTotal = prometheus.NewCounter(prometheus.CounterOpts{ - Name: "prometheus_tsdb_wal_truncations_total", - Help: "Total number of WAL truncations attempted.", - }) - w.currentSegment = prometheus.NewGauge(prometheus.GaugeOpts{ - Name: "prometheus_tsdb_wal_segment_current", - Help: "WAL segment index that TSDB is currently writing to.", - }) - if reg != nil { - reg.MustRegister(w.fsyncDuration, w.pageFlushes, w.pageCompletions, w.truncateFail, w.truncateTotal, w.currentSegment) - } -} - -// CompressionEnabled returns if compression is enabled on this WAL. -func (w *WAL) CompressionEnabled() bool { - return w.compress -} - -// Dir returns the directory of the WAL. -func (w *WAL) Dir() string { - return w.dir -} - -func (w *WAL) run() { -Loop: - for { - select { - case f := <-w.actorc: - f() - case donec := <-w.stopc: - close(w.actorc) - defer close(donec) - break Loop - } - } - // Drain and process any remaining functions. 
- for f := range w.actorc { - f() - } -} - -// Repair attempts to repair the WAL based on the error. -// It discards all data after the corruption. -func (w *WAL) Repair(origErr error) error { - // We could probably have a mode that only discards torn records right around - // the corruption to preserve as data much as possible. - // But that's not generally applicable if the records have any kind of causality. - // Maybe as an extra mode in the future if mid-WAL corruptions become - // a frequent concern. - err := errors.Cause(origErr) // So that we can pick up errors even if wrapped. - - cerr, ok := err.(*CorruptionErr) - if !ok { - return errors.Wrap(origErr, "cannot handle error") - } - if cerr.Segment < 0 { - return errors.New("corruption error does not specify position") - } - level.Warn(w.logger).Log("msg", "starting corruption repair", - "segment", cerr.Segment, "offset", cerr.Offset) - - // All segments behind the corruption can no longer be used. - segs, err := listSegments(w.dir) - if err != nil { - return errors.Wrap(err, "list segments") - } - level.Warn(w.logger).Log("msg", "deleting all segments newer than corrupted segment", "segment", cerr.Segment) - - for _, s := range segs { - if w.segment.i == s.index { - // The active segment needs to be removed, - // close it first (Windows!). Can be closed safely - // as we set the current segment to repaired file - // below. - if err := w.segment.Close(); err != nil { - return errors.Wrap(err, "close active segment") - } - } - if s.index <= cerr.Segment { - continue - } - if err := os.Remove(filepath.Join(w.dir, s.name)); err != nil { - return errors.Wrapf(err, "delete segment:%v", s.index) - } - } - // Regardless of the corruption offset, no record reaches into the previous segment. - // So we can safely repair the WAL by removing the segment and re-inserting all - // its records up to the corruption. 
- level.Warn(w.logger).Log("msg", "rewrite corrupted segment", "segment", cerr.Segment) - - fn := SegmentName(w.dir, cerr.Segment) - tmpfn := fn + ".repair" - - if err := fileutil.Rename(fn, tmpfn); err != nil { - return err - } - // Create a clean segment and make it the active one. - s, err := CreateSegment(w.dir, cerr.Segment) - if err != nil { - return err - } - if err := w.setSegment(s); err != nil { - return err - } - - f, err := os.Open(tmpfn) - if err != nil { - return errors.Wrap(err, "open segment") - } - defer f.Close() - - r := NewReader(bufio.NewReader(f)) - - for r.Next() { - // Add records only up to the where the error was. - if r.Offset() >= cerr.Offset { - break - } - if err := w.Log(r.Record()); err != nil { - return errors.Wrap(err, "insert record") - } - } - // We expect an error here from r.Err(), so nothing to handle. - - // We need to pad to the end of the last page in the repaired segment - w.flushPage(true) - - // We explicitly close even when there is a defer for Windows to be - // able to delete it. The defer is in place to close it in-case there - // are errors above. - if err := f.Close(); err != nil { - return errors.Wrap(err, "close corrupted file") - } - if err := os.Remove(tmpfn); err != nil { - return errors.Wrap(err, "delete corrupted segment") - } - - // Explicitly close the the segment we just repaired to avoid issues with Windows. - s.Close() - - // We always want to start writing to a new Segment rather than an existing - // Segment, which is handled by NewSize, but earlier in Repair we're deleting - // all segments that come after the corrupted Segment. Recreate a new Segment here. - s, err = CreateSegment(w.dir, cerr.Segment+1) - if err != nil { - return err - } - if err := w.setSegment(s); err != nil { - return err - } - return nil -} - -// SegmentName builds a segment name for the directory. 
-func SegmentName(dir string, i int) string { - return filepath.Join(dir, fmt.Sprintf("%08d", i)) -} - -// NextSegment creates the next segment and closes the previous one. -func (w *WAL) NextSegment() error { - w.mtx.Lock() - defer w.mtx.Unlock() - return w.nextSegment() -} - -// nextSegment creates the next segment and closes the previous one. -func (w *WAL) nextSegment() error { - // Only flush the current page if it actually holds data. - if w.page.alloc > 0 { - if err := w.flushPage(true); err != nil { - return err - } - } - next, err := CreateSegment(w.dir, w.segment.Index()+1) - if err != nil { - return errors.Wrap(err, "create new segment file") - } - prev := w.segment - if err := w.setSegment(next); err != nil { - return err - } - - // Don't block further writes by fsyncing the last segment. - w.actorc <- func() { - if err := w.fsync(prev); err != nil { - level.Error(w.logger).Log("msg", "sync previous segment", "err", err) - } - if err := prev.Close(); err != nil { - level.Error(w.logger).Log("msg", "close previous segment", "err", err) - } - } - return nil -} - -func (w *WAL) setSegment(segment *Segment) error { - w.segment = segment - - // Correctly initialize donePages. - stat, err := segment.Stat() - if err != nil { - return err - } - w.donePages = int(stat.Size() / pageSize) - w.currentSegment.Set(float64(segment.Index())) - return nil -} - -// flushPage writes the new contents of the page to disk. If no more records will fit into -// the page, the remaining bytes will be set to zero and a new page will be started. -// If clear is true, this is enforced regardless of how many bytes are left in the page. -func (w *WAL) flushPage(clear bool) error { - w.pageFlushes.Inc() - - p := w.page - clear = clear || p.full() - - // No more data will fit into the page or an implicit clear. - // Enqueue and clear it. - if clear { - p.alloc = pageSize // Write till end of page. 
- } - n, err := w.segment.Write(p.buf[p.flushed:p.alloc]) - if err != nil { - return err - } - p.flushed += n - - // We flushed an entire page, prepare a new one. - if clear { - for i := range p.buf { - p.buf[i] = 0 - } - p.alloc = 0 - p.flushed = 0 - w.donePages++ - w.pageCompletions.Inc() - } - return nil -} - -// First Byte of header format: -// [ 4 bits unallocated] [1 bit snappy compression flag] [ 3 bit record type ] -const ( - snappyMask = 1 << 3 - recTypeMask = snappyMask - 1 -) - -type recType uint8 - -const ( - recPageTerm recType = 0 // Rest of page is empty. - recFull recType = 1 // Full record. - recFirst recType = 2 // First fragment of a record. - recMiddle recType = 3 // Middle fragments of a record. - recLast recType = 4 // Final fragment of a record. -) - -func recTypeFromHeader(header byte) recType { - return recType(header & recTypeMask) -} - -func (t recType) String() string { - switch t { - case recPageTerm: - return "zero" - case recFull: - return "full" - case recFirst: - return "first" - case recMiddle: - return "middle" - case recLast: - return "last" - default: - return "" - } -} - -func (w *WAL) pagesPerSegment() int { - return w.segmentSize / pageSize -} - -// Log writes the records into the log. -// Multiple records can be passed at once to reduce writes and increase throughput. -func (w *WAL) Log(recs ...[]byte) error { - w.mtx.Lock() - defer w.mtx.Unlock() - // Callers could just implement their own list record format but adding - // a bit of extra logic here frees them from that overhead. - for i, r := range recs { - if err := w.log(r, i == len(recs)-1); err != nil { - return err - } - } - return nil -} - -// log writes rec to the log and forces a flush of the current page if: -// - the final record of a batch -// - the record is bigger than the page size -// - the current page is full. -func (w *WAL) log(rec []byte, final bool) error { - // When the last page flush failed the page will remain full. 
- // When the page is full, need to flush it before trying to add more records to it. - if w.page.full() { - if err := w.flushPage(true); err != nil { - return err - } - } - // If the record is too big to fit within the active page in the current - // segment, terminate the active segment and advance to the next one. - // This ensures that records do not cross segment boundaries. - left := w.page.remaining() - recordHeaderSize // Free space in the active page. - left += (pageSize - recordHeaderSize) * (w.pagesPerSegment() - w.donePages - 1) // Free pages in the active segment. - - if len(rec) > left { - if err := w.nextSegment(); err != nil { - return err - } - } - - compressed := false - if w.compress && len(rec) > 0 { - // The snappy library uses `len` to calculate if we need a new buffer. - // In order to allocate as few buffers as possible make the length - // equal to the capacity. - w.snappyBuf = w.snappyBuf[:cap(w.snappyBuf)] - w.snappyBuf = snappy.Encode(w.snappyBuf, rec) - if len(w.snappyBuf) < len(rec) { - rec = w.snappyBuf - compressed = true - } - } - - // Populate as many pages as necessary to fit the record. - // Be careful to always do one pass to ensure we write zero-length records. - for i := 0; i == 0 || len(rec) > 0; i++ { - p := w.page - - // Find how much of the record we can fit into the page. 
- var ( - l = min(len(rec), (pageSize-p.alloc)-recordHeaderSize) - part = rec[:l] - buf = p.buf[p.alloc:] - typ recType - ) - - switch { - case i == 0 && len(part) == len(rec): - typ = recFull - case len(part) == len(rec): - typ = recLast - case i == 0: - typ = recFirst - default: - typ = recMiddle - } - if compressed { - typ |= snappyMask - } - - buf[0] = byte(typ) - crc := crc32.Checksum(part, castagnoliTable) - binary.BigEndian.PutUint16(buf[1:], uint16(len(part))) - binary.BigEndian.PutUint32(buf[3:], crc) - - copy(buf[recordHeaderSize:], part) - p.alloc += len(part) + recordHeaderSize - - // By definition when a record is split it means its size is bigger than - // the page boundary so the current page would be full and needs to be flushed. - // On contrary if we wrote a full record, we can fit more records of the batch - // into the page before flushing it. - if final || typ != recFull || w.page.full() { - if err := w.flushPage(false); err != nil { - return err - } - } - rec = rec[l:] - } - return nil -} - -// Segments returns the range [first, n] of currently existing segments. -// If no segments are found, first and n are -1. -func (w *WAL) Segments() (first, last int, err error) { - refs, err := listSegments(w.dir) - if err != nil { - return 0, 0, err - } - if len(refs) == 0 { - return -1, -1, nil - } - return refs[0].index, refs[len(refs)-1].index, nil -} - -// Truncate drops all segments before i. 
-func (w *WAL) Truncate(i int) (err error) { - w.truncateTotal.Inc() - defer func() { - if err != nil { - w.truncateFail.Inc() - } - }() - refs, err := listSegments(w.dir) - if err != nil { - return err - } - for _, r := range refs { - if r.index >= i { - break - } - if err = os.Remove(filepath.Join(w.dir, r.name)); err != nil { - return err - } - } - return nil -} - -func (w *WAL) fsync(f *Segment) error { - start := time.Now() - err := f.File.Sync() - w.fsyncDuration.Observe(time.Since(start).Seconds()) - return err -} - -// Close flushes all writes and closes active segment. -func (w *WAL) Close() (err error) { - w.mtx.Lock() - defer w.mtx.Unlock() - - if w.closed { - return errors.New("wal already closed") - } - - // Flush the last page and zero out all its remaining size. - // We must not flush an empty page as it would falsely signal - // the segment is done if we start writing to it again after opening. - if w.page.alloc > 0 { - if err := w.flushPage(true); err != nil { - return err - } - } - - donec := make(chan struct{}) - w.stopc <- donec - <-donec - - if err = w.fsync(w.segment); err != nil { - level.Error(w.logger).Log("msg", "sync previous segment", "err", err) - } - if err := w.segment.Close(); err != nil { - level.Error(w.logger).Log("msg", "close previous segment", "err", err) - } - w.closed = true - return nil -} - -type segmentRef struct { - name string - index int -} - -func listSegments(dir string) (refs []segmentRef, err error) { - files, err := fileutil.ReadDir(dir) - if err != nil { - return nil, err - } - var last int - for _, fn := range files { - k, err := strconv.Atoi(fn) - if err != nil { - continue - } - if len(refs) > 0 && k > last+1 { - return nil, errors.New("segments are not sequential") - } - refs = append(refs, segmentRef{name: fn, index: k}) - last = k - } - sort.Slice(refs, func(i, j int) bool { - return refs[i].index < refs[j].index - }) - return refs, nil -} - -// SegmentRange groups segments by the directory and the first and 
last index it includes. -type SegmentRange struct { - Dir string - First, Last int -} - -// NewSegmentsReader returns a new reader over all segments in the directory. -func NewSegmentsReader(dir string) (io.ReadCloser, error) { - return NewSegmentsRangeReader(SegmentRange{dir, -1, -1}) -} - -// NewSegmentsRangeReader returns a new reader over the given WAL segment ranges. -// If first or last are -1, the range is open on the respective end. -func NewSegmentsRangeReader(sr ...SegmentRange) (io.ReadCloser, error) { - var segs []*Segment - - for _, sgmRange := range sr { - refs, err := listSegments(sgmRange.Dir) - if err != nil { - return nil, errors.Wrapf(err, "list segment in dir:%v", sgmRange.Dir) - } - - for _, r := range refs { - if sgmRange.First >= 0 && r.index < sgmRange.First { - continue - } - if sgmRange.Last >= 0 && r.index > sgmRange.Last { - break - } - s, err := OpenReadSegment(filepath.Join(sgmRange.Dir, r.name)) - if err != nil { - return nil, errors.Wrapf(err, "open segment:%v in dir:%v", r.name, sgmRange.Dir) - } - segs = append(segs, s) - } - } - return NewSegmentBufReader(segs...), nil -} - -// segmentBufReader is a buffered reader that reads in multiples of pages. -// The main purpose is that we are able to track segment and offset for -// corruption reporting. We have to be careful not to increment curr too -// early, as it is used by Reader.Err() to tell Repair which segment is corrupt. -// As such we pad the end of non-page align segments with zeros. -type segmentBufReader struct { - buf *bufio.Reader - segs []*Segment - cur int // Index into segs. - off int // Offset of read data into current segment. -} - -func NewSegmentBufReader(segs ...*Segment) *segmentBufReader { - return &segmentBufReader{ - buf: bufio.NewReaderSize(segs[0], 16*pageSize), - segs: segs, - } -} - -func (r *segmentBufReader) Close() (err error) { - for _, s := range r.segs { - if e := s.Close(); e != nil { - err = e - } - } - return err -} - -// Read implements io.Reader. 
-func (r *segmentBufReader) Read(b []byte) (n int, err error) { - n, err = r.buf.Read(b) - r.off += n - - // If we succeeded, or hit a non-EOF, we can stop. - if err == nil || err != io.EOF { - return n, err - } - - // We hit EOF; fake out zero padding at the end of short segments, so we - // don't increment curr too early and report the wrong segment as corrupt. - if r.off%pageSize != 0 { - i := 0 - for ; n+i < len(b) && (r.off+i)%pageSize != 0; i++ { - b[n+i] = 0 - } - - // Return early, even if we didn't fill b. - r.off += i - return n + i, nil - } - - // There is no more deta left in the curr segment and there are no more - // segments left. Return EOF. - if r.cur+1 >= len(r.segs) { - return n, io.EOF - } - - // Move to next segment. - r.cur++ - r.off = 0 - r.buf.Reset(r.segs[r.cur]) - return n, nil -} diff --git a/vendor/modules.txt b/vendor/modules.txt index 34f3d7145d6..4af1ba7c1bc 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -440,10 +440,6 @@ github.com/prometheus/prometheus/util/teststorage github.com/prometheus/prometheus/util/testutil github.com/prometheus/prometheus/util/treecache github.com/prometheus/prometheus/web/api/v1 -# github.com/prometheus/tsdb v0.10.0 -github.com/prometheus/tsdb/errors -github.com/prometheus/tsdb/fileutil -github.com/prometheus/tsdb/wal # github.com/rs/cors v1.6.0 github.com/rs/cors # github.com/samuel/go-zookeeper v0.0.0-20190810000440-0ceca61e4d75 From ad3476ef0867d8a0013c36b6c9204fbdbbb86046 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Fri, 25 Oct 2019 15:40:30 +0200 Subject: [PATCH 36/56] Avoid flushing on shutdown Signed-off-by: Ganesh Vernekar --- pkg/ingester/ingester.go | 2 +- pkg/ring/lifecycler.go | 18 ++++++++++-------- pkg/ring/lifecycler_test.go | 10 +++++----- pkg/ruler/ruler.go | 2 +- 4 files changed, 17 insertions(+), 15 deletions(-) diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index 5b3bdefee54..a81ef69134e 100644 --- a/pkg/ingester/ingester.go +++ 
b/pkg/ingester/ingester.go @@ -204,7 +204,7 @@ func New(cfg Config, clientConfig client.Config, limits *validation.Overrides, c return nil, err } - i.lifecycler, err = ring.NewLifecycler(cfg.LifecyclerConfig, i, "ingester") + i.lifecycler, err = ring.NewLifecycler(cfg.LifecyclerConfig, i, "ingester", !cfg.WALConfig.walEnabled) if err != nil { return nil, err diff --git a/pkg/ring/lifecycler.go b/pkg/ring/lifecycler.go index 669ab94989b..8bcf39b8daf 100644 --- a/pkg/ring/lifecycler.go +++ b/pkg/ring/lifecycler.go @@ -115,9 +115,10 @@ type Lifecycler struct { actorChan chan func() // These values are initialised at startup, and never change - ID string - Addr string - RingName string + ID string + Addr string + RingName string + flushOnShutdown bool // We need to remember the ingester state just in case consul goes away and comes // back empty. And it changes during lifecycle of ingester. @@ -132,7 +133,7 @@ type Lifecycler struct { } // NewLifecycler makes and starts a new Lifecycler. -func NewLifecycler(cfg LifecyclerConfig, flushTransferer FlushTransferer, name string) (*Lifecycler, error) { +func NewLifecycler(cfg LifecyclerConfig, flushTransferer FlushTransferer, name string, flushOnShutdown bool) (*Lifecycler, error) { addr := cfg.Addr if addr == "" { var err error @@ -156,9 +157,10 @@ func NewLifecycler(cfg LifecyclerConfig, flushTransferer FlushTransferer, name s flushTransferer: flushTransferer, KVStore: store, - Addr: fmt.Sprintf("%s:%d", addr, port), - ID: cfg.ID, - RingName: name, + Addr: fmt.Sprintf("%s:%d", addr, port), + ID: cfg.ID, + RingName: name, + flushOnShutdown: flushOnShutdown, quit: make(chan struct{}), actorChan: make(chan func()), @@ -479,7 +481,7 @@ func (i *Lifecycler) changeState(ctx context.Context, state IngesterState) error } func (i *Lifecycler) processShutdown(ctx context.Context) { - flushRequired := true + flushRequired := i.flushOnShutdown transferStart := time.Now() if err := i.flushTransferer.TransferOut(ctx); err != nil { 
level.Error(util.Logger).Log("msg", "Failed to transfer chunks to another ingester", "err", err) diff --git a/pkg/ring/lifecycler_test.go b/pkg/ring/lifecycler_test.go index 50c8a84373e..6e700d475de 100644 --- a/pkg/ring/lifecycler_test.go +++ b/pkg/ring/lifecycler_test.go @@ -68,7 +68,7 @@ func TestRingNormaliseMigration(t *testing.T) { lifecyclerConfig1 := testLifecyclerConfig(ringConfig, "ing1") ft := &flushTransferer{} - l1, err := NewLifecycler(lifecyclerConfig1, ft, "ingester") + l1, err := NewLifecycler(lifecyclerConfig1, ft, "ingester", true) require.NoError(t, err) // Check this ingester joined, is active, and has one token. @@ -85,7 +85,7 @@ func TestRingNormaliseMigration(t *testing.T) { lifecyclerConfig2.JoinAfter = 100 * time.Second lifecyclerConfig2.NormaliseTokens = true - l2, err := NewLifecycler(lifecyclerConfig2, &flushTransferer{}, "ingester") + l2, err := NewLifecycler(lifecyclerConfig2, &flushTransferer{}, "ingester", true) require.NoError(t, err) // This will block until l1 has successfully left the ring. @@ -122,7 +122,7 @@ func TestRingRestart(t *testing.T) { // Add an 'ingester' with normalised tokens. lifecyclerConfig1 := testLifecyclerConfig(ringConfig, "ing1") lifecyclerConfig1.NormaliseTokens = true - l1, err := NewLifecycler(lifecyclerConfig1, &nopFlushTransferer{}, "ingester") + l1, err := NewLifecycler(lifecyclerConfig1, &nopFlushTransferer{}, "ingester", true) require.NoError(t, err) // Check this ingester joined, is active, and has one token. 
@@ -135,7 +135,7 @@ func TestRingRestart(t *testing.T) { token := l1.tokens[0] // Add a second ingester with the same settings, so it will think it has restarted - l2, err := NewLifecycler(lifecyclerConfig1, &nopFlushTransferer{}, "ingester") + l2, err := NewLifecycler(lifecyclerConfig1, &nopFlushTransferer{}, "ingester", true) require.NoError(t, err) // Check the new ingester picked up the same token @@ -195,7 +195,7 @@ func TestCheckReady(t *testing.T) { defer r.Stop() cfg := testLifecyclerConfig(ringConfig, "ring1") cfg.MinReadyDuration = 1 * time.Nanosecond - l1, err := NewLifecycler(cfg, &nopFlushTransferer{}, "ingester") + l1, err := NewLifecycler(cfg, &nopFlushTransferer{}, "ingester", true) l1.setTokens([]uint32{1}) require.NoError(t, err) diff --git a/pkg/ruler/ruler.go b/pkg/ruler/ruler.go index 9f99ed12cca..7522dc318f3 100644 --- a/pkg/ruler/ruler.go +++ b/pkg/ruler/ruler.go @@ -156,7 +156,7 @@ func NewRuler(cfg Config, engine *promql.Engine, queryable storage.Queryable, d // If sharding is enabled, create/join a ring to distribute tokens to // the ruler if cfg.EnableSharding { - ruler.lifecycler, err = ring.NewLifecycler(cfg.LifecyclerConfig, ruler, "ruler") + ruler.lifecycler, err = ring.NewLifecycler(cfg.LifecyclerConfig, ruler, "ruler", true) if err != nil { return nil, err } From 251bde5b22a5a41d8d1abc2c319122c2a4d2e439 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Mon, 11 Nov 2019 16:46:37 +0100 Subject: [PATCH 37/56] Fix after rebase Signed-off-by: Ganesh Vernekar --- pkg/ingester/ingester.go | 4 +++- pkg/ingester/ingester_v2.go | 2 +- pkg/ingester/user_state.go | 2 -- pkg/ingester/wal.go | 2 +- pkg/ring/lifecycler_test.go | 4 ++-- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index cfb6005a3ef..969e1a31445 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -269,7 +269,9 @@ func (i *Ingester) Shutdown() { close(i.quit) i.done.Wait() - i.wal.Stop() + if 
i.wal != nil { + i.wal.Stop() + } // Next initiate our graceful exit from the ring. i.lifecycler.Shutdown() diff --git a/pkg/ingester/ingester_v2.go b/pkg/ingester/ingester_v2.go index ff898b73473..69f2471af79 100644 --- a/pkg/ingester/ingester_v2.go +++ b/pkg/ingester/ingester_v2.go @@ -51,7 +51,7 @@ func NewV2(cfg Config, clientConfig client.Config, limits *validation.Overrides, }, } - i.lifecycler, err = ring.NewLifecycler(cfg.LifecyclerConfig, i, "ingester") + i.lifecycler, err = ring.NewLifecycler(cfg.LifecyclerConfig, i, "ingester", true) if err != nil { return nil, err } diff --git a/pkg/ingester/user_state.go b/pkg/ingester/user_state.go index ced23c8997e..6455aa14187 100644 --- a/pkg/ingester/user_state.go +++ b/pkg/ingester/user_state.go @@ -209,7 +209,6 @@ func (u *userState) createSeriesWithFingerprint(fp model.Fingerprint, metric lab if !recovery { if err := u.limiter.AssertMaxSeriesPerUser(u.userID, u.fpToSeries.length()); err != nil { - u.fpLocker.Unlock(fp) u.discardedSamples.WithLabelValues(perUserSeriesLimit).Inc() return nil, httpgrpc.Errorf(http.StatusTooManyRequests, err.Error()) } @@ -223,7 +222,6 @@ func (u *userState) createSeriesWithFingerprint(fp model.Fingerprint, metric lab if !recovery { // Check if the per-metric limit has been exceeded if err = u.canAddSeriesFor(string(metricName)); err != nil { - u.fpLocker.Unlock(fp) u.discardedSamples.WithLabelValues(perMetricSeriesLimit).Inc() return nil, httpgrpc.Errorf(http.StatusTooManyRequests, "%s for: %s", err.Error(), metric) } diff --git a/pkg/ingester/wal.go b/pkg/ingester/wal.go index 7f710823de3..fb28143a211 100644 --- a/pkg/ingester/wal.go +++ b/pkg/ingester/wal.go @@ -342,7 +342,7 @@ func (w *walWrapper) checkpointSeries(cp *wal.WAL, userID string, fp model.Finge func recoverFromWAL(ingester *Ingester) (err error) { walDir := ingester.cfg.WALConfig.dir // Use a local userStates, so we don't need to worry about locking. 
- userStates := newUserStates(ingester.limits, ingester.cfg) + userStates := newUserStates(ingester.limiter, ingester.cfg) lastCheckpointDir, idx, err := lastCheckpoint(walDir) if err != nil { diff --git a/pkg/ring/lifecycler_test.go b/pkg/ring/lifecycler_test.go index 8faac3b1bef..65c912e5318 100644 --- a/pkg/ring/lifecycler_test.go +++ b/pkg/ring/lifecycler_test.go @@ -118,7 +118,7 @@ func TestLifecycler_HealthyInstancesCount(t *testing.T) { lifecyclerConfig1.HeartbeatPeriod = 100 * time.Millisecond lifecyclerConfig1.JoinAfter = 100 * time.Millisecond - lifecycler1, err := NewLifecycler(lifecyclerConfig1, &flushTransferer{}, "ingester") + lifecycler1, err := NewLifecycler(lifecyclerConfig1, &flushTransferer{}, "ingester", true) require.NoError(t, err) assert.Equal(t, 0, lifecycler1.HealthyInstancesCount()) @@ -134,7 +134,7 @@ func TestLifecycler_HealthyInstancesCount(t *testing.T) { lifecyclerConfig2.HeartbeatPeriod = 100 * time.Millisecond lifecyclerConfig2.JoinAfter = 100 * time.Millisecond - lifecycler2, err := NewLifecycler(lifecyclerConfig2, &flushTransferer{}, "ingester") + lifecycler2, err := NewLifecycler(lifecyclerConfig2, &flushTransferer{}, "ingester", true) require.NoError(t, err) assert.Equal(t, 0, lifecycler2.HealthyInstancesCount()) From 88009b9a165321d36c1b18d81424550b562cb36e Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Wed, 13 Nov 2019 14:13:19 -0500 Subject: [PATCH 38/56] Fix bug of resetting userStates Signed-off-by: Ganesh Vernekar --- pkg/ingester/ingester.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index 969e1a31445..d78e5f2bf35 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -209,6 +209,7 @@ func New(cfg Config, clientConfig client.Config, limits *validation.Overrides, c } var err error + i.limiter = NewSeriesLimiter(limits, i.lifecycler, cfg.LifecyclerConfig.RingConfig.ReplicationFactor, cfg.ShardByAllLabels) i.wal, err = 
newWAL(cfg.WALConfig, i) if err != nil { @@ -216,14 +217,14 @@ func New(cfg Config, clientConfig client.Config, limits *validation.Overrides, c } i.lifecycler, err = ring.NewLifecycler(cfg.LifecyclerConfig, i, "ingester", !cfg.WALConfig.walEnabled) - if err != nil { return nil, err } // Init the limter and instantiate the user states which depend on it - i.limiter = NewSeriesLimiter(limits, i.lifecycler, cfg.LifecyclerConfig.RingConfig.ReplicationFactor, cfg.ShardByAllLabels) - i.userStates = newUserStates(i.limiter, cfg) + if i.userStates == nil { + i.userStates = newUserStates(i.limiter, cfg) + } // Now that user states have been created, we can start the lifecycler i.lifecycler.Start() From 5dfe853e491c9f6cb4f104f52ced31accb36f249 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Thu, 14 Nov 2019 09:46:46 -0500 Subject: [PATCH 39/56] Fix review comments Signed-off-by: Ganesh Vernekar --- pkg/ingester/ingester.go | 11 +++++++---- pkg/ingester/wal.go | 15 +++++++++++---- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index d78e5f2bf35..17e687ee2d4 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -209,20 +209,23 @@ func New(cfg Config, clientConfig client.Config, limits *validation.Overrides, c } var err error - i.limiter = NewSeriesLimiter(limits, i.lifecycler, cfg.LifecyclerConfig.RingConfig.ReplicationFactor, cfg.ShardByAllLabels) - i.wal, err = newWAL(cfg.WALConfig, i) + // During WAL recovery, it will create new user states which requires the limiter. + // Hence initialise the limiter before creating the WAL. 
+ i.lifecycler, err = ring.NewLifecycler(cfg.LifecyclerConfig, i, "ingester", !cfg.WALConfig.walEnabled) if err != nil { return nil, err } + i.limiter = NewSeriesLimiter(limits, i.lifecycler, cfg.LifecyclerConfig.RingConfig.ReplicationFactor, cfg.ShardByAllLabels) - i.lifecycler, err = ring.NewLifecycler(cfg.LifecyclerConfig, i, "ingester", !cfg.WALConfig.walEnabled) + i.wal, err = newWAL(cfg.WALConfig, i) if err != nil { + i.lifecycler.Shutdown() return nil, err } - // Init the limter and instantiate the user states which depend on it if i.userStates == nil { + // If userStates is nil, it means that the WAL recovery did not happen. i.userStates = newUserStates(i.limiter, cfg) } diff --git a/pkg/ingester/wal.go b/pkg/ingester/wal.go index fb28143a211..82ab293be08 100644 --- a/pkg/ingester/wal.go +++ b/pkg/ingester/wal.go @@ -76,6 +76,9 @@ type walWrapper struct { checkpointCreationTotal prometheus.Counter } +// newWAL creates a WAL object. +// * If the WAL is disabled, then the returned WAL is a no-op WAL. +// * If WAL recovery is enabled, then the userStates is always set for ingester. func newWAL(cfg WALConfig, ingester *Ingester) (WAL, error) { if cfg.recover { level.Info(util.Logger).Log("msg", "recovering from WAL") @@ -344,6 +347,14 @@ func recoverFromWAL(ingester *Ingester) (err error) { // Use a local userStates, so we don't need to worry about locking. 
userStates := newUserStates(ingester.limiter, ingester.cfg) + defer func() { + if err == nil { + ingester.userStatesMtx.Lock() + ingester.userStates = userStates + ingester.userStatesMtx.Unlock() + } + }() + lastCheckpointDir, idx, err := lastCheckpoint(walDir) if err != nil { return err @@ -383,10 +394,6 @@ func recoverFromWAL(ingester *Ingester) (err error) { elapsed := time.Since(start) level.Info(util.Logger).Log("msg", "recovered from segments", "time", elapsed.String()) - ingester.userStatesMtx.Lock() - ingester.userStates = userStates - ingester.userStatesMtx.Unlock() - return nil } From c55cedc61f2f5fb2eebd84ab2a84e4fea44ce43c Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Sun, 17 Nov 2019 19:29:52 -0500 Subject: [PATCH 40/56] Small enhancements Signed-off-by: Ganesh Vernekar --- pkg/ingester/wal.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pkg/ingester/wal.go b/pkg/ingester/wal.go index 82ab293be08..f6637774d73 100644 --- a/pkg/ingester/wal.go +++ b/pkg/ingester/wal.go @@ -158,6 +158,10 @@ func (w *walWrapper) Log(record *Record) error { func (w *walWrapper) run() { defer w.wait.Done() + if !w.cfg.checkpointEnabled { + return + } + ticker := time.NewTicker(w.cfg.checkpointDuration) defer ticker.Stop() From 902917f877a705546dd9f68f8100074d493a5c9e Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Thu, 5 Dec 2019 15:11:52 +0530 Subject: [PATCH 41/56] Update comments Signed-off-by: Ganesh Vernekar --- pkg/ingester/ingester.go | 3 ++- pkg/ingester/wal.go | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index 14a363facb7..728aa9f91a2 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -212,6 +212,7 @@ func New(cfg Config, clientConfig client.Config, limits *validation.Overrides, c // During WAL recovery, it will create new user states which requires the limiter. // Hence initialise the limiter before creating the WAL. 
+ // The '!cfg.WALConfig.walEnabled' argument says don't flush on shutdown if the WAL is enabled. i.lifecycler, err = ring.NewLifecycler(cfg.LifecyclerConfig, i, "ingester", !cfg.WALConfig.walEnabled) if err != nil { return nil, err @@ -224,8 +225,8 @@ func New(cfg Config, clientConfig client.Config, limits *validation.Overrides, c return nil, err } + // If the WAL recover happened, then the userStates would already be set. if i.userStates == nil { - // If userStates is nil, it means that the WAL recovery did not happen. i.userStates = newUserStates(i.limiter, cfg) } diff --git a/pkg/ingester/wal.go b/pkg/ingester/wal.go index f6637774d73..facb86bb8ed 100644 --- a/pkg/ingester/wal.go +++ b/pkg/ingester/wal.go @@ -148,6 +148,9 @@ func (w *walWrapper) Stop() { } func (w *walWrapper) Log(record *Record) error { + if record == nil { + return nil + } buf, err := proto.Marshal(record) if err != nil { return err From 4b0d5787e5319470df962b97787258783d0fae67 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Thu, 5 Dec 2019 17:29:33 +0530 Subject: [PATCH 42/56] Remove ingester<->WAL circular dependancy Signed-off-by: Ganesh Vernekar --- pkg/ingester/ingester.go | 17 +++++++++++++---- pkg/ingester/wal.go | 37 ++++++++++++++++--------------------- 2 files changed, 29 insertions(+), 25 deletions(-) diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index 728aa9f91a2..a5a15d2123e 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -219,10 +219,14 @@ func New(cfg Config, clientConfig client.Config, limits *validation.Overrides, c } i.limiter = NewSeriesLimiter(limits, i.lifecycler, cfg.LifecyclerConfig.RingConfig.ReplicationFactor, cfg.ShardByAllLabels) - i.wal, err = newWAL(cfg.WALConfig, i) - if err != nil { - i.lifecycler.Shutdown() - return nil, err + if cfg.WALConfig.recover { + level.Info(util.Logger).Log("msg", "recovering from WAL") + start := time.Now() + if err := recoverFromWAL(i); err != nil { + return nil, err + } + elapsed := 
time.Since(start) + level.Info(util.Logger).Log("msg", "recovery from WAL completed", "time", elapsed.String()) } // If the WAL recover happened, then the userStates would already be set. @@ -230,6 +234,11 @@ func New(cfg Config, clientConfig client.Config, limits *validation.Overrides, c i.userStates = newUserStates(i.limiter, cfg) } + i.wal, err = newWAL(cfg.WALConfig, i.userStates.cp) + if err != nil { + return nil, err + } + // Now that user states have been created, we can start the lifecycler i.lifecycler.Start() diff --git a/pkg/ingester/wal.go b/pkg/ingester/wal.go index facb86bb8ed..9ab26984238 100644 --- a/pkg/ingester/wal.go +++ b/pkg/ingester/wal.go @@ -62,12 +62,12 @@ func (noop) Log(*Record) error { func (noop) Stop() {} type walWrapper struct { - cfg WALConfig - ingester *Ingester - quit chan struct{} - wait sync.WaitGroup + cfg WALConfig + quit chan struct{} + wait sync.WaitGroup - wal *wal.WAL + wal *wal.WAL + getUserStates func() map[string]*userState // Checkpoint metrics. checkpointDeleteFail prometheus.Counter @@ -79,17 +79,7 @@ type walWrapper struct { // newWAL creates a WAL object. // * If the WAL is disabled, then the returned WAL is a no-op WAL. // * If WAL recovery is enabled, then the userStates is always set for ingester. 
-func newWAL(cfg WALConfig, ingester *Ingester) (WAL, error) { - if cfg.recover { - level.Info(util.Logger).Log("msg", "recovering from WAL") - start := time.Now() - if err := recoverFromWAL(ingester); err != nil { - return nil, err - } - elapsed := time.Since(start) - level.Info(util.Logger).Log("msg", "recovery from WAL completed", "time", elapsed.String()) - } - +func newWAL(cfg WALConfig, userStatesFunc func() map[string]*userState) (WAL, error) { if !cfg.walEnabled { return &noop{}, nil } @@ -104,10 +94,10 @@ func newWAL(cfg WALConfig, ingester *Ingester) (WAL, error) { } w := &walWrapper{ - cfg: cfg, - ingester: ingester, - quit: make(chan struct{}), - wal: tsdbWAL, + cfg: cfg, + quit: make(chan struct{}), + wal: tsdbWAL, + getUserStates: userStatesFunc, } w.checkpointDeleteFail = prometheus.NewCounter(prometheus.CounterOpts{ @@ -203,6 +193,7 @@ func (w *walWrapper) checkpoint() (err error) { if !w.cfg.checkpointEnabled { return nil } + w.checkpointCreationTotal.Inc() defer func() { if err != nil { @@ -210,6 +201,10 @@ func (w *walWrapper) checkpoint() (err error) { } }() + if w.getUserStates == nil { + return errors.New("function to get user states not initialised") + } + _, lastSegment, err := w.wal.Segments() if err != nil { return err @@ -239,7 +234,7 @@ func (w *walWrapper) checkpoint() (err error) { }() var wireChunkBuf []client.Chunk - for userID, state := range w.ingester.userStates.cp() { + for userID, state := range w.getUserStates() { for pair := range state.fpToSeries.iter() { state.fpLocker.Lock(pair.fp) wireChunkBuf, err = w.checkpointSeries(cp, userID, pair.fp, pair.series, wireChunkBuf) From d79995100b40005a9e7cd346f8992932fc870074 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Fri, 6 Dec 2019 14:57:24 +0530 Subject: [PATCH 43/56] Change segment size of the WAL Signed-off-by: Ganesh Vernekar --- pkg/ingester/wal.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/ingester/wal.go b/pkg/ingester/wal.go index 
9ab26984238..a69cfc88fec 100644 --- a/pkg/ingester/wal.go +++ b/pkg/ingester/wal.go @@ -88,7 +88,7 @@ func newWAL(cfg WALConfig, userStatesFunc func() map[string]*userState) (WAL, er if cfg.metricsRegisterer != nil { walRegistry = prometheus.WrapRegistererWith(prometheus.Labels{"kind": "wal"}, cfg.metricsRegisterer) } - tsdbWAL, err := wal.New(util.Logger, walRegistry, cfg.dir, true) + tsdbWAL, err := wal.NewSize(util.Logger, walRegistry, cfg.dir, wal.DefaultSegmentSize/4, true) if err != nil { return nil, err } From 69e0a48592f3938cd8d1ce416fc5d6b4d8e68324 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Fri, 6 Dec 2019 17:01:52 +0530 Subject: [PATCH 44/56] Use same directory for temporary tokens file Signed-off-by: Ganesh Vernekar --- pkg/ring/tokens.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/ring/tokens.go b/pkg/ring/tokens.go index b8ec5e323b9..729b3a9c367 100644 --- a/pkg/ring/tokens.go +++ b/pkg/ring/tokens.go @@ -22,7 +22,7 @@ func (t Tokens) StoreToFile(tokenFilePath string) error { // If any operations failed further in the function, we keep the temporary // file hanging around for debugging. 
- f, err := ioutil.TempFile(os.TempDir(), "tokens") + f, err := os.Create(tokenFilePath + ".tmp") if err != nil { return err } From 14218bf50c7c5da89caaa9f7a42b01d479e01395 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Wed, 11 Dec 2019 13:21:51 +0530 Subject: [PATCH 45/56] Disable transfer out when WAL is enabled Signed-off-by: Ganesh Vernekar --- pkg/ingester/ingester.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index a5a15d2123e..bebe5774504 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -198,6 +198,13 @@ func New(cfg Config, clientConfig client.Config, limits *validation.Overrides, c return NewV2(cfg, clientConfig, limits, chunkStore, registerer) } + if cfg.WALConfig.walEnabled { + // If WAL is enabled, we don't transfer out the data to any ingester. + // Either the next ingester which takes its place should recover from WAL + // or the data has to be flushed during scaledown. + cfg.MaxTransferRetries = 0 + } + i := &Ingester{ cfg: cfg, clientConfig: clientConfig, From 3f315269cd1c1238118014a9413beb4654e707d8 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Wed, 11 Dec 2019 18:05:33 +0530 Subject: [PATCH 46/56] Flush on shutdown endpoint irrespective of WAL Signed-off-by: Ganesh Vernekar --- pkg/ingester/ingester.go | 4 ++++ pkg/ring/lifecycler.go | 11 +++++++++++ 2 files changed, 15 insertions(+) diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index bebe5774504..4fc4e071c18 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -308,7 +308,11 @@ func (i *Ingester) Shutdown() { // * Change the state of ring to stop accepting writes. // * Flush all the chunks. func (i *Ingester) ShutdownHandler(w http.ResponseWriter, r *http.Request) { + originalState := i.lifecycler.FlushOnShutdown() + // We want to flush the chunks if transfer fails irrespective of original flag.
+ i.lifecycler.SetFlushOnShutdown(true) i.Shutdown() + i.lifecycler.SetFlushOnShutdown(originalState) w.WriteHeader(http.StatusNoContent) } diff --git a/pkg/ring/lifecycler.go b/pkg/ring/lifecycler.go index 2537f328996..f85173e2194 100644 --- a/pkg/ring/lifecycler.go +++ b/pkg/ring/lifecycler.go @@ -686,6 +686,17 @@ func (i *Lifecycler) updateCounters(ringDesc *Desc) { i.countersLock.Unlock() } +// FlushOnShutdown returns if flushing is enabled if transfer fails on a shutdown. +func (i *Lifecycler) FlushOnShutdown() bool { + return i.flushOnShutdown +} + +// SetFlushOnShutdown enables/disables flush on shutdown if transfer fails. +// Passing 'true' enables it, and 'false' disabled it. +func (i *Lifecycler) SetFlushOnShutdown(flushOnShutdown bool) { + i.flushOnShutdown = flushOnShutdown +} + func (i *Lifecycler) processShutdown(ctx context.Context) { flushRequired := i.flushOnShutdown transferStart := time.Now() From 7e6f5ded74efc47f2059dbd6bc466fca47709217 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Mon, 16 Dec 2019 16:08:43 +0530 Subject: [PATCH 47/56] Use sync.Pool for the records Signed-off-by: Ganesh Vernekar --- pkg/ingester/ingester.go | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index c513c0c4130..82b00d4d9d4 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -35,6 +35,11 @@ const ( queryStreamBatchSize = 128 ) +var ( + // This is initialised if the WAL is enabled and the records are fetched from this pool. + recordPool sync.Pool +) + type ingesterMetrics struct { flushQueueLength prometheus.Gauge ingestedSamples prometheus.Counter @@ -203,6 +208,12 @@ func New(cfg Config, clientConfig client.Config, limits *validation.Overrides, c // Either the next ingester which takes it's place should recover from WAL // or the data has to be flushed during scaledown. 
cfg.MaxTransferRetries = 0 + + recordPool = sync.Pool{ + New: func() interface{} { + return &Record{} + }, + } } i := &Ingester{ @@ -335,9 +346,15 @@ func (i *Ingester) Push(ctx old_ctx.Context, req *client.WriteRequest) (*client. var record *Record if i.cfg.WALConfig.walEnabled { - record = &Record{ - UserId: userID, - Samples: make([]Sample, 0, len(req.Timeseries)), + record = recordPool.Get().(*Record) + record.UserId = userID + // Assuming there is not much churn in most cases, there is no use + // keeping the record.Labels slice hanging around. + record.Labels = nil + if cap(record.Samples) < len(req.Timeseries) { + record.Samples = make([]Sample, 0, len(req.Timeseries)) + } else { + record.Samples = record.Samples[:0] } } @@ -363,8 +380,11 @@ func (i *Ingester) Push(ctx old_ctx.Context, req *client.WriteRequest) (*client. } client.ReuseSlice(req.Timeseries) - if err := i.wal.Log(record); err != nil { - return nil, err + if record != nil { + if err := i.wal.Log(record); err != nil { + return nil, err + } + recordPool.Put(record) } return &client.WriteResponse{}, lastPartialErr From f2d0b299e887099b90ecb6c60e4aedf6646dc308 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Wed, 18 Dec 2019 17:49:15 +0530 Subject: [PATCH 48/56] Fix Goutham's comments Signed-off-by: Ganesh Vernekar --- pkg/ingester/ingester.go | 5 +- pkg/ingester/ingester_v2.go | 2 +- pkg/ingester/wal.go | 174 +++++++++++++++++++++++++----------- 3 files changed, 125 insertions(+), 56 deletions(-) diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index 82b00d4d9d4..b0eb34d47ed 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -179,6 +179,7 @@ type Ingester struct { flushQueues []*util.PriorityQueue flushQueuesDone sync.WaitGroup + // This should never be nil. wal WAL // Hook for injecting behaviour from tests. 
@@ -305,9 +306,7 @@ func (i *Ingester) Shutdown() { close(i.quit) i.done.Wait() - if i.wal != nil { - i.wal.Stop() - } + i.wal.Stop() // Next initiate our graceful exit from the ring. i.lifecycler.Shutdown() diff --git a/pkg/ingester/ingester_v2.go b/pkg/ingester/ingester_v2.go index 8133cb8b1a5..237fdab8f6a 100644 --- a/pkg/ingester/ingester_v2.go +++ b/pkg/ingester/ingester_v2.go @@ -56,7 +56,7 @@ func NewV2(cfg Config, clientConfig client.Config, limits *validation.Overrides, limits: limits, chunkStore: nil, quit: make(chan struct{}), - + wal: &noopWAL{}, TSDBState: TSDBState{ dbs: make(map[string]*tsdb.DB), bucket: bucketClient, diff --git a/pkg/ingester/wal.go b/pkg/ingester/wal.go index a69cfc88fec..29b584e7df1 100644 --- a/pkg/ingester/wal.go +++ b/pkg/ingester/wal.go @@ -5,6 +5,7 @@ import ( "fmt" "io" "io/ioutil" + "math" "os" "path/filepath" "runtime" @@ -41,25 +42,22 @@ func (cfg *WALConfig) RegisterFlags(f *flag.FlagSet) { f.BoolVar(&cfg.walEnabled, "ingester.wal-enable", false, "Enable the WAL.") f.BoolVar(&cfg.checkpointEnabled, "ingester.checkpoint-enable", false, "Enable checkpointing.") f.BoolVar(&cfg.recover, "ingester.recover-from-wal", false, "Recover data from existing WAL.") - f.StringVar(&cfg.dir, "ingester.wal-dir", "", "Directory to store the WAL.") + f.StringVar(&cfg.dir, "ingester.wal-dir", "wal", "Directory to store the WAL.") f.DurationVar(&cfg.checkpointDuration, "ingester.checkpoint-duration", 1*time.Hour, "Duration over which to checkpoint.") } // WAL interface allows us to have a no-op WAL when the WAL is disabled. type WAL interface { - Log(record *Record) error + // Log marshalls the records and writes it into the WAL. + Log(*Record) error + // Stop stops all the WAL operations. Stop() } -type noop struct{} +type noopWAL struct{} -// Log a Record to the WAL. -func (noop) Log(*Record) error { - return nil -} - -// Stop any background WAL processes. 
-func (noop) Stop() {} +func (noopWAL) Log(*Record) error { return nil } +func (noopWAL) Stop() {} type walWrapper struct { cfg WALConfig @@ -81,7 +79,7 @@ type walWrapper struct { // * If WAL recovery is enabled, then the userStates is always set for ingester. func newWAL(cfg WALConfig, userStatesFunc func() map[string]*userState) (WAL, error) { if !cfg.walEnabled { - return &noop{}, nil + return &noopWAL{}, nil } var walRegistry prometheus.Registerer @@ -138,14 +136,19 @@ func (w *walWrapper) Stop() { } func (w *walWrapper) Log(record *Record) error { - if record == nil { + select { + case <-w.quit: return nil + default: + if record == nil { + return nil + } + buf, err := proto.Marshal(record) + if err != nil { + return err + } + return w.wal.Log(buf) } - buf, err := proto.Marshal(record) - if err != nil { - return err - } - return w.wal.Log(buf) } func (w *walWrapper) run() { @@ -163,14 +166,14 @@ func (w *walWrapper) run() { case <-ticker.C: start := time.Now() level.Info(util.Logger).Log("msg", "starting checkpoint") - if err := w.checkpoint(); err != nil { + if err := w.performCheckpoint(); err != nil { level.Error(util.Logger).Log("msg", "error checkpointing series", "err", err) continue } elapsed := time.Since(start) level.Info(util.Logger).Log("msg", "checkpoint done", "time", elapsed.String()) case <-w.quit: - if err := w.checkpoint(); err != nil { + if err := w.performCheckpoint(); err != nil { level.Error(util.Logger).Log("msg", "error checkpointing series during shutdown", "err", err) } return @@ -189,7 +192,7 @@ func (w *walWrapper) isStopped() bool { const checkpointPrefix = "checkpoint." 
-func (w *walWrapper) checkpoint() (err error) { +func (w *walWrapper) performCheckpoint() (err error) { if !w.cfg.checkpointEnabled { return nil } @@ -215,29 +218,29 @@ func (w *walWrapper) checkpoint() (err error) { return err } - newIdx := lastCh + 1 - - cpdir := filepath.Join(w.wal.Dir(), fmt.Sprintf(checkpointPrefix+"%06d", newIdx)) - level.Info(util.Logger).Log("msg", "attempting checkpoint for", "dir", cpdir) - cpdirtmp := cpdir + ".tmp" + // Checkpoint is named after the last WAL segment present so that when replaying the WAL + // we can start from that particular WAL segment. + checkpointDir := filepath.Join(w.wal.Dir(), fmt.Sprintf(checkpointPrefix+"%06d", lastSegment)) + level.Info(util.Logger).Log("msg", "attempting checkpoint for", "dir", checkpointDir) + checkpointDirTemp := checkpointDir + ".tmp" - if err := os.MkdirAll(cpdirtmp, 0777); err != nil { + if err := os.MkdirAll(checkpointDirTemp, 0777); err != nil { return errors.Wrap(err, "create checkpoint dir") } - cp, err := wal.New(nil, nil, cpdirtmp, true) + checkpoint, err := wal.New(nil, nil, checkpointDirTemp, true) if err != nil { return errors.Wrap(err, "open checkpoint") } defer func() { - cp.Close() - os.RemoveAll(cpdirtmp) + checkpoint.Close() + os.RemoveAll(checkpointDirTemp) }() var wireChunkBuf []client.Chunk for userID, state := range w.getUserStates() { for pair := range state.fpToSeries.iter() { state.fpLocker.Lock(pair.fp) - wireChunkBuf, err = w.checkpointSeries(cp, userID, pair.fp, pair.series, wireChunkBuf) + wireChunkBuf, err = w.checkpointSeries(checkpoint, userID, pair.fp, pair.series, wireChunkBuf) state.fpLocker.Unlock(pair.fp) if err != nil { return err @@ -245,21 +248,25 @@ func (w *walWrapper) checkpoint() (err error) { } } - if err := cp.Close(); err != nil { + if err := checkpoint.Close(); err != nil { return errors.Wrap(err, "close checkpoint") } - if err := fileutil.Replace(cpdirtmp, cpdir); err != nil { + if err := fileutil.Replace(checkpointDirTemp, checkpointDir); 
err != nil { return errors.Wrap(err, "rename checkpoint directory") } // The last segment might still have been active during the checkpointing, // hence delete only the segments before that. if err := w.wal.Truncate(lastSegment - 1); err != nil { - return err + // It is fine to have old WAL segments hanging around if deletion failed. + // We can try again next time. + level.Error(util.Logger).Log("msg", "error deleting old WAL segments", "err", err) } if lastCh >= 0 { if err := w.deleteCheckpoints(lastCh); err != nil { + // It is fine to have old checkpoints hanging around if deletion failed. + // We can try again next time. level.Error(util.Logger).Log("msg", "error deleting old checkpoint", "err", err) } } @@ -270,25 +277,35 @@ func (w *walWrapper) checkpoint() (err error) { // lastCheckpoint returns the directory name and index of the most recent checkpoint. // If dir does not contain any checkpoints, -1 is returned as index. func lastCheckpoint(dir string) (string, int, error) { - files, err := ioutil.ReadDir(dir) + dirs, err := ioutil.ReadDir(dir) if err != nil { return "", -1, err } - // Traverse list backwards since there may be multiple checkpoints left. - for i := len(files) - 1; i >= 0; i-- { - fi := files[i] + var ( + maxIdx = -1 + checkpointDir string + ) + // There may be multiple checkpoints left, so select the one with max index. 
+ for i := 0; i < len(dirs); i++ { + di := dirs[i] - if !strings.HasPrefix(fi.Name(), checkpointPrefix) { + if !strings.HasPrefix(di.Name(), checkpointPrefix) { continue } - if !fi.IsDir() { - return "", -1, fmt.Errorf("checkpoint %s is not a directory", fi.Name()) + if !di.IsDir() { + return "", -1, fmt.Errorf("checkpoint %s is not a directory", di.Name()) } - idx, err := strconv.Atoi(fi.Name()[len(checkpointPrefix):]) + idx, err := strconv.Atoi(di.Name()[len(checkpointPrefix):]) if err != nil { continue } - return filepath.Join(dir, fi.Name()), idx, nil + if idx > maxIdx { + checkpointDir = di.Name() + maxIdx = idx + } + } + if maxIdx >= 0 { + return filepath.Join(dir, checkpointDir), maxIdx, nil } return "", -1, nil } @@ -388,13 +405,13 @@ func recoverFromWAL(ingester *Ingester) (err error) { return nil } - level.Info(util.Logger).Log("msg", "recovering from segments", "dir", walDir) + level.Info(util.Logger).Log("msg", "recovering from WAL", "dir", walDir, "start_segment", idx) start := time.Now() - if err := processWAL(walDir, userStates, nWorkers, stateCache, seriesCache); err != nil { + if err := processWAL(walDir, idx, userStates, nWorkers, stateCache, seriesCache); err != nil { return err } elapsed := time.Since(start) - level.Info(util.Logger).Log("msg", "recovered from segments", "time", elapsed.String()) + level.Info(util.Logger).Log("msg", "recovered from WAL", "time", elapsed.String()) return nil } @@ -433,7 +450,7 @@ func processCheckpoint(name string, userStates *userStates, nWorkers int, } ) - reader, closer, err := newWalReader(name) + reader, closer, err := newWalReader(name, -1) if err != nil { return err } @@ -531,7 +548,7 @@ type samplesWithUserID struct { } // processWAL processes the records in the WAL concurrently. 
-func processWAL(name string, userStates *userStates, nWorkers int, +func processWAL(name string, startSegment int, userStates *userStates, nWorkers int, stateCache []map[string]*userState, seriesCache []map[string]map[uint64]*memorySeries) error { var ( wg sync.WaitGroup @@ -556,7 +573,7 @@ func processWAL(name string, userStates *userStates, nWorkers int, }(inputs[i], outputs[i], stateCache[i], seriesCache[i]) } - reader, closer, err := newWalReader(name) + reader, closer, err := newWalReader(name, startSegment) if err != nil { return err } @@ -717,10 +734,63 @@ func processWALSamples(userStates *userStates, stateCache map[string]*userState, } } -func newWalReader(name string) (*wal.Reader, io.Closer, error) { - segmentReader, err := wal.NewSegmentsReader(name) - if err != nil { - return nil, nil, err +// If startSegment is <0, it means all the segments. +func newWalReader(name string, startSegment int) (*wal.Reader, io.Closer, error) { + var ( + segmentReader io.ReadCloser + err error + ) + if startSegment < 0 { + segmentReader, err = wal.NewSegmentsReader(name) + if err != nil { + return nil, nil, err + } + } else { + first, last, err := SegmentRange(name) + if err != nil { + return nil, nil, err + } + if startSegment > last { + return nil, nil, errors.New("start segment is beyond the last WAL segment") + } + if first > startSegment { + startSegment = first + } + segmentReader, err = wal.NewSegmentsRangeReader(wal.SegmentRange{ + Dir: name, + First: startSegment, + Last: -1, // Till the end. + }) + if err != nil { + return nil, nil, err + } } return wal.NewReader(segmentReader), segmentReader, nil } + +// SegmentRange returns the first and last segment index of the WAL in the dir. +// If https://github.com/prometheus/prometheus/pull/6477 is merged, get rid of this +// method and use from Prometheus directly. 
+func SegmentRange(dir string) (int, int, error) { + files, err := fileutil.ReadDir(dir) + if err != nil { + return 0, 0, err + } + first, last := math.MaxInt32, math.MinInt32 + for _, fn := range files { + k, err := strconv.Atoi(fn) + if err != nil { + continue + } + if k < first { + first = k + } + if k > last { + last = k + } + } + if first == math.MaxInt32 || last == math.MinInt32 { + return -1, -1, nil + } + return first, last, nil +} From ce6a2446e2be69c26ac4f037e7b27d947232720e Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Thu, 19 Dec 2019 19:31:49 +0530 Subject: [PATCH 49/56] Fix Goutham's comments Signed-off-by: Ganesh Vernekar --- pkg/ingester/label_pairs.go | 22 -------- pkg/ingester/series.go | 6 -- pkg/ingester/user_state.go | 2 +- pkg/ingester/wal.go | 29 ++++++---- pkg/ingester/wal.pb.go | 108 ++++++++++++++---------------------- pkg/ingester/wal.proto | 4 +- 6 files changed, 63 insertions(+), 108 deletions(-) diff --git a/pkg/ingester/label_pairs.go b/pkg/ingester/label_pairs.go index 43304cd6e34..896cbf0053e 100644 --- a/pkg/ingester/label_pairs.go +++ b/pkg/ingester/label_pairs.go @@ -88,25 +88,3 @@ func (a labelPairs) equal(b labels.Labels) bool { } return true } - -func newLabelPairs(metric labels.Labels) []client.LabelPair { - lp := make([]client.LabelPair, 0, len(metric)) - for _, m := range metric { - lp = append(lp, client.LabelPair{ - Name: []byte(m.Name), - Value: []byte(m.Value), - }) - } - return lp -} - -func newLabelPairsFromLabelAdapters(metric []client.LabelAdapter) []client.LabelPair { - lp := make([]client.LabelPair, 0, len(metric)) - for _, m := range metric { - lp = append(lp, client.LabelPair{ - Name: []byte(m.Name), - Value: []byte(m.Value), - }) - } - return lp -} diff --git a/pkg/ingester/series.go b/pkg/ingester/series.go index 78d37400a6f..5b7b0ffc539 100644 --- a/pkg/ingester/series.go +++ b/pkg/ingester/series.go @@ -10,7 +10,6 @@ import ( "github.com/prometheus/prometheus/pkg/value" 
"github.com/cortexproject/cortex/pkg/chunk/encoding" - "github.com/cortexproject/cortex/pkg/ingester/client" "github.com/cortexproject/cortex/pkg/prom1/storage/metric" ) @@ -51,11 +50,6 @@ func newMemorySeries(m labels.Labels) *memorySeries { } } -// helper to extract the not-necessarily-sorted type used elsewhere, without casting everywhere. -func (s *memorySeries) labels() []client.LabelPair { - return newLabelPairs(s.metric) -} - // add adds a sample pair to the series, possibly creating a new chunk. // The caller must have locked the fingerprint of the series. func (s *memorySeries) add(v model.SamplePair) error { diff --git a/pkg/ingester/user_state.go b/pkg/ingester/user_state.go index 2052c7f57af..5030437cb31 100644 --- a/pkg/ingester/user_state.go +++ b/pkg/ingester/user_state.go @@ -231,7 +231,7 @@ func (u *userState) createSeriesWithFingerprint(fp model.Fingerprint, metric lab if record != nil { record.Labels = append(record.Labels, Labels{ Fingerprint: uint64(fp), - Labels: newLabelPairsFromLabelAdapters(metric), + Labels: metric, }) } diff --git a/pkg/ingester/wal.go b/pkg/ingester/wal.go index 29b584e7df1..d3a52b520a0 100644 --- a/pkg/ingester/wal.go +++ b/pkg/ingester/wal.go @@ -351,7 +351,7 @@ func (w *walWrapper) checkpointSeries(cp *wal.WAL, userID string, fp model.Finge buf, err := proto.Marshal(&Series{ UserId: userID, Fingerprint: uint64(fp), - Labels: series.labels(), + Labels: client.FromLabelsToLabelAdapters(series.metric), Chunks: wireChunks, }) if err != nil { @@ -472,6 +472,11 @@ Loop: if err := proto.Unmarshal(reader.Record(), s); err != nil { return err } + // The yoloString from the unmarshal of LabelAdapter gets corrupted + // when travelling through the channel. Hence making a copy of that. + // This extra alloc during the read path is fine as it's only 1 time + // and saves extra allocs during write path by having LabelAdapter. 
+ s.Labels = copyLabelAdapters(s.Labels) select { case errFromChan = <-errChan: @@ -502,6 +507,17 @@ Loop: return nil } +func copyLabelAdapters(las []client.LabelAdapter) []client.LabelAdapter { + for i := range las { + n, v := make([]byte, len(las[i].Name)), make([]byte, len(las[i].Value)) + copy(n, las[i].Name) + copy(v, las[i].Value) + las[i].Name = string(n) + las[i].Value = string(v) + } + return las +} + func processCheckpointRecord(userStates *userStates, seriesPool *sync.Pool, stateCache map[string]*userState, seriesCache map[string]map[uint64]*memorySeries, seriesChan <-chan *Series, errChan chan error) { var la []client.LabelAdapter @@ -580,7 +596,6 @@ func processWAL(name string, startSegment int, userStates *userStates, nWorkers defer closer.Close() var ( - la []client.LabelAdapter errFromChan error record = &Record{} ) @@ -605,15 +620,7 @@ Loop: if ok { continue } - - la = la[:0] - for _, l := range labels.Labels { - la = append(la, client.LabelAdapter{ - Name: string(l.Name), - Value: string(l.Value), - }) - } - _, err := state.createSeriesWithFingerprint(model.Fingerprint(labels.Fingerprint), la, nil, true) + _, err := state.createSeriesWithFingerprint(model.Fingerprint(labels.Fingerprint), labels.Labels, nil, true) if err != nil { return err } diff --git a/pkg/ingester/wal.pb.go b/pkg/ingester/wal.pb.go index 54ef840ca1d..c3fcc096001 100644 --- a/pkg/ingester/wal.pb.go +++ b/pkg/ingester/wal.pb.go @@ -7,6 +7,7 @@ import ( encoding_binary "encoding/binary" fmt "fmt" client "github.com/cortexproject/cortex/pkg/ingester/client" + github_com_cortexproject_cortex_pkg_ingester_client "github.com/cortexproject/cortex/pkg/ingester/client" _ "github.com/gogo/protobuf/gogoproto" proto "github.com/gogo/protobuf/proto" io "io" @@ -86,8 +87,8 @@ func (m *Record) GetSamples() []Sample { } type Labels struct { - Fingerprint uint64 `protobuf:"varint,1,opt,name=fingerprint,proto3" json:"fingerprint,omitempty"` - Labels []client.LabelPair 
`protobuf:"bytes,2,rep,name=labels,proto3" json:"labels"` + Fingerprint uint64 `protobuf:"varint,1,opt,name=fingerprint,proto3" json:"fingerprint,omitempty"` + Labels []github_com_cortexproject_cortex_pkg_ingester_client.LabelAdapter `protobuf:"bytes,2,rep,name=labels,proto3,customtype=github.com/cortexproject/cortex/pkg/ingester/client.LabelAdapter" json:"labels"` } func (m *Labels) Reset() { *m = Labels{} } @@ -129,13 +130,6 @@ func (m *Labels) GetFingerprint() uint64 { return 0 } -func (m *Labels) GetLabels() []client.LabelPair { - if m != nil { - return m.Labels - } - return nil -} - type Sample struct { Fingerprint uint64 `protobuf:"varint,1,opt,name=fingerprint,proto3" json:"fingerprint,omitempty"` Timestamp uint64 `protobuf:"varint,2,opt,name=timestamp,proto3" json:"timestamp,omitempty"` @@ -196,10 +190,10 @@ func (m *Sample) GetValue() float64 { } type Series struct { - UserId string `protobuf:"bytes,1,opt,name=user_id,json=userId,proto3" json:"user_id,omitempty"` - Fingerprint uint64 `protobuf:"varint,2,opt,name=fingerprint,proto3" json:"fingerprint,omitempty"` - Labels []client.LabelPair `protobuf:"bytes,3,rep,name=labels,proto3" json:"labels"` - Chunks []client.Chunk `protobuf:"bytes,4,rep,name=chunks,proto3" json:"chunks"` + UserId string `protobuf:"bytes,1,opt,name=user_id,json=userId,proto3" json:"user_id,omitempty"` + Fingerprint uint64 `protobuf:"varint,2,opt,name=fingerprint,proto3" json:"fingerprint,omitempty"` + Labels []github_com_cortexproject_cortex_pkg_ingester_client.LabelAdapter `protobuf:"bytes,3,rep,name=labels,proto3,customtype=github.com/cortexproject/cortex/pkg/ingester/client.LabelAdapter" json:"labels"` + Chunks []client.Chunk `protobuf:"bytes,4,rep,name=chunks,proto3" json:"chunks"` } func (m *Series) Reset() { *m = Series{} } @@ -248,13 +242,6 @@ func (m *Series) GetFingerprint() uint64 { return 0 } -func (m *Series) GetLabels() []client.LabelPair { - if m != nil { - return m.Labels - } - return nil -} - func (m *Series) 
GetChunks() []client.Chunk { if m != nil { return m.Chunks @@ -272,32 +259,33 @@ func init() { func init() { proto.RegisterFile("wal.proto", fileDescriptor_ae6364fc8077884f) } var fileDescriptor_ae6364fc8077884f = []byte{ - // 395 bytes of a gzipped FileDescriptorProto - 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x8c, 0x52, 0x41, 0x6b, 0xdb, 0x30, - 0x14, 0xb6, 0xe2, 0x4c, 0x59, 0x14, 0x06, 0x9b, 0x18, 0xcc, 0x84, 0xa1, 0x19, 0x9f, 0x02, 0x63, - 0xf6, 0xd8, 0xee, 0x63, 0x64, 0xa7, 0xc1, 0x0e, 0xc3, 0xb9, 0x6d, 0xb0, 0x61, 0x3b, 0x8a, 0xa3, - 0xc6, 0xb6, 0x8c, 0x24, 0xb7, 0x3d, 0x16, 0xfa, 0x07, 0xfa, 0x17, 0x7a, 0xeb, 0x4f, 0xc9, 0x31, - 0xc7, 0x9c, 0x4a, 0xe3, 0x5c, 0x7a, 0xcc, 0x4f, 0x28, 0x91, 0xed, 0x36, 0x0d, 0xb4, 0xe4, 0xa6, - 0xef, 0x7b, 0xdf, 0xf7, 0xbd, 0xf7, 0xc4, 0x43, 0xdd, 0x93, 0x20, 0x71, 0x73, 0xc1, 0x15, 0xc7, - 0x2f, 0x59, 0x16, 0x53, 0xa9, 0xa8, 0xe8, 0x7f, 0x8a, 0x99, 0x9a, 0x16, 0xa1, 0x1b, 0xf1, 0xd4, - 0x8b, 0x79, 0xcc, 0x3d, 0x2d, 0x08, 0x8b, 0x89, 0x46, 0x1a, 0xe8, 0x57, 0x65, 0xec, 0x7f, 0xdf, - 0x91, 0x47, 0x5c, 0x28, 0x7a, 0x9a, 0x0b, 0x7e, 0x44, 0x23, 0x55, 0x23, 0x2f, 0x9f, 0xc5, 0x5e, - 0x13, 0xee, 0x45, 0x09, 0xa3, 0x59, 0x53, 0xaa, 0x12, 0x9c, 0x73, 0x80, 0xa0, 0x4f, 0x23, 0x2e, - 0xc6, 0xf8, 0x1d, 0xea, 0x14, 0x92, 0x8a, 0xff, 0x6c, 0x6c, 0x01, 0x1b, 0x0c, 0xba, 0x3e, 0xdc, - 0xc2, 0x9f, 0x63, 0xec, 0x22, 0x98, 0x04, 0x21, 0x4d, 0xa4, 0xd5, 0xb2, 0xcd, 0x41, 0xef, 0xcb, - 0x6b, 0xb7, 0x89, 0x74, 0x7f, 0x69, 0x7e, 0xd8, 0x9e, 0x5f, 0x7f, 0x30, 0xfc, 0x5a, 0x85, 0x3f, - 0xa3, 0x8e, 0x0c, 0xd2, 0x3c, 0xa1, 0xd2, 0x32, 0xf7, 0x0d, 0x23, 0x5d, 0xa8, 0x0d, 0x8d, 0xcc, - 0xf9, 0x8b, 0x60, 0x95, 0x84, 0x6d, 0xd4, 0x9b, 0x6c, 0xc5, 0x22, 0x17, 0x2c, 0x53, 0x7a, 0x90, - 0xb6, 0xbf, 0x4b, 0x61, 0x6f, 0x6f, 0x9a, 0x37, 0x6e, 0xbd, 0x90, 0x4e, 0xf8, 0x1d, 0x30, 0xf1, - 0x78, 0x1c, 0xe7, 0x1f, 0x82, 0x55, 0xd7, 0x03, 0xc2, 0xdf, 0xa3, 0xae, 0x62, 0x29, 0x95, 0x2a, - 0x48, 0x73, 0xab, 0xa5, 0xeb, 0x0f, 0x04, 0x7e, 0x8b, 0x5e, 
0x1c, 0x07, 0x49, 0x41, 0x2d, 0xd3, - 0x06, 0x03, 0xe0, 0x57, 0xc0, 0xb9, 0x04, 0x08, 0x8e, 0xa8, 0x60, 0x54, 0x3e, 0xfd, 0x85, 0x7b, - 0x9d, 0x5b, 0xcf, 0xad, 0x65, 0x1e, 0xb4, 0x16, 0xfe, 0x88, 0x60, 0x34, 0x2d, 0xb2, 0x99, 0xb4, - 0xda, 0xda, 0xf0, 0xaa, 0x31, 0xfc, 0xd8, 0xb2, 0x8d, 0xb8, 0x92, 0x0c, 0xbf, 0x2d, 0x56, 0xc4, - 0x58, 0xae, 0x88, 0xb1, 0x59, 0x11, 0x70, 0x56, 0x12, 0x70, 0x55, 0x12, 0x30, 0x2f, 0x09, 0x58, - 0x94, 0x04, 0xdc, 0x94, 0x04, 0xdc, 0x96, 0xc4, 0xd8, 0x94, 0x04, 0x5c, 0xac, 0x89, 0xb1, 0x58, - 0x13, 0x63, 0xb9, 0x26, 0xc6, 0x9f, 0xfb, 0xbb, 0x0c, 0xa1, 0xbe, 0x96, 0xaf, 0x77, 0x01, 0x00, - 0x00, 0xff, 0xff, 0x63, 0xe6, 0x8a, 0xe9, 0xb5, 0x02, 0x00, 0x00, + // 415 bytes of a gzipped FileDescriptorProto + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xbc, 0x53, 0xcd, 0xca, 0xd3, 0x40, + 0x14, 0xcd, 0x34, 0x75, 0x6a, 0xa7, 0x08, 0x3a, 0x08, 0x86, 0x22, 0xd3, 0x90, 0x55, 0x41, 0x4c, + 0x44, 0xf7, 0xa2, 0x75, 0xa3, 0xe0, 0x42, 0xd2, 0x9d, 0x0b, 0x25, 0x3f, 0xd3, 0x74, 0x6c, 0x92, + 0x09, 0x33, 0x13, 0x75, 0x29, 0xf8, 0x02, 0xbe, 0x81, 0x5b, 0x1f, 0xa5, 0xcb, 0x2e, 0x8b, 0x8b, + 0x62, 0x53, 0x04, 0x97, 0x7d, 0x04, 0xc9, 0x24, 0xd1, 0x52, 0x50, 0x3e, 0xbe, 0xc5, 0xb7, 0xcb, + 0x39, 0xf7, 0xdc, 0x73, 0xcf, 0xdc, 0xcc, 0xa0, 0xe1, 0x87, 0x20, 0x75, 0x0b, 0xc1, 0x15, 0xc7, + 0xd7, 0x59, 0x9e, 0x50, 0xa9, 0xa8, 0x18, 0xdf, 0x4f, 0x98, 0x5a, 0x96, 0xa1, 0x1b, 0xf1, 0xcc, + 0x4b, 0x78, 0xc2, 0x3d, 0x2d, 0x08, 0xcb, 0x85, 0x46, 0x1a, 0xe8, 0xaf, 0xa6, 0x71, 0xfc, 0xe4, + 0x44, 0x1e, 0x71, 0xa1, 0xe8, 0xc7, 0x42, 0xf0, 0x77, 0x34, 0x52, 0x2d, 0xf2, 0x8a, 0x55, 0xe2, + 0x75, 0xe6, 0x5e, 0x94, 0x32, 0x9a, 0x77, 0xa5, 0xc6, 0xc1, 0xf9, 0x0c, 0x10, 0xf4, 0x69, 0xc4, + 0x45, 0x8c, 0xef, 0xa0, 0x41, 0x29, 0xa9, 0x78, 0xcb, 0x62, 0x0b, 0xd8, 0x60, 0x3a, 0xf4, 0x61, + 0x0d, 0x5f, 0xc4, 0xd8, 0x45, 0x30, 0x0d, 0x42, 0x9a, 0x4a, 0xab, 0x67, 0x9b, 0xd3, 0xd1, 0xc3, + 0x9b, 0x6e, 0x67, 0xe9, 0xbe, 0xd4, 0xfc, 0xac, 0xbf, 0xde, 0x4d, 0x0c, 0xbf, 
0x55, 0xe1, 0x07, + 0x68, 0x20, 0x83, 0xac, 0x48, 0xa9, 0xb4, 0xcc, 0xf3, 0x86, 0xb9, 0x2e, 0xb4, 0x0d, 0x9d, 0xcc, + 0xf9, 0x0a, 0x10, 0x6c, 0xac, 0xb0, 0x8d, 0x46, 0x8b, 0x5a, 0x2d, 0x0a, 0xc1, 0x72, 0xa5, 0x93, + 0xf4, 0xfd, 0x53, 0x0a, 0xcb, 0xb3, 0x38, 0xb7, 0xdc, 0xf6, 0x44, 0xda, 0xe1, 0x55, 0xc0, 0xc4, + 0xec, 0x79, 0x6d, 0xff, 0x7d, 0x37, 0xb9, 0xcc, 0x7e, 0x1a, 0x9b, 0xa7, 0x71, 0x50, 0x28, 0x2a, + 0xba, 0x33, 0x39, 0x6f, 0x10, 0x6c, 0xa2, 0x5f, 0x20, 0xe0, 0x5d, 0x34, 0x54, 0x2c, 0xa3, 0x52, + 0x05, 0x59, 0x61, 0xf5, 0x74, 0xfd, 0x2f, 0x81, 0x6f, 0xa3, 0x6b, 0xef, 0x83, 0xb4, 0xa4, 0x96, + 0x69, 0x83, 0x29, 0xf0, 0x1b, 0xe0, 0xfc, 0x04, 0x08, 0xce, 0xa9, 0x60, 0x54, 0xfe, 0xfb, 0x3f, + 0x9c, 0x4d, 0xee, 0xfd, 0x6f, 0x35, 0xe6, 0x95, 0xad, 0x06, 0xdf, 0x43, 0x30, 0x5a, 0x96, 0xf9, + 0x4a, 0x5a, 0x7d, 0x3d, 0xf4, 0x46, 0x37, 0xf4, 0x59, 0xcd, 0x76, 0x77, 0xa3, 0x91, 0xcc, 0x1e, + 0x6f, 0xf6, 0xc4, 0xd8, 0xee, 0x89, 0x71, 0xdc, 0x13, 0xf0, 0xa9, 0x22, 0xe0, 0x5b, 0x45, 0xc0, + 0xba, 0x22, 0x60, 0x53, 0x11, 0xf0, 0xa3, 0x22, 0xe0, 0x57, 0x45, 0x8c, 0x63, 0x45, 0xc0, 0x97, + 0x03, 0x31, 0x36, 0x07, 0x62, 0x6c, 0x0f, 0xc4, 0x78, 0xfd, 0xe7, 0x81, 0x84, 0x50, 0x5f, 0xdb, + 0x47, 0xbf, 0x03, 0x00, 0x00, 0xff, 0xff, 0x83, 0xd9, 0xb8, 0x9a, 0x3e, 0x03, 0x00, 0x00, } func (this *Record) Equal(that interface{}) bool { @@ -366,7 +354,7 @@ func (this *Labels) Equal(that interface{}) bool { return false } for i := range this.Labels { - if !this.Labels[i].Equal(&that1.Labels[i]) { + if !this.Labels[i].Equal(that1.Labels[i]) { return false } } @@ -431,7 +419,7 @@ func (this *Series) Equal(that interface{}) bool { return false } for i := range this.Labels { - if !this.Labels[i].Equal(&that1.Labels[i]) { + if !this.Labels[i].Equal(that1.Labels[i]) { return false } } @@ -476,13 +464,7 @@ func (this *Labels) GoString() string { s := make([]string, 0, 6) s = append(s, "&ingester.Labels{") s = append(s, "Fingerprint: "+fmt.Sprintf("%#v", this.Fingerprint)+",\n") - if 
this.Labels != nil { - vs := make([]*client.LabelPair, len(this.Labels)) - for i := range vs { - vs[i] = &this.Labels[i] - } - s = append(s, "Labels: "+fmt.Sprintf("%#v", vs)+",\n") - } + s = append(s, "Labels: "+fmt.Sprintf("%#v", this.Labels)+",\n") s = append(s, "}") return strings.Join(s, "") } @@ -506,13 +488,7 @@ func (this *Series) GoString() string { s = append(s, "&ingester.Series{") s = append(s, "UserId: "+fmt.Sprintf("%#v", this.UserId)+",\n") s = append(s, "Fingerprint: "+fmt.Sprintf("%#v", this.Fingerprint)+",\n") - if this.Labels != nil { - vs := make([]*client.LabelPair, len(this.Labels)) - for i := range vs { - vs[i] = &this.Labels[i] - } - s = append(s, "Labels: "+fmt.Sprintf("%#v", vs)+",\n") - } + s = append(s, "Labels: "+fmt.Sprintf("%#v", this.Labels)+",\n") if this.Chunks != nil { vs := make([]*client.Chunk, len(this.Chunks)) for i := range vs { @@ -830,7 +806,7 @@ func (this *Labels) String() string { } s := strings.Join([]string{`&Labels{`, `Fingerprint:` + fmt.Sprintf("%v", this.Fingerprint) + `,`, - `Labels:` + strings.Replace(strings.Replace(fmt.Sprintf("%v", this.Labels), "LabelPair", "client.LabelPair", 1), `&`, ``, 1) + `,`, + `Labels:` + fmt.Sprintf("%v", this.Labels) + `,`, `}`, }, "") return s @@ -854,7 +830,7 @@ func (this *Series) String() string { s := strings.Join([]string{`&Series{`, `UserId:` + fmt.Sprintf("%v", this.UserId) + `,`, `Fingerprint:` + fmt.Sprintf("%v", this.Fingerprint) + `,`, - `Labels:` + strings.Replace(strings.Replace(fmt.Sprintf("%v", this.Labels), "LabelPair", "client.LabelPair", 1), `&`, ``, 1) + `,`, + `Labels:` + fmt.Sprintf("%v", this.Labels) + `,`, `Chunks:` + strings.Replace(strings.Replace(fmt.Sprintf("%v", this.Chunks), "Chunk", "client.Chunk", 1), `&`, ``, 1) + `,`, `}`, }, "") @@ -1098,7 +1074,7 @@ func (m *Labels) Unmarshal(dAtA []byte) error { if postIndex > l { return io.ErrUnexpectedEOF } - m.Labels = append(m.Labels, client.LabelPair{}) + m.Labels = append(m.Labels, 
github_com_cortexproject_cortex_pkg_ingester_client.LabelAdapter{}) if err := m.Labels[len(m.Labels)-1].Unmarshal(dAtA[iNdEx:postIndex]); err != nil { return err } @@ -1338,7 +1314,7 @@ func (m *Series) Unmarshal(dAtA []byte) error { if postIndex > l { return io.ErrUnexpectedEOF } - m.Labels = append(m.Labels, client.LabelPair{}) + m.Labels = append(m.Labels, github_com_cortexproject_cortex_pkg_ingester_client.LabelAdapter{}) if err := m.Labels[len(m.Labels)-1].Unmarshal(dAtA[iNdEx:postIndex]); err != nil { return err } diff --git a/pkg/ingester/wal.proto b/pkg/ingester/wal.proto index 51bb2a30eed..25b5361dde2 100644 --- a/pkg/ingester/wal.proto +++ b/pkg/ingester/wal.proto @@ -15,7 +15,7 @@ message Record { message Labels { uint64 fingerprint = 1; - repeated cortex.LabelPair labels = 2 [(gogoproto.nullable) = false]; + repeated cortex.LabelPair labels = 2 [(gogoproto.nullable) = false, (gogoproto.customtype) = "github.com/cortexproject/cortex/pkg/ingester/client.LabelAdapter"]; } message Sample { @@ -27,6 +27,6 @@ message Sample { message Series { string user_id = 1; uint64 fingerprint = 2; - repeated cortex.LabelPair labels = 3 [(gogoproto.nullable) = false]; + repeated cortex.LabelPair labels = 3 [(gogoproto.nullable) = false, (gogoproto.customtype) = "github.com/cortexproject/cortex/pkg/ingester/client.LabelAdapter"]; repeated cortex.Chunk chunks = 4 [(gogoproto.nullable) = false]; } From 3fc8e31ad95a8ba13e5805c09de913916c2048cf Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Fri, 20 Dec 2019 14:51:25 +0530 Subject: [PATCH 50/56] Fix possible data corruption, goroutine deadlock and memory leak Signed-off-by: Ganesh Vernekar --- pkg/ingester/ingester.go | 3 +- pkg/ingester/wal.go | 78 ++++++++++++++++++++++++---------------- 2 files changed, 50 insertions(+), 31 deletions(-) diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index ad80857fad2..160d857df61 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -334,6 +334,7 @@ 
func (i *Ingester) StopIncomingRequests() { // Push implements client.IngesterServer func (i *Ingester) Push(ctx old_ctx.Context, req *client.WriteRequest) (*client.WriteResponse, error) { + if i.cfg.TSDBEnabled { return i.v2Push(ctx, req) } @@ -374,7 +375,7 @@ func (i *Ingester) Push(ctx old_ctx.Context, req *client.WriteRequest) (*client. return nil, err } } - client.ReuseSlice(req.Timeseries) + defer client.ReuseSlice(req.Timeseries) if lastPartialErr != nil { return &client.WriteResponse{}, lastPartialErr.WrappedError() diff --git a/pkg/ingester/wal.go b/pkg/ingester/wal.go index d3a52b520a0..4c0e89a9783 100644 --- a/pkg/ingester/wal.go +++ b/pkg/ingester/wal.go @@ -39,8 +39,8 @@ type WALConfig struct { // RegisterFlags adds the flags required to config this to the given FlagSet func (cfg *WALConfig) RegisterFlags(f *flag.FlagSet) { - f.BoolVar(&cfg.walEnabled, "ingester.wal-enable", false, "Enable the WAL.") - f.BoolVar(&cfg.checkpointEnabled, "ingester.checkpoint-enable", false, "Enable checkpointing.") + f.BoolVar(&cfg.walEnabled, "ingester.wal-enabled", false, "Enable the WAL.") + f.BoolVar(&cfg.checkpointEnabled, "ingester.checkpoint-enabled", false, "Enable checkpointing.") f.BoolVar(&cfg.recover, "ingester.recover-from-wal", false, "Recover data from existing WAL.") f.StringVar(&cfg.dir, "ingester.wal-dir", "wal", "Directory to store the WAL.") f.DurationVar(&cfg.checkpointDuration, "ingester.checkpoint-duration", 1*time.Hour, "Duration over which to checkpoint.") @@ -437,10 +437,17 @@ func segmentsExist(dir string) (bool, error) { // processCheckpoint loads the chunks of the series present in the last checkpoint. 
func processCheckpoint(name string, userStates *userStates, nWorkers int, stateCache []map[string]*userState, seriesCache []map[string]map[uint64]*memorySeries) error { + + reader, closer, err := newWalReader(name, -1) + if err != nil { + return err + } + defer closer.Close() + var ( inputs = make([]chan *Series, nWorkers) // errChan is to capture the errors from goroutine. - // The channel size is nWorkers to not block any worker if all of them error out. + // The channel size is nWorkers+1 to not block any worker if all of them error out. errChan = make(chan error, nWorkers) wg = sync.WaitGroup{} seriesPool = &sync.Pool{ @@ -450,12 +457,6 @@ func processCheckpoint(name string, userStates *userStates, nWorkers int, } ) - reader, closer, err := newWalReader(name, -1) - if err != nil { - return err - } - defer closer.Close() - wg.Add(nWorkers) for i := 0; i < nWorkers; i++ { inputs[i] = make(chan *Series, 300) @@ -465,12 +466,15 @@ func processCheckpoint(name string, userStates *userStates, nWorkers int, }(inputs[i], stateCache[i], seriesCache[i]) } - var errFromChan error + var capturedErr error Loop: for reader.Next() { s := seriesPool.Get().(*Series) if err := proto.Unmarshal(reader.Record(), s); err != nil { - return err + // We don't return here in order to close/drain all the channels and + // make sure all goroutines exit. + capturedErr = err + break Loop } // The yoloString from the unmarshal of LabelAdapter gets corrupted // when travelling through the channel. Hence making a copy of that. @@ -479,7 +483,7 @@ Loop: s.Labels = copyLabelAdapters(s.Labels) select { - case errFromChan = <-errChan: + case capturedErr = <-errChan: // Exit early on an error. // Only acts upon the first error received. break Loop @@ -488,17 +492,24 @@ Loop: inputs[mod] <- s } } + for i := 0; i < nWorkers; i++ { close(inputs[i]) } wg.Wait() + // If any worker errored out, some input channels might not be empty. + // Hence drain them. 
+ for i := 0; i < nWorkers; i++ { + for range inputs[i] { + } + } - if errFromChan != nil { - return errFromChan + if capturedErr != nil { + return capturedErr } select { - case errFromChan = <-errChan: - return errFromChan + case capturedErr = <-errChan: + return capturedErr default: if err := reader.Err(); err != nil { return err @@ -566,6 +577,13 @@ type samplesWithUserID struct { // processWAL processes the records in the WAL concurrently. func processWAL(name string, startSegment int, userStates *userStates, nWorkers int, stateCache []map[string]*userState, seriesCache []map[string]map[uint64]*memorySeries) error { + + reader, closer, err := newWalReader(name, startSegment) + if err != nil { + return err + } + defer closer.Close() + var ( wg sync.WaitGroup inputs = make([]chan *samplesWithUserID, nWorkers) @@ -589,27 +607,24 @@ func processWAL(name string, startSegment int, userStates *userStates, nWorkers }(inputs[i], outputs[i], stateCache[i], seriesCache[i]) } - reader, closer, err := newWalReader(name, startSegment) - if err != nil { - return err - } - defer closer.Close() - var ( - errFromChan error + capturedErr error record = &Record{} ) Loop: for reader.Next() { select { - case errFromChan = <-errChan: + case capturedErr = <-errChan: // Exit early on an error. // Only acts upon the first error received. break Loop default: } if err := proto.Unmarshal(reader.Record(), record); err != nil { - return err + // We don't return here in order to close/drain all the channels and + // make sure all goroutines exit. + capturedErr = err + break Loop } if len(record.Labels) > 0 { @@ -622,7 +637,10 @@ Loop: } _, err := state.createSeriesWithFingerprint(model.Fingerprint(labels.Fingerprint), labels.Labels, nil, true) if err != nil { - return err + // We don't return here in order to close/drain all the channels and + // make sure all goroutines exit. 
+ capturedErr = err + break Loop } } } @@ -680,12 +698,12 @@ Loop: } } - if errFromChan != nil { - return errFromChan + if capturedErr != nil { + return capturedErr } select { - case errFromChan = <-errChan: - return errFromChan + case capturedErr = <-errChan: + return capturedErr default: if err := reader.Err(); err != nil { return err From e618463c77f55337bd78c70ccf9c9423c0ef8d08 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Fri, 3 Jan 2020 11:41:41 +0530 Subject: [PATCH 51/56] Fix review comments Signed-off-by: Ganesh Vernekar --- pkg/ingester/ingester.go | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index 160d857df61..a87df7dd78f 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -48,6 +48,7 @@ type ingesterMetrics struct { queriedSamples prometheus.Histogram queriedSeries prometheus.Histogram queriedChunks prometheus.Histogram + walReplayDuration prometheus.Summary } func newIngesterMetrics(r prometheus.Registerer) *ingesterMetrics { @@ -86,6 +87,11 @@ func newIngesterMetrics(r prometheus.Registerer) *ingesterMetrics { // A small number of chunks per series - 10*(8^(7-1)) = 2.6m. Buckets: prometheus.ExponentialBuckets(10, 8, 7), }), + walReplayDuration: prometheus.NewSummary(prometheus.SummaryOpts{ + Name: "cortex_ingester_wal_replay_duration_seconds", + Help: "Time taken to replay the checkpoint and the WAL.", + Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001}, + }), } if r != nil { @@ -245,6 +251,7 @@ func New(cfg Config, clientConfig client.Config, limits *validation.Overrides, c } elapsed := time.Since(start) level.Info(util.Logger).Log("msg", "recovery from WAL completed", "time", elapsed.String()) + i.metrics.walReplayDuration.Observe(elapsed.Seconds()) } // If the WAL recover happened, then the userStates would already be set. 
@@ -334,7 +341,6 @@ func (i *Ingester) StopIncomingRequests() { // Push implements client.IngesterServer func (i *Ingester) Push(ctx old_ctx.Context, req *client.WriteRequest) (*client.WriteResponse, error) { - if i.cfg.TSDBEnabled { return i.v2Push(ctx, req) } From 037d4b37ebba0455e28c2896765d0c2105cda568 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Wed, 8 Jan 2020 19:35:17 +0530 Subject: [PATCH 52/56] memoryChunks counter fix, metics updated, small cleanup Signed-off-by: Ganesh Vernekar --- pkg/ingester/ingester.go | 13 +++++++------ pkg/ingester/series.go | 2 -- pkg/ingester/wal.go | 41 ++++++++++++++-------------------------- pkg/ring/lifecycler.go | 9 +++++---- 4 files changed, 26 insertions(+), 39 deletions(-) diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index a87df7dd78f..c823e201deb 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -48,7 +48,7 @@ type ingesterMetrics struct { queriedSamples prometheus.Histogram queriedSeries prometheus.Histogram queriedChunks prometheus.Histogram - walReplayDuration prometheus.Summary + walReplayDuration prometheus.Gauge } func newIngesterMetrics(r prometheus.Registerer) *ingesterMetrics { @@ -87,10 +87,9 @@ func newIngesterMetrics(r prometheus.Registerer) *ingesterMetrics { // A small number of chunks per series - 10*(8^(7-1)) = 2.6m. 
Buckets: prometheus.ExponentialBuckets(10, 8, 7), }), - walReplayDuration: prometheus.NewSummary(prometheus.SummaryOpts{ - Name: "cortex_ingester_wal_replay_duration_seconds", - Help: "Time taken to replay the checkpoint and the WAL.", - Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001}, + walReplayDuration: prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "cortex_ingester_wal_replay_duration_seconds", + Help: "Time taken to replay the checkpoint and the WAL.", }), } @@ -103,6 +102,7 @@ func newIngesterMetrics(r prometheus.Registerer) *ingesterMetrics { m.queriedSamples, m.queriedSeries, m.queriedChunks, + m.walReplayDuration, ) } @@ -247,11 +247,12 @@ func New(cfg Config, clientConfig client.Config, limits *validation.Overrides, c level.Info(util.Logger).Log("msg", "recovering from WAL") start := time.Now() if err := recoverFromWAL(i); err != nil { + level.Error(util.Logger).Log("msg", "failed to recover from WAL", "time", time.Since(start).String()) return nil, err } elapsed := time.Since(start) level.Info(util.Logger).Log("msg", "recovery from WAL completed", "time", elapsed.String()) - i.metrics.walReplayDuration.Observe(elapsed.Seconds()) + i.metrics.walReplayDuration.Set(elapsed.Seconds()) } // If the WAL recover happened, then the userStates would already be set. 
diff --git a/pkg/ingester/series.go b/pkg/ingester/series.go index 5b7b0ffc539..7ce8f42f8bc 100644 --- a/pkg/ingester/series.go +++ b/pkg/ingester/series.go @@ -204,8 +204,6 @@ func (s *memorySeries) setChunks(descs []*desc) error { if len(descs) > 0 { s.lastTime = descs[len(descs)-1].LastTime } - memoryChunks.Add(float64(len(descs))) - return nil } diff --git a/pkg/ingester/wal.go b/pkg/ingester/wal.go index 4c0e89a9783..1a75a5f3120 100644 --- a/pkg/ingester/wal.go +++ b/pkg/ingester/wal.go @@ -72,11 +72,10 @@ type walWrapper struct { checkpointDeleteTotal prometheus.Counter checkpointCreationFail prometheus.Counter checkpointCreationTotal prometheus.Counter + checkpointDuration prometheus.Summary } -// newWAL creates a WAL object. -// * If the WAL is disabled, then the returned WAL is a no-op WAL. -// * If WAL recovery is enabled, then the userStates is always set for ingester. +// newWAL creates a WAL object. If the WAL is disabled, then the returned WAL is a no-op WAL. func newWAL(cfg WALConfig, userStatesFunc func() map[string]*userState) (WAL, error) { if !cfg.walEnabled { return &noopWAL{}, nil @@ -114,12 +113,18 @@ func newWAL(cfg WALConfig, userStatesFunc func() map[string]*userState) (WAL, er Name: "ingester_checkpoint_creations_total", Help: "Total number of checkpoint creations attempted.", }) + w.checkpointDuration = prometheus.NewSummary(prometheus.SummaryOpts{ + Name: "ingester_checkpoint_duration_seconds", + Help: "Time taken to create a checkpoint.", + Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001}, + }) if cfg.metricsRegisterer != nil { cfg.metricsRegisterer.MustRegister( w.checkpointDeleteFail, w.checkpointDeleteTotal, w.checkpointCreationFail, w.checkpointCreationTotal, + w.checkpointDuration, ) } @@ -131,7 +136,6 @@ func newWAL(cfg WALConfig, userStatesFunc func() map[string]*userState) (WAL, er func (w *walWrapper) Stop() { close(w.quit) w.wait.Wait() - w.wal.Close() } @@ -161,7 +165,7 @@ func (w *walWrapper) run() { ticker 
:= time.NewTicker(w.cfg.checkpointDuration) defer ticker.Stop() - for !w.isStopped() { + for { select { case <-ticker.C: start := time.Now() @@ -172,7 +176,9 @@ func (w *walWrapper) run() { } elapsed := time.Since(start) level.Info(util.Logger).Log("msg", "checkpoint done", "time", elapsed.String()) + w.checkpointDuration.Observe(elapsed.Seconds()) case <-w.quit: + level.Info(util.Logger).Log("msg", "creating checkpoint before shutdown") if err := w.performCheckpoint(); err != nil { level.Error(util.Logger).Log("msg", "error checkpointing series during shutdown", "err", err) } @@ -181,15 +187,6 @@ func (w *walWrapper) run() { } } -func (w *walWrapper) isStopped() bool { - select { - case <-w.quit: - return true - default: - return false - } -} - const checkpointPrefix = "checkpoint." func (w *walWrapper) performCheckpoint() (err error) { @@ -511,11 +508,8 @@ Loop: case capturedErr = <-errChan: return capturedErr default: - if err := reader.Err(); err != nil { - return err - } + return reader.Err() } - return nil } func copyLabelAdapters(las []client.LabelAdapter) []client.LabelAdapter { @@ -563,6 +557,7 @@ func processCheckpointRecord(userStates *userStates, seriesPool *sync.Pool, stat errChan <- err return } + memoryChunks.Add(float64(len(descs))) seriesCache[s.UserId][s.Fingerprint] = series seriesPool.Put(s) @@ -705,16 +700,8 @@ Loop: case capturedErr = <-errChan: return capturedErr default: - if err := reader.Err(); err != nil { - return err - } + return reader.Err() } - - if err != nil { - return err - } - - return nil } func processWALSamples(userStates *userStates, stateCache map[string]*userState, seriesCache map[string]map[uint64]*memorySeries, diff --git a/pkg/ring/lifecycler.go b/pkg/ring/lifecycler.go index 4ab9d00082b..759921eff3e 100644 --- a/pkg/ring/lifecycler.go +++ b/pkg/ring/lifecycler.go @@ -114,11 +114,12 @@ type Lifecycler struct { actorChan chan func() // These values are initialised at startup, and never change + ID string + Addr string + 
RingName string + RingKey string - ID string - Addr string - RingName string - RingKey string + // Whether to flush if transfer fails on shutdown. flushOnShutdown bool // We need to remember the ingester state just in case consul goes away and comes From 3fb38cae2446d221ece10ef0b23009097b52bdc7 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Mon, 20 Jan 2020 16:16:18 +0530 Subject: [PATCH 53/56] Update config file and argument doc Signed-off-by: Ganesh Vernekar --- docs/configuration/arguments.md | 18 ++++++++++++++++++ docs/configuration/config-file-reference.md | 21 +++++++++++++++++++++ pkg/ingester/wal.go | 20 ++++++++++---------- 3 files changed, 49 insertions(+), 10 deletions(-) diff --git a/docs/configuration/arguments.md b/docs/configuration/arguments.md index d129adbe712..c6d236d448e 100644 --- a/docs/configuration/arguments.md +++ b/docs/configuration/arguments.md @@ -305,6 +305,24 @@ It also talks to a KVStore and has it's own copies of the same flags used by the Where you don't want to cache every chunk written by ingesters, but you do want to take advantage of chunk write deduplication, this option will make ingesters write a placeholder to the cache for each chunk. Make sure you configure ingesters with a different cache to queriers, which need the whole value. +#### WAL + +- `--ingester.wal-dir` + Directory where the WAL data should be stores and/or recovered from. + +- `--ingester.wal-enabled` + + Setting this to `true` enables writing to WAL during ingestion. + +- `--ingester.checkpoint-enabled` + Set this to `true` to enable checkpointing of in-memory chunks to disk. This is optional which helps in speeding up the replay process. + +- `--ingester.checkpoint-duration` + This is the interval at which checkpoints should be created. + +- `--ingester.recover-from-wal` + Set this to to `true` to recover data from an existing WAL. The data is recovered even if WAL is disabled and this is set to `true`. The WAL dir needs to be set for this. 
+ ## Runtime Configuration file Cortex has a concept of "runtime config" file, which is simply a file that is reloaded while Cortex is running. It is used by some Cortex components to allow operator to change some aspects of Cortex configuration without restarting it. File is specified by using `-runtime-config.file=` flag and reload period (which defaults to 10 seconds) can be changed by `-runtime-config.reload-period=` flag. Previously this mechanism was only used by limits overrides, and flags were called `-limits.per-user-override-config=` and `-limits.per-user-override-period=10s` respectively. These are still used, if `-runtime-config.file=` is not specified. diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md index 50235707e78..d295e67409f 100644 --- a/docs/configuration/config-file-reference.md +++ b/docs/configuration/config-file-reference.md @@ -323,6 +323,27 @@ ring: The `ingester_config` configures the Cortex ingester. ```yaml +walconfig: + # Enable writing of ingested data into WAL. + # CLI flag: -ingester.wal-enabled + [wal_enabled: | default = false] + + # Enable checkpointing of in-memory chunks. + # CLI flag: -ingester.checkpoint-enabled + [checkpoint_enabled: | default = false] + + # Recover data from existing WAL irrespective of WAL enabled/disabled. + # CLI flag: -ingester.recover-from-wal + [recover_from_wal: | default = false] + + # Directory to store the WAL and/or recover from WAL. + # CLI flag: -ingester.wal-dir + [wal_dir: | default = "wal"] + + # Interval at which checkpoints should be created. + # CLI flag: -ingester.checkpoint-duration + [checkpoint_duration: | default = 30m0s] + lifecycler: ring: kvstore: diff --git a/pkg/ingester/wal.go b/pkg/ingester/wal.go index de831628d70..031c0194ae7 100644 --- a/pkg/ingester/wal.go +++ b/pkg/ingester/wal.go @@ -29,21 +29,21 @@ import ( // WALConfig is config for the Write Ahead Log. 
type WALConfig struct { - walEnabled bool - checkpointEnabled bool - recover bool - dir string - checkpointDuration time.Duration + walEnabled bool `yaml:"wal_enabled,omitempty"` + checkpointEnabled bool `yaml:"checkpoint_enabled,omitempty"` + recover bool `yaml:"recover_from_wal,omitempty"` + dir string `yaml:"wal_dir,omitempty"` + checkpointDuration time.Duration `yaml:"checkpoint_duration,omitempty"` metricsRegisterer prometheus.Registerer } // RegisterFlags adds the flags required to config this to the given FlagSet func (cfg *WALConfig) RegisterFlags(f *flag.FlagSet) { - f.BoolVar(&cfg.walEnabled, "ingester.wal-enabled", false, "Enable the WAL.") - f.BoolVar(&cfg.checkpointEnabled, "ingester.checkpoint-enabled", false, "Enable checkpointing.") - f.BoolVar(&cfg.recover, "ingester.recover-from-wal", false, "Recover data from existing WAL.") - f.StringVar(&cfg.dir, "ingester.wal-dir", "wal", "Directory to store the WAL.") - f.DurationVar(&cfg.checkpointDuration, "ingester.checkpoint-duration", 1*time.Hour, "Duration over which to checkpoint.") + f.StringVar(&cfg.dir, "ingester.wal-dir", "wal", "Directory to store the WAL and/or recover from WAL.") + f.BoolVar(&cfg.recover, "ingester.recover-from-wal", false, "Recover data from existing WAL irrespective of WAL enabled/disabled.") + f.BoolVar(&cfg.walEnabled, "ingester.wal-enabled", false, "Enable writing of ingested data into WAL.") + f.BoolVar(&cfg.checkpointEnabled, "ingester.checkpoint-enabled", false, "Enable checkpointing of in-memory chunks.") + f.DurationVar(&cfg.checkpointDuration, "ingester.checkpoint-duration", 30*time.Minute, "Interval at which checkpoints should be created.") } // WAL interface allows us to have a no-op WAL when the WAL is disabled. 
From 92c114940fff8c371078d409c038694cf9b7b09b Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Mon, 20 Jan 2020 16:16:48 +0530 Subject: [PATCH 54/56] Add guide to run/migrate-to WAL in ingesters Signed-off-by: Ganesh Vernekar --- docs/guides/ingesters-with-wal.md | 75 +++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 docs/guides/ingesters-with-wal.md diff --git a/docs/guides/ingesters-with-wal.md b/docs/guides/ingesters-with-wal.md new file mode 100644 index 00000000000..d402d187eb1 --- /dev/null +++ b/docs/guides/ingesters-with-wal.md @@ -0,0 +1,75 @@ +--- +title: "Ingesters with WAL" +linkTitle: "Ingesters with WAL" +weight: 5 +slug: ingesters-with-wal +--- + +Currently the ingesters without the TSDB stores all the data in the memory and if it happens to crash for any reason, there will be a loss of data. WAL helps fill this gap of reliability. + +To use WAL, there are some changes that needs to be made in the deployment. + +## Things to change + +1. Since ingesters need to have the same persistent volume across restarts/rollout, all the ingesters should be run on [statefulset](https://kubernetes.io/docs/concepts/workloads/controllers/statefulset/) with fixed volumes. + +2. Following flags needs to be set + * `--ingester.wal-dir` to the directory where the WAL data should be stores and/or recovered from. + * `--ingester.wal-enabled` to `true` which enables writing to WAL during ingestion. + * `--ingester.checkpoint-enabled` to `true` to enable checkpointing of in-memory chunks to disk. This is optional which helps in speeding up the replay process. + * `--ingester.checkpoint-duration` to the interval at which checkpoints should be created. + * `--ingester.recover-from-wal` to `true` to recover data from an existing WAL. The data is recovered even if WAL is disabled and this is set to `true`. The WAL dir needs to be set for this. + * If you are going to enable WAL, it is advisable to always set this to `true`. 
+ +## Stuff that is changed automatically when WAL is enabled + +1. Flushing of data to chunk store during rollouts or scale down is disabled. This is because during a rollout of statefulset there is no 1 ingester leaving and joining each at the same time, rather the same ingester is shut down and broght back again with updated config. Hence flushing is skipped and the data is recovered from the WAL. + +2. As there are no transfers between ingesters, the tokens are stored and recovered from disk between rollout/restarts. This is [not a new thing](https://github.com/cortexproject/cortex/pull/1750) but it is effective when using statefulsets. + +## Migrating from stateless deployments + +The ingester _deployment without WAL_ and _statefulset with WAL_ should be scaled down and up respectively in sync without transfer of data between them to ensure that any ingestion after migration is reliable immediately. + +Let's take an example of 4 ingesters. The migration would look something like this: + +1. Bring up a 1 stateful ingester `ingester-0` and wait till it's ready (accepting read and write requests). +2. Scale down old ingester deployment to 3 and wait till the leaving ingester flushes all the data to chunk store. +3. Once that ingester has disappeared from `kc get pods ...`, add another stateful ingester and wait till it's ready. This assures not transfer. Now you have `ingester-0 ingester-1`. +4. Repeat step 2 to reduce remove another ingester from old deployment. +5. Repeat step 3 to add another stateful ingester. Now you have `ingester-0 ingester-1 ingester-2`. +6. Repeat step 4 and 5, and now you will finally have `ingester-0 ingester-1 ingester-2 ingester-3`. + +## How to scale up/down + +### Scale up + +Scaling up is same as what you would do without WAL or statefulsets. Add 1 ingester at a time. + +### Scale down + +Since Kubernetes doesn't differentiate between rollout and scale down when sending a signal, the flushing of chunks is disabled by default. 
Hence the only thing to take care during scale down is flushing of chunks. + +There are 2 ways to do it, with the latter being a fallback option. + +**First option** +Consider you have 4 ingesters `ingester-0 ingester-1 ingester-2 ingester-3` and you want to scale down to 2 ingesters, the ingesters which will be shutdown according to statefulset rules are `ingester-2 ingester-3`. + +Hence before actually scaling down in Kubernetes, port forward those ingesters and hit the [`/shutdown`](https://github.com/cortexproject/cortex/pull/1746) endpoint. This will flush the chunks and shut down the ingesters (while also removing itself from the ring). + +After hitting the endpoint for `ingester-2 ingester-3`, scale down the ingesters to 2. + +PS: Given you have to scale down 1 ingester at a time, you can pipeline the shutdown and scaledown process instead of hitting shutdown endpoint for all to-be-scaled-down ingesters at the same time. + +**Fallback option** + +There is a [flush mode ingester](https://github.com/cortexproject/cortex/pull/1747) in progress, and with recent discussions there will be a separate target called flusher in it's place. + +You can run it as a kubernetes job which will +* Attach to the volume of the scaled down ingester +* Recover from the WAL +* And flush all the chunks. + +This job is to be run for all the ingesters that you missed hitting the shutdown endpoint as a first option. + +More info about the flusher target will be added once it's upstream. 
\ No newline at end of file From 8b55cdbbd38b3ef06d64a600097f9172d4f345b9 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Mon, 20 Jan 2020 20:54:04 +0530 Subject: [PATCH 55/56] Fix review comments Signed-off-by: Ganesh Vernekar --- docs/guides/ingesters-with-wal.md | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/docs/guides/ingesters-with-wal.md b/docs/guides/ingesters-with-wal.md index d402d187eb1..db414a54ec6 100644 --- a/docs/guides/ingesters-with-wal.md +++ b/docs/guides/ingesters-with-wal.md @@ -9,7 +9,7 @@ Currently the ingesters without the TSDB stores all the data in the memory and i To use WAL, there are some changes that needs to be made in the deployment. -## Things to change +## Changes to deployment 1. Since ingesters need to have the same persistent volume across restarts/rollout, all the ingesters should be run on [statefulset](https://kubernetes.io/docs/concepts/workloads/controllers/statefulset/) with fixed volumes. @@ -17,13 +17,14 @@ To use WAL, there are some changes that needs to be made in the deployment. * `--ingester.wal-dir` to the directory where the WAL data should be stores and/or recovered from. * `--ingester.wal-enabled` to `true` which enables writing to WAL during ingestion. * `--ingester.checkpoint-enabled` to `true` to enable checkpointing of in-memory chunks to disk. This is optional which helps in speeding up the replay process. - * `--ingester.checkpoint-duration` to the interval at which checkpoints should be created. + * `--ingester.checkpoint-duration` to the interval at which checkpoints should be created. Default is `30m`, and depending on the number of series, it can be brought down to `15m` if there are less series per ingester (say 1M). * `--ingester.recover-from-wal` to `true` to recover data from an existing WAL. The data is recovered even if WAL is disabled and this is set to `true`. The WAL dir needs to be set for this. 
* If you are going to enable WAL, it is advisable to always set this to `true`. + * `--ingester.tokens-file-path` should be set to the filepath where the tokens should be stored. Why this is required is described below. -## Stuff that is changed automatically when WAL is enabled +## Changes in lifecycle when WAL is enabled -1. Flushing of data to chunk store during rollouts or scale down is disabled. This is because during a rollout of statefulset there is no 1 ingester leaving and joining each at the same time, rather the same ingester is shut down and broght back again with updated config. Hence flushing is skipped and the data is recovered from the WAL. +1. Flushing of data to chunk store during rollouts or scale down is disabled. This is because during a rollout of statefulset there are no ingesters that are simultaneously leaving and joining, rather the same ingester is shut down and broght back again with updated config. Hence flushing is skipped and the data is recovered from the WAL. 2. As there are no transfers between ingesters, the tokens are stored and recovered from disk between rollout/restarts. This is [not a new thing](https://github.com/cortexproject/cortex/pull/1750) but it is effective when using statefulsets. @@ -33,7 +34,7 @@ The ingester _deployment without WAL_ and _statefulset with WAL_ should be scale Let's take an example of 4 ingesters. The migration would look something like this: -1. Bring up a 1 stateful ingester `ingester-0` and wait till it's ready (accepting read and write requests). +1. Bring up one stateful ingester `ingester-0` and wait till it's ready (accepting read and write requests). 2. Scale down old ingester deployment to 3 and wait till the leaving ingester flushes all the data to chunk store. 3. Once that ingester has disappeared from `kc get pods ...`, add another stateful ingester and wait till it's ready. This assures not transfer. Now you have `ingester-0 ingester-1`. 4. 
Repeat step 2 to reduce remove another ingester from old deployment. @@ -44,7 +45,7 @@ Let's take an example of 4 ingesters. The migration would look something like th ### Scale up -Scaling up is same as what you would do without WAL or statefulsets. Add 1 ingester at a time. +Scaling up is same as what you would do without WAL or statefulsets. Nothing to change here. ### Scale down @@ -53,7 +54,7 @@ Since Kubernetes doesn't differentiate between rollout and scale down when sendi There are 2 ways to do it, with the latter being a fallback option. **First option** -Consider you have 4 ingesters `ingester-0 ingester-1 ingester-2 ingester-3` and you want to scale down to 2 ingesters, the ingesters which will be shutdown according to statefulset rules are `ingester-2 ingester-3`. +Consider you have 4 ingesters `ingester-0 ingester-1 ingester-2 ingester-3` and you want to scale down to 2 ingesters, the ingesters which will be shutdown according to statefulset rules are `ingester-3` and then `ingester-2`. Hence before actually scaling down in Kubernetes, port forward those ingesters and hit the [`/shutdown`](https://github.com/cortexproject/cortex/pull/1746) endpoint. This will flush the chunks and shut down the ingesters (while also removing itself from the ring). From fe3cc0824e3108c61302bacea79c0898addd5fe2 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Tue, 21 Jan 2020 18:00:58 +0530 Subject: [PATCH 56/56] Fix review comments Signed-off-by: Ganesh Vernekar --- docs/guides/ingesters-with-wal.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/guides/ingesters-with-wal.md b/docs/guides/ingesters-with-wal.md index db414a54ec6..2c9fe28bffe 100644 --- a/docs/guides/ingesters-with-wal.md +++ b/docs/guides/ingesters-with-wal.md @@ -5,7 +5,7 @@ weight: 5 slug: ingesters-with-wal --- -Currently the ingesters without the TSDB stores all the data in the memory and if it happens to crash for any reason, there will be a loss of data. 
WAL helps fill this gap of reliability. +Currently the ingesters running in the chunks storage mode, store all their data in memory. If there is a crash, there could be loss of data. WAL helps fill this gap in reliability. To use WAL, there are some changes that needs to be made in the deployment. @@ -14,17 +14,17 @@ To use WAL, there are some changes that needs to be made in the deployment. 1. Since ingesters need to have the same persistent volume across restarts/rollout, all the ingesters should be run on [statefulset](https://kubernetes.io/docs/concepts/workloads/controllers/statefulset/) with fixed volumes. 2. Following flags needs to be set - * `--ingester.wal-dir` to the directory where the WAL data should be stores and/or recovered from. + * `--ingester.wal-dir` to the directory where the WAL data should be stores and/or recovered from. Note that this should be on the mounted volume. * `--ingester.wal-enabled` to `true` which enables writing to WAL during ingestion. * `--ingester.checkpoint-enabled` to `true` to enable checkpointing of in-memory chunks to disk. This is optional which helps in speeding up the replay process. * `--ingester.checkpoint-duration` to the interval at which checkpoints should be created. Default is `30m`, and depending on the number of series, it can be brought down to `15m` if there are less series per ingester (say 1M). * `--ingester.recover-from-wal` to `true` to recover data from an existing WAL. The data is recovered even if WAL is disabled and this is set to `true`. The WAL dir needs to be set for this. * If you are going to enable WAL, it is advisable to always set this to `true`. - * `--ingester.tokens-file-path` should be set to the filepath where the tokens should be stored. Why this is required is described below. + * `--ingester.tokens-file-path` should be set to the filepath where the tokens should be stored. Note that this should be on the mounted volume. Why this is required is described below. 
## Changes in lifecycle when WAL is enabled -1. Flushing of data to chunk store during rollouts or scale down is disabled. This is because during a rollout of statefulset there are no ingesters that are simultaneously leaving and joining, rather the same ingester is shut down and broght back again with updated config. Hence flushing is skipped and the data is recovered from the WAL. +1. Flushing of data to chunk store during rollouts or scale down is disabled. This is because during a rollout of statefulset there are no ingesters that are simultaneously leaving and joining, rather the same ingester is shut down and brought back again with updated config. Hence flushing is skipped and the data is recovered from the WAL. 2. As there are no transfers between ingesters, the tokens are stored and recovered from disk between rollout/restarts. This is [not a new thing](https://github.com/cortexproject/cortex/pull/1750) but it is effective when using statefulsets.