diff --git a/.gitignore b/.gitignore
index b5017a5e1678..e158471cc063 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,8 +1,7 @@
-pkg/*
-*.pyc
-datadog-agent
-*.lock
 vendor/
+bin/
+
 .DS_Store
-*.out
-profile.cov
+*.cov
+*.lock
+*.pyc
diff --git a/README.md b/README.md
index 1bebae9f42d3..ccac4419650a 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,13 @@
-Datadog Agent
+# Datadog Agent
+The Datadog Agent faithfully collects events and metrics and brings
+them to [Datadog](https://app.datadoghq.com) on your behalf so that
+you can do something useful with your monitoring and performance data.
 
-The future of dd-agent
+## Getting started
+Binary distributions are not provided yet, to try out the Agent you can build the `master` branch.
+
+### Building
+Just run the `build.sh` script from the repo root
+
+### Testing
+Just run the `runtests.sh` script from the repo root
diff --git a/aggregator/singleton.go b/aggregator/singleton.go
new file mode 100644
index 000000000000..0381b9d0dd21
--- /dev/null
+++ b/aggregator/singleton.go
@@ -0,0 +1,11 @@
+package aggregator
+
+var _aggregator Aggregator
+
+func Get() Aggregator {
+	return _aggregator
+}
+
+func Set(aggregatorInstance Aggregator) {
+	_aggregator = aggregatorInstance
+}
diff --git a/build.sh b/build.sh
new file mode 100755
index 000000000000..3598ad51ba8d
--- /dev/null
+++ b/build.sh
@@ -0,0 +1,10 @@
+#!/bin/sh -e
+
+ORG_PATH="github.com/DataDog"
+REPO_PATH="${ORG_PATH}/datadog-agent"
+BIN_PATH="./bin/agent"
+
+eval $(go env)
+
+go build -o ${BIN_PATH}/agent ${REPO_PATH}/cmd/agent
+cp -r ./pkg/py/dist/ ${BIN_PATH}/dist/
diff --git a/agentmain/main.go b/cmd/agent/app/main.go
similarity index 77%
rename from agentmain/main.go
rename to cmd/agent/app/main.go
index 3c4a64a4b2f4..574569de7171 100644
--- a/agentmain/main.go
+++ b/cmd/agent/app/main.go
@@ -1,18 +1,19 @@
 package ddagentmain
 
 import (
+	"fmt"
 	"time"
 
 	"github.com/DataDog/datadog-agent/aggregator"
-	"github.com/DataDog/datadog-agent/checks"
-	"github.com/DataDog/datadog-agent/checks/system"
-	"github.com/DataDog/datadog-agent/py"
+	"github.com/DataDog/datadog-agent/pkg/checks"
+	"github.com/DataDog/datadog-agent/pkg/checks/system"
+	"github.com/DataDog/datadog-agent/pkg/py"
+	"github.com/kardianos/osext"
 	"github.com/op/go-logging"
 	"github.com/sbinet/go-python"
 )
 
 const AGENT_VERSION = "6.0.0"
-const confdPath = "py/conf.d"
 
 var log = logging.MustGetLogger("datadog-agent")
 
@@ -44,20 +45,23 @@ func Start() {
 		panic(err.Error())
 	}
 	// Set the PYTHONPATH
+	here, _ := osext.ExecutableFolder()
+	distPath := fmt.Sprintf("%s/dist", here)
+	confdPath := fmt.Sprintf("%s/conf.d", distPath)
 	path := python.PySys_GetObject("path")
-	python.PyList_Append(path, python.PyString_FromString("py"))
+	python.PyList_Append(path, python.PyString_FromString(distPath))
 
 	// `python.Initialize` acquires the GIL but we don't need it, let's release it
 	state := python.PyEval_SaveThread()
 
 	// for now, only Python needs it, build and pass it on the fly
-	aggregator.InitApi(aggregator.NewUnbufferedAggregator())
+	py.InitApi(aggregator.NewUnbufferedAggregator())
 
 	// Get a single Runner instance, i.e.
we process checks sequentially go checks.Runner(pending) // Get a list of Python checks we want to run - checksNames := []string{"checks.directory", "checks.go_expvar", "checks.process"} + checksNames := []string{"checks.go_expvar"} // Search for and import all the desired Python checks checks := py.CollectChecks(checksNames, confdPath) diff --git a/main.go b/cmd/agent/main.go similarity index 78% rename from main.go rename to cmd/agent/main.go index 4cde25a7c519..49c3e9f9b5db 100644 --- a/main.go +++ b/cmd/agent/main.go @@ -5,7 +5,7 @@ import ( "net/http" _ "net/http/pprof" - "github.com/DataDog/datadog-agent/agentmain" + "github.com/DataDog/datadog-agent/cmd/agent/app" ) func main() { diff --git a/glide.yaml b/glide.yaml index e085a254ab95..1a981ea1e798 100644 --- a/glide.yaml +++ b/glide.yaml @@ -1,10 +1,13 @@ package: github.com/DataDog/datadog-agent -homepage: https://github.com/DataDog/datadog-agent -license: MIT import: -- package: github.com/sbinet/go-python +- package: github.com/DataDog/datadog-go + subpackages: + - statsd +- package: github.com/mitchellh/reflectwalk - package: github.com/op/go-logging +- package: github.com/sbinet/go-python - package: github.com/shirou/gopsutil subpackages: - - /mem + - mem - package: gopkg.in/yaml.v2 +- package: github.com/kardianos/osext diff --git a/models/check.go b/models/check.go deleted file mode 100644 index 98edfbb56d39..000000000000 --- a/models/check.go +++ /dev/null @@ -1,9 +0,0 @@ -package models - -import ( - "github.com/DataDog/datadog-agent/aggregator" -) - -type Check interface { - Check(agg *aggregator.Aggregator) -} diff --git a/omnibus/Berksfile b/omnibus/Berksfile deleted file mode 100644 index 76505fd89194..000000000000 --- a/omnibus/Berksfile +++ /dev/null @@ -1,6 +0,0 @@ -source 'https://api.berkshelf.com' - -cookbook "omnibus", "~> 1.2.2" -cookbook "yum", "< 3.0.0" -cookbook "golang", "~> 1.4.0" -# cookbook "mercurial", "~> 2.0.0" diff --git a/omnibus/Berksfile.lock b/omnibus/Berksfile.lock deleted file mode 100644 index f132129e48c6..000000000000 --- a/omnibus/Berksfile.lock +++ /dev/null @@ -1,47 +0,0 @@ -DEPENDENCIES - golang (~> 1.4.0) - omnibus (~> 1.2.2) - yum (< 3.0.0) - -GRAPH - 7-zip (1.0.2) - windows (>= 1.2.2) - apt (1.9.2) - build-essential (1.4.4) - chef_handler (1.1.6) - dmg (2.2.0) - git (2.3.0) - build-essential (>= 0.0.0) - dmg (>= 0.0.0) - runit (~> 1.0) - windows (>= 0.0.0) - yum (>= 0.0.0) - golang (1.4.0) - homebrew (1.3.2) - ohai (2.0.1) - omnibus (1.2.2) - 7-zip (~> 1.0.0) - apt (~> 1.9.0) - build-essential (~> 1.4.0) - git (~> 2.3.0) - homebrew (~> 1.3.2) - pkgin (~> 0.4.0) - pkgutil (~> 0.0.3) - rbenv (= 1.6.5) - windows (~> 1.8.8) - wix (~> 1.1.0) - yum (~> 2.2.0) - pkgin (0.4.0) - pkgutil (0.0.3) - rbenv (1.6.5) - apt (>= 0.0.0) - build-essential (>= 0.0.0) - git (>= 0.0.0) - ohai (>= 1.1) - runit (1.0.6) - build-essential (>= 0.0.0) - windows (1.8.10) - chef_handler (>= 0.0.0) - wix (1.1.0) - windows (>= 1.2.2) - yum (2.2.4) diff --git a/omnibus/DEPENDENCY_TREE.txt b/omnibus/DEPENDENCY_TREE.txt deleted file mode 100644 index b4bc76a257a4..000000000000 --- a/omnibus/DEPENDENCY_TREE.txt +++ /dev/null @@ -1,66 +0,0 @@ -Legend (W, M, L = windows / mac / linux only) - -adodbapi -boto -** bzip2 -** cacerts -*** cmake -** curl -datadog-gohai -* futures -** gdbm -gui M W -* guidata M W -**** homebrew -httplib2 -kafka-python M L -kazoo -** libedit -* libffi -** libgcc -**** libiconv -* libpq -** libsqlite3 -** libtool -* libyaml -** makedepend -* ncurses -ntplib -* openssl -paramiko -pg8000 -* pip -*** 
pkg-config -preparation -procps-ng L -psutil -psycopg2 -pycrypto -* pycurl -pymongo -pymysql -pyopenssl -pysnmp -* pysnmp-mibs -* pyro4 -** pyside -* python -python-gearman -python-memcached -python-redis -python-rrdtool -pyyaml -pyvmomo -*** qtorequests -** setuptools -simplejson -snakebite -* spyderlib M W -supervisor M L -sysstat L -tornado -*** util-macros -uuid -version-manifest -*** xproto -zlib (M L ?) diff --git a/omnibus/Gemfile b/omnibus/Gemfile deleted file mode 100644 index c1e1b19aa15e..000000000000 --- a/omnibus/Gemfile +++ /dev/null @@ -1,4 +0,0 @@ -source 'https://rubygems.org' -gem 'omnibus', git: 'git://github.com/datadog/omnibus-ruby.git', branch: 'datadog-4.0.0' -gem 'omnibus-software', git: 'git://github.com/datadog/omnibus-software.git', branch: ENV['OMNIBUS_SOFTWARE_BRANCH'] -gem 'httparty' diff --git a/omnibus/README.md b/omnibus/README.md deleted file mode 100644 index 6bfe7d58babc..000000000000 --- a/omnibus/README.md +++ /dev/null @@ -1,75 +0,0 @@ -Datadog Agent - Omnibus Project -================ - -This is an [Omnibus](https://github.com/opscode/omnibus) project to build the Datadog Agent packages. - -It's using a [fork](https://github.com/chef/omnibus/compare/v4.0.0...DataDog:datadog-4.0.0) of the official 4.0.0 release of the Omnibus project. - -Builds are run in docker containers with Circleci. -See: -* https://github.com/DataDog/docker-dd-agent-build-deb-i386 -* https://github.com/DataDog/docker-dd-agent-build-rpm-i386 -* https://github.com/DataDog/docker-dd-agent-build-deb-x64 -* https://github.com/DataDog/docker-dd-agent-build-rpm-x64 - - -## Build a package locally - -* Install Docker - -* Run the following script with the desired parameters - -```bash -PLATFORM="deb-x64" # must be in "deb-x64", "deb-i386", "rpm-x64", "rpm-i386" -AGENT_BRANCH="master" # Branch of dd-agent repo to use, default "master" -OMNIBUS_BRANCH="master" # Branch of dd-agent-omnibus repo to use, default "master" -AGENT_VERSION="5.4.0" # default to the latest tag on that branch -LOG_LEVEL="debug" # default to "info" -LOCAL_AGENT_REPO="~/dd-agent" # Path to a local repo of the agent to build from. Defaut is not set and the build will be done against the github repo - -# The passphrase of the key you want to use to sign your .rpm package (if -# building an RPM package). If you don't set this variable, the RPM won't be -# signed but the build should succeed. Note that you must also mount a volume -# under /keys and bind it to a folder containing an RPM-SIGNING-KEY.private -# file containing your exported signing key. Finally, be aware that the -# package_maintainer DSL defined in config/projects/datadog_agent.rb and the -# full key name (My Name (comments) ) must match. 
-RPM_SIGNING_PASSPHRASE="my_super_secret_passphrase" - -mkdir -p pkg -mkdir -p "cache/$PLATFORM" -docker run --name "dd-agent-build-$PLATFORM" \ - -e OMNIBUS_BRANCH=$OMNIBUS_BRANCH \ - -e LOG_LEVEL=$LOG_LEVEL \ - -e AGENT_BRANCH=$AGENT_BRANCH \ - -e AGENT_VERSION=$AGENT_VERSION \ - -e RPM_SIGNING_PASSPHRASE=$RPM_SIGNING_PASSPHRASE \ - -e $LOCAL_AGENT_REPO=/dd-agent-repo # Only to use if you want to build from a local repo \ - -v `pwd`/pkg:/dd-agent-omnibus/pkg \ - -v `pwd`/keys:/keys \ - -v "`pwd`/cache/$PLATFORM:/var/cache/omnibus" \ - -v $LOCAL_AGENT_REPO:/dd-agent-repo # Only to use if you want to build from a local repo \ - "datadog/docker-dd-agent-build-$PLATFORM" - -# Cleanup (necessary to launch another build) -docker rm dd-agent-build-$PLATFORM -``` - -## Build on Mac OS X - -The Mac build platform should have: - -* Xcode installed (type `git` in a terminal), -* [Go](http://golang.org/dl/) installed, -* sudoer rights for the build user, -* Bundler installed: `sudo gem install bundler`, -* Important directories created: `sudo mkdir -p /var/cache/omnibus /opt/datadog-agent`, -* Owned by the right user: `sudo chown $USER:nogroup /var/cache/omnibus /opt/datadog-agent`. -* Xcode license accepted (to sign package) `sudo xcodebuild -license` -* Datadog signing key - -Then run: -```bash -AGENT_BRANCH= OMNIBUS_BRANCH= OMNIBUS_SOFTWARE_BRANCH= sh omnibus_build.sh -``` -The env vars have the same meaning as the Dockerized build above. Omitting them will cause the default of `master` to be used for all 3 diff --git a/omnibus/Vagrantfile b/omnibus/Vagrantfile deleted file mode 100644 index d03272579999..000000000000 --- a/omnibus/Vagrantfile +++ /dev/null @@ -1,122 +0,0 @@ -# -*- mode: ruby -*- -# vi: set ft=ruby : - -require "vagrant" - -if Vagrant::VERSION < "1.2.1" - raise "The Omnibus Build Lab is only compatible with Vagrant 1.2.1+" -end - -host_project_path = File.expand_path("..", __FILE__) -guest_project_path = "/home/vagrant/#{File.basename(host_project_path)}" -project_name = "datadog-agent" - -Vagrant.configure("2") do |config| - - config.vm.hostname = "#{project_name}-omnibus-build-lab.com" - - # Let's cache stuff to reduce build time using vagrant-cachier - # Require vagrant-cachier plugin - config.cache.scope = :box - - vms_to_use = { - 'ubuntu-i386' => 'ubuntu-10.04-i386', - 'ubuntu-x64' => 'ubuntu-10.04', - 'debian-i386' => 'debian-6.0.8-i386', - 'debian-x64' => 'debian-6.0.8', - 'fedora-i386' => 'fedora-19-i386', - 'fedora-x64' => 'fedora-19', - 'centos-i386' => 'centos-5.10-i386', - 'centos-x64' => 'centos-5.10', - } - - vms_to_use.each_pair do |key, platform| - - config.vm.define key do |c| - c.vm.box = "opscode-#{platform}" - c.vm.box_url = "http://opscode-vm-bento.s3.amazonaws.com/vagrant/virtualbox/opscode_#{platform}_chef-provisionerless.box" - end - - end - - config.vm.provider :virtualbox do |vb| - # Give enough horsepower to build without taking all day. - vb.customize [ - "modifyvm", :id, - "--memory", "3072", - "--cpus", "3", - "--ioapic", "on" # Required for the centos-5-32 bits to boot - ] - end - - # Ensure a recent version of the Chef Omnibus packages are installed - config.omnibus.chef_version = "11.16.4" - - # Enable the berkshelf-vagrant plugin - config.berkshelf.enabled = true - # The path to the Berksfile to use with Vagrant Berkshelf - config.berkshelf.berksfile_path = "./Berksfile" - - config.ssh.forward_agent = true - - # Mount omnibus to have the builder code! 
- current_dir = File.expand_path('..', __FILE__) - config.vm.synced_folder current_dir, '/home/vagrant/dd-agent-omnibus' - # Mount local agent repo if asked to - if ENV['LOCAL_AGENT_REPO'] - config.vm.synced_folder ENV['LOCAL_AGENT_REPO'], '/home/vagrant/dd-agent' - # For the VM replace by the new path where we mounted it - ENV['LOCAL_AGENT_REPO'] = '/home/vagrant/dd-agent' - end - - # prepare VM to be an Omnibus builder - config.vm.provision :chef_solo do |chef| - chef.custom_config_path = "Vagrantfile.chef" - chef.json = { - "omnibus" => { - "build_user" => "vagrant", - "build_dir" => guest_project_path, - "install_dir" => "/opt/#{project_name}" - }, - "go" => { - "version" => "1.2.2", - "scm" => false - }, - } - - chef.run_list = [ - "recipe[omnibus::default]", - "recipe[golang]" - ] - end - - # Export the defaults we need to run the scripts - # No better way of passing args in the VM :/ - profile_file = "/etc/profile.d/vagrant.sh" - env_variables_script = < #{profile_file} -ENVSCRIPT - env_variables_passthru = %w[ - AGENT_BRANCH - AGENT_VERSION - DISTRO - LOCAL_AGENT_REPO - LOG_LEVEL - S3_OMNIBUS_BUCKET - S3_ACCESS_KEY - S3_SECRET_KEY - ] - env_variables_passthru.each do |var| - env_variables_script += "\necho export #{var}=#{ENV[var]} >> #{profile_file}" - end - config.vm.provision 'shell', inline: env_variables_script - - # Do the real work, build it! - config.vm.provision 'shell', path: 'omnibus_build.sh' - - if ENV['CLEAR_CACHE'] == "true" - config.vm.provision "shell", - inline: "echo Clearing Omnibus cache && rm -rf /var/cache/omnibus/*" - end -end diff --git a/omnibus/Vagrantfile.chef b/omnibus/Vagrantfile.chef deleted file mode 100644 index dc06693d10a6..000000000000 --- a/omnibus/Vagrantfile.chef +++ /dev/null @@ -1 +0,0 @@ -Chef::Config.ssl_verify_mode = :verify_peer \ No newline at end of file diff --git a/omnibus/bin/bundler b/omnibus/bin/bundler deleted file mode 100755 index 72c62ec0b5d3..000000000000 --- a/omnibus/bin/bundler +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env ruby -# -# This file was generated by Bundler. -# -# The application 'bundler' is installed as part of a gem, and -# this file is here to facilitate running it. -# - -require 'pathname' -ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile", - Pathname.new(__FILE__).realpath) - -require 'rubygems' -require 'bundler/setup' - -load Gem.bin_path('bundler', 'bundler') diff --git a/omnibus/bin/ffi-yajl-bench b/omnibus/bin/ffi-yajl-bench deleted file mode 100755 index af15547ea219..000000000000 --- a/omnibus/bin/ffi-yajl-bench +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env ruby -# -# This file was generated by Bundler. -# -# The application 'ffi-yajl-bench' is installed as part of a gem, and -# this file is here to facilitate running it. -# - -require 'pathname' -ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile", - Pathname.new(__FILE__).realpath) - -require 'rubygems' -require 'bundler/setup' - -load Gem.bin_path('ffi-yajl', 'ffi-yajl-bench') diff --git a/omnibus/bin/fpm b/omnibus/bin/fpm deleted file mode 100755 index 606005397dff..000000000000 --- a/omnibus/bin/fpm +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env ruby -# -# This file was generated by Bundler. -# -# The application 'fpm' is installed as part of a gem, and -# this file is here to facilitate running it. 
-# - -require 'pathname' -ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile", - Pathname.new(__FILE__).realpath) - -require 'rubygems' -require 'bundler/setup' - -load Gem.bin_path('fpm', 'fpm') diff --git a/omnibus/bin/ohai b/omnibus/bin/ohai deleted file mode 100755 index 3990a9df62a7..000000000000 --- a/omnibus/bin/ohai +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env ruby -# -# This file was generated by Bundler. -# -# The application 'ohai' is installed as part of a gem, and -# this file is here to facilitate running it. -# - -require 'pathname' -ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile", - Pathname.new(__FILE__).realpath) - -require 'rubygems' -require 'bundler/setup' - -load Gem.bin_path('ohai', 'ohai') diff --git a/omnibus/bin/omnibus b/omnibus/bin/omnibus deleted file mode 100755 index e3f3899d7934..000000000000 --- a/omnibus/bin/omnibus +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env ruby -# -# This file was generated by Bundler. -# -# The application 'omnibus' is installed as part of a gem, and -# this file is here to facilitate running it. -# - -require 'pathname' -ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile", - Pathname.new(__FILE__).realpath) - -require 'rubygems' -require 'bundler/setup' - -load Gem.bin_path('omnibus', 'omnibus') diff --git a/omnibus/bin/rake b/omnibus/bin/rake deleted file mode 100755 index 26c7a2d5b5f1..000000000000 --- a/omnibus/bin/rake +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env ruby -# -# This file was generated by Bundler. -# -# The application 'rake' is installed as part of a gem, and -# this file is here to facilitate running it. -# - -require 'pathname' -ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile", - Pathname.new(__FILE__).realpath) - -require 'rubygems' -require 'bundler/setup' - -load Gem.bin_path('rake', 'rake') diff --git a/omnibus/bin/rubygems-cabin-test b/omnibus/bin/rubygems-cabin-test deleted file mode 100755 index 3b151505993c..000000000000 --- a/omnibus/bin/rubygems-cabin-test +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env ruby -# -# This file was generated by Bundler. -# -# The application 'rubygems-cabin-test' is installed as part of a gem, and -# this file is here to facilitate running it. -# - -require 'pathname' -ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile", - Pathname.new(__FILE__).realpath) - -require 'rubygems' -require 'bundler/setup' - -load Gem.bin_path('cabin', 'rubygems-cabin-test') diff --git a/omnibus/bin/thor b/omnibus/bin/thor deleted file mode 100755 index 8421e001eac7..000000000000 --- a/omnibus/bin/thor +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env ruby -# -# This file was generated by Bundler. -# -# The application 'thor' is installed as part of a gem, and -# this file is here to facilitate running it. 
-# - -require 'pathname' -ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile", - Pathname.new(__FILE__).realpath) - -require 'rubygems' -require 'bundler/setup' - -load Gem.bin_path('thor', 'thor') diff --git a/omnibus/config/projects/datadog-agent.rb b/omnibus/config/projects/datadog-agent.rb deleted file mode 100644 index 3dbffe9530b5..000000000000 --- a/omnibus/config/projects/datadog-agent.rb +++ /dev/null @@ -1,168 +0,0 @@ -require "./lib/ostools.rb" - -name 'datadog-agent' -maintainer 'Datadog Packages ' -homepage 'http://www.datadoghq.com' -install_dir '/opt/datadog-agent' - -build_version do - source :git, from_dependency: 'datadog-agent' - output_format :dd_agent_format -end - -build_iteration 1 - -description 'Datadog Monitoring Agent - The Datadog Monitoring Agent is a lightweight process that monitors system - processes and services, and sends information back to your Datadog account. - . - This package installs and runs the advanced Agent daemon, which queues and - forwards metrics from your applications as well as system services. - . - See http://www.datadoghq.com/ for more information -' - -# ------------------------------------ -# Generic package information -# ------------------------------------ - -# .deb specific flags -package :deb do - vendor 'Datadog ' - epoch 1 - license 'Simplified BSD License' - section 'utils' - priority 'extra' -end - -# .rpm specific flags -package :rpm do - vendor 'Datadog ' - epoch 1 - license 'Simplified BSD License' - category 'System Environment/Daemons' - priority 'extra' - if ENV.has_key?('RPM_SIGNING_PASSPHRASE') and not ENV['RPM_SIGNING_PASSPHRASE'].empty? - signing_passphrase "#{ENV['RPM_SIGNING_PASSPHRASE']}" - end -end - -# OSX .pkg specific flags -package :pkg do - identifier 'com.datadoghq.agent' - signing_identity 'Developer ID Installer: Datadog, Inc. (JKFCB4CN7C)' -end -compress :dmg do - window_bounds '200, 200, 750, 600' - pkg_position '10, 10' -end - -# Note: this is to try to avoid issues when upgrading from an -# old version of the agent which shipped also a datadog-agent-base -# package. -if redhat? - replace 'datadog-agent-base < 5.0.0' - replace 'datadog-agent-lib < 5.0.0' -elsif debian? - replace 'datadog-agent-base (<< 5.0.0)' - replace 'datadog-agent-lib (<< 5.0.0)' - conflict 'datadog-agent-base (<< 5.0.0)' -end - -# ------------------------------------ -# OS specific DSLs and dependencies -# ------------------------------------ - -# Linux -if linux? - # Debian - if debian? - extra_package_file '/lib/systemd/system/datadog-agent.service' - end - - # SysVInit service file - if redhat? - extra_package_file '/etc/rc.d/init.d/datadog-agent' - else - extra_package_file '/etc/init.d/datadog-agent' - end - - # Supervisord config file for the agent - extra_package_file '/etc/dd-agent/supervisor.conf' - - # Example configuration files for the agent and the checks - extra_package_file '/etc/dd-agent/datadog.conf.example' - extra_package_file '/etc/dd-agent/conf.d' - - # Custom checks directory - extra_package_file '/etc/dd-agent/checks.d' - - # Just a dummy file that needs to be in the RPM package list if we don't want it to be removed - # during RPM upgrades. (the old files from the RPM file listthat are not in the new RPM file - # list will get removed, that's why we need this one here) - extra_package_file '/usr/bin/dd-agent' - - # Linux-specific dependencies - dependency 'procps-ng' - dependency 'sysstat' -end - -# Mac and Windows -if osx? or windows? 
- dependency 'gui' -end - -# ------------------------------------ -# Dependencies -# ------------------------------------ - -# creates required build directories -dependency 'preparation' - -# Agent dependencies -dependency 'boto' -dependency 'docker-py' -dependency 'ntplib' -dependency 'pycrypto' -dependency 'pyopenssl' -dependency 'pyyaml' -dependency 'simplejson' -dependency 'supervisor' -dependency 'tornado' -dependency 'uptime' -dependency 'uuid' -dependency 'zlib' - -# Check dependencies -dependency 'adodbapi' -dependency 'httplib2' -dependency 'kafka-python' -dependency 'kazoo' -dependency 'paramiko' -dependency 'pg8000' -dependency 'psutil' -dependency 'psycopg2' -dependency 'pymongo' -dependency 'pymysql' -dependency 'pysnmp' -dependency 'python-gearman' -dependency 'python-memcached' -dependency 'python-redis' -dependency 'python-rrdtool' -dependency 'pyvmomi' -dependency 'requests' -dependency 'snakebite' - -# Datadog gohai is built last before dataadog agent since it should always -# be rebuilt (if put above, it would dirty the cache of the dependencies below -# and trigger a useless rebuild of many packages) -dependency 'datadog-gohai' - -# Datadog agent -dependency 'datadog-agent' - -# version manifest file -dependency 'version-manifest' - -exclude '\.git*' -exclude 'bundler\/git' diff --git a/omnibus/config/software/datadog-agent.rb b/omnibus/config/software/datadog-agent.rb deleted file mode 100644 index 5e4979bd7192..000000000000 --- a/omnibus/config/software/datadog-agent.rb +++ /dev/null @@ -1,127 +0,0 @@ -require './lib/ostools.rb' - -name 'datadog-agent' - -local_agent_repo = ENV['LOCAL_AGENT_REPO'] -if local_agent_repo.nil? || local_agent_repo.empty? - source git: 'https://github.com/DataDog/dd-agent.git' -else - # For local development - source path: ENV['LOCAL_AGENT_REPO'] -end - -agent_branch = ENV['AGENT_BRANCH'] -if agent_branch.nil? || agent_branch.empty? - default_version 'master' -else - default_version agent_branch -end - -relative_path 'dd-agent' - -build do - ship_license 'https://raw.githubusercontent.com/DataDog/dd-agent/master/LICENSE' - # Agent code - mkdir "#{install_dir}/agent/" - copy 'checks.d', "#{install_dir}/agent/" - copy 'checks', "#{install_dir}/agent/" - copy 'dogstream', "#{install_dir}/agent/" - copy 'resources', "#{install_dir}/agent/" - copy 'utils', "#{install_dir}/agent/" - command "cp *.py #{install_dir}/agent/" - copy 'datadog-cert.pem', "#{install_dir}/agent/" - - mkdir "#{install_dir}/run/" - - - if linux? - # Configuration files - mkdir '/etc/dd-agent' - if ohai['platform_family'] == 'rhel' - copy 'packaging/centos/datadog-agent.init', '/etc/rc.d/init.d/datadog-agent' - elsif ohai['platform_family'] == 'debian' - copy 'packaging/debian/datadog-agent.init', '/etc/init.d/datadog-agent' - mkdir '/lib/systemd/system' - copy 'packaging/debian/datadog-agent.service', '/lib/systemd/system/datadog-agent.service' - copy 'packaging/debian/start_agent.sh', '/opt/datadog-agent/bin/start_agent.sh' - command 'chmod 755 /opt/datadog-agent/bin/start_agent.sh' - end - copy 'packaging/supervisor.conf', '/etc/dd-agent/supervisor.conf' - copy 'datadog.conf.example', '/etc/dd-agent/datadog.conf.example' - copy 'conf.d', '/etc/dd-agent/' - mkdir '/etc/dd-agent/checks.d/' - command 'chmod 755 /etc/init.d/datadog-agent' - touch '/usr/bin/dd-agent' - end - - if osx? 
- env = { - 'PATH' => "#{install_dir}/embedded/bin/:#{ENV['PATH']}" - } - - app_temp_dir = "#{install_dir}/agent/dist/Datadog Agent.app/Contents" - app_temp_dir_escaped = "#{install_dir}/agent/dist/Datadog\\ Agent.app/Contents" - pyside_build_dir = "#{install_dir}/agent/build/bdist.macosx-10.5-intel/python2.7-standalone/app/collect/PySide" - command_fix_shiboken = 'install_name_tool -change @rpath/libshiboken-python2.7.1.2.dylib'\ - ' @executable_path/../Frameworks/libshiboken-python2.7.1.2.dylib ' - command_fix_pyside = 'install_name_tool -change @rpath/libpyside-python2.7.1.2.dylib'\ - ' @executable_path/../Frameworks/libpyside-python2.7.1.2.dylib ' - - # Command line tool - copy 'packaging/osx/datadog-agent', "#{install_dir}/bin" - command "chmod 755 #{install_dir}/bin/datadog-agent" - - # GUI - copy 'packaging/datadog-agent/win32/install_files/guidata/images', "#{install_dir}/agent" - copy 'win32/gui.py', "#{install_dir}/agent" - copy 'win32/status.html', "#{install_dir}/agent" - mkdir "#{install_dir}/agent/packaging" - copy 'packaging/osx/app/*', "#{install_dir}/agent/packaging" - - command "cd #{install_dir}/agent && "\ - "#{install_dir}/embedded/bin/python #{install_dir}/agent/setup.py py2app"\ - ' && cd -', env: env - # Time to patch the install, see py2app bug: (dependencies to system PySide) - # https://bitbucket.org/ronaldoussoren/py2app/issue/143/resulting-app-mistakenly-looks-for-pyside - copy "#{pyside_build_dir}/libshiboken-python2.7.1.2.dylib", "#{app_temp_dir}/Frameworks/libshiboken-python2.7.1.2.dylib" - copy "#{pyside_build_dir}/libpyside-python2.7.1.2.dylib", "#{app_temp_dir}/Frameworks/libpyside-python2.7.1.2.dylib" - - command "chmod a+x #{app_temp_dir_escaped}/Frameworks/{libpyside,libshiboken}-python2.7.1.2.dylib" - command "#{command_fix_shiboken} #{app_temp_dir_escaped}/Frameworks/libpyside-python2.7.1.2.dylib" - command 'install_name_tool -change /usr/local/lib/QtCore.framework/Versions/4/QtCore '\ - '@executable_path/../Frameworks/QtCore.framework/Versions/4/QtCore '\ - "#{app_temp_dir_escaped}/Frameworks/libpyside-python2.7.1.2.dylib" - command "#{command_fix_shiboken} #{app_temp_dir_escaped}/Resources/lib/python2.7/lib-dynload/PySide/QtCore.so" - command "#{command_fix_shiboken} #{app_temp_dir_escaped}/Resources/lib/python2.7/lib-dynload/PySide/QtGui.so" - command "#{command_fix_pyside} #{app_temp_dir_escaped}/Resources/lib/python2.7/lib-dynload/PySide/QtCore.so" - command "#{command_fix_pyside} #{app_temp_dir_escaped}/Resources/lib/python2.7/lib-dynload/PySide/QtGui.so" - - # And finally - command "cp -Rf #{install_dir}/agent/dist/Datadog\\ Agent.app #{install_dir}" - - # Clean GUI related things - %w(build dist images gui.py status.html packaging Datadog_Agent.egg-info).each do |file| - delete "#{install_dir}/agent/#{file}" - end - %w(py2app macholib modulegraph altgraph).each do |package| - command "yes | #{install_dir}/embedded/bin/pip uninstall #{package}" - end - %w(pyside guidata spyderlib).each do |dependency_name| - # Installed with `python setup.py install`, needs to be uninstalled manually - command "cat #{install_dir}/embedded/#{dependency_name}-files.txt | xargs rm -rf \"{}\"" - delete "#{install_dir}/embedded/#{dependency_name}-files.txt" - end - - # conf - mkdir "#{install_dir}/etc" - command "grep -v 'user=dd-agent' packaging/supervisor.conf > #{install_dir}/etc/supervisor.conf" - copy 'datadog.conf.example', "#{install_dir}/etc/datadog.conf.example" - command "cp -R conf.d #{install_dir}/etc/" - copy 
'packaging/osx/com.datadoghq.Agent.plist.example', "#{install_dir}/etc/" - end - - # The file below is touched by software builds that don't put anything in the installation - # directory (libgcc right now) so that the git_cache gets updated let's remove it from the - # final package - delete "#{install_dir}/uselessfile" -end diff --git a/omnibus/lib/ostools.rb b/omnibus/lib/ostools.rb deleted file mode 100644 index 8f447dd4647e..000000000000 --- a/omnibus/lib/ostools.rb +++ /dev/null @@ -1,22 +0,0 @@ -# ------------------------------------ -# OS-detection helper functions -# ------------------------------------ -def linux?() - return %w(rhel debian fedora suse gentoo slackware arch exherbo).include? ohai['platform_family'] -end - -def redhat?() - return %w(rhel fedora suse).include? ohai['platform_family'] -end - -def debian?() - return ohai['platform_family'] == 'debian' -end - -def osx?() - return ohai['platform_family'] == 'mac_os_x' -end - -def windows?() - return ohai['platform_family'] == 'windows' -end diff --git a/omnibus/omnibus.rb b/omnibus/omnibus.rb deleted file mode 100644 index 170096a82714..000000000000 --- a/omnibus/omnibus.rb +++ /dev/null @@ -1,14 +0,0 @@ -bucket = ENV['S3_OMNIBUS_BUCKET'] - -append_timestamp false - -if bucket.nil? || bucket.empty? - use_s3_caching false -else - s3_access_key ENV['S3_ACCESS_KEY'] - s3_secret_key ENV['S3_SECRET_KEY'] - s3_bucket ENV['S3_OMNIBUS_BUCKET'] - use_s3_caching true -end - -append_timestamp false diff --git a/omnibus/omnibus_build.sh b/omnibus/omnibus_build.sh deleted file mode 100644 index c223b49fa729..000000000000 --- a/omnibus/omnibus_build.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash -e - -########################### -# -# WARNING: You need to rebuild the docker images if you do any changes to this file -# -############################ - -PROJECT_DIR=dd-agent-omnibus -PROJECT_NAME=datadog-agent -LOG_LEVEL=${LOG_LEVEL:-"info"} -OMNIBUS_BRANCH=${OMNIBUS_BRANCH:-"master"} -OMNIBUS_SOFTWARE_BRANCH=${OMNIBUS_SOFTWARE_BRANCH:-"master"} - -# Clean up omnibus artifacts -rm -rf /var/cache/omnibus/pkg/* - -# Clean up what we installed -rm -f /etc/init.d/datadog-agent -rm -rf /etc/dd-agent -rm -rf /opt/$PROJECT_NAME/* - -cd $PROJECT_DIR -# Allow to use a different dd-agent-omnibus branch -git fetch --all -git checkout $OMNIBUS_BRANCH -git reset --hard origin/$OMNIBUS_BRANCH - -# If an RPM_SIGNING_PASSPHRASE has been passed, let's import the signing key -if [ -n "$RPM_SIGNING_PASSPHRASE" ]; then - gpg --import /keys/RPM-SIGNING-KEY.private -fi - -# Last but not least, let's make sure that we rebuild the agent everytime because -# the extra package files are destroyed when the build container stops (we have -# to tweak omnibus-git-cache directly for that). Same for gohai. 
-git --git-dir=/var/cache/omnibus/cache/git_cache/opt/datadog-agent tag -d `git --git-dir=/var/cache/omnibus/cache/git_cache/opt/datadog-agent tag -l | grep datadog-agent` -git --git-dir=/var/cache/omnibus/cache/git_cache/opt/datadog-agent tag -d `git --git-dir=/var/cache/omnibus/cache/git_cache/opt/datadog-agent tag -l | grep datadog-gohai` - -# Install the gems we need, with stubs in bin/ -bundle update # Make sure to update to the latest version of omnibus-software -bin/omnibus build -l=$LOG_LEVEL $PROJECT_NAME diff --git a/omnibus/package-scripts/datadog-agent/README.md b/omnibus/package-scripts/datadog-agent/README.md deleted file mode 100644 index 8c1d30331907..000000000000 --- a/omnibus/package-scripts/datadog-agent/README.md +++ /dev/null @@ -1,25 +0,0 @@ -The order in which these script are executed varies between APT and YUM which can lead to some -rather sneaky bugs. Here's the standard order for updates: - -APT (source https://debian-handbook.info/browse/stable/sect.package-meta-information.html): -------------------------------------------------------------------------------------------- -* `prerm` script of the old package (with arguments: `upgrade `) -* `preinst` script of the new package (with arguments: `upgrade `) -* New files get unpacked based on the file list embedded in the `.deb` package -* `postrm` script from the old package (with arguments `upgrade `) -* `dpkg` updates the files list, removes the files that don't exist anymore, etc. -* `postinst` of the new script is run (with arguments `configure ` - -YUM (source: various Stackoverflow posts + local experiments): --------------------------------------------------------------- - -* `pretrans` of new package -* `preinst` of new package` -* Files in the list get copied -* `prerm` of old package -* Files in the old package file list that are not in the new one's get removed -* `postrm` of the old package gets run -* `posttrans` of the old package is ran - -One thing to notice is that if you remove files or other components in the `postrm` script, -updates won't work as expected with YUM. diff --git a/omnibus/package-scripts/datadog-agent/postinst b/omnibus/package-scripts/datadog-agent/postinst deleted file mode 100755 index c27959a23ead..000000000000 --- a/omnibus/package-scripts/datadog-agent/postinst +++ /dev/null @@ -1,219 +0,0 @@ -#!/bin/sh - -INSTALL_DIR=/opt/datadog-agent -LOG_DIR=/var/log/datadog -RUN_DIR=$INSTALL_DIR/run - -DISTRIBUTION=$(grep -Eo "(Debian|Ubuntu|RedHat|CentOS|openSUSE|Amazon)" /etc/issue 2>/dev/null || uname -s) - -error_exit() -{ - echo "${PROGNAME}: ${1:-"Unknown Error"}" 1>&2 - exit 1 -} - -mkdir -p ${LOG_DIR} || error_exit "Cannot create ${LOG_DIR}!" - -# If we are inside the Docker container, do nothing -if [ -n "$DOCKER_DD_AGENT" ]; then - echo "Installation from docker-dd-agent, nothing to do in postinst" - exit 0 -fi - -# Linux installation -if [ "$DISTRIBUTION" != "Darwin" ]; then - # Linus specific variables - CONFIG_DIR=/etc/dd-agent - - # Since we now package our own supervisor config, we no longer want - # the old config to be loaded. Since supervisor automatically loads - # configs in conf.d, we have to delete the old config file. 
- if [ -f "/etc/supervisor/conf.d/ddagent.conf" ]; then - echo "Removing old configuration from system supervisord" - rm /etc/supervisor/conf.d/ddagent.conf - fi - - if [ -f "/etc/debian_version" ] || [ "$DISTRIBUTION" == "Debian" ] || [ "$DISTRIBUTION" == "Ubuntu" ]; then - set -e - case "$1" in - configure) - echo "Registering service datadog-agent" - update-rc.d datadog-agent defaults - echo "Enabling service datadog-agent" - update-rc.d datadog-agent enable >/dev/null 2>&1 - # Only add dd-agent user if it doesn't already exist - set +e - id -u dd-agent >/dev/null 2>&1 - USER_EXISTS=$? - set -e - if [ ! $USER_EXISTS -eq 0 ]; then - echo "Creating dd-agent user" - adduser --system dd-agent --disabled-login --shell /bin/sh --no-create-home --quiet - usermod -d /opt/datadog-agent dd-agent - fi - set +e - ;; - abort-upgrade|abort-remove|abort-deconfigure) - ;; - - *) - ;; - esac - #DEBHELPER# - fi - - # Set proper rights to the dd-agent user - chown -R dd-agent:root ${CONFIG_DIR} - chown -R dd-agent:root ${LOG_DIR} - chown root:root /etc/init.d/datadog-agent - chown -R root:root /opt/datadog-agent - chown -R dd-agent:root ${RUN_DIR} - - if command -v chkconfig >/dev/null 2>&1; then - chkconfig --add datadog-agent - fi - - # Create symlinks to the various agent's components - ln -sf $INSTALL_DIR/agent/agent.py /usr/bin/dd-agent - chown -R dd-agent:root /usr/bin/dd-agent - chmod 755 /usr/bin/dd-agent - - # The configcheck call will return zero if the config is valid, which means we - # can restart the agent without taking the risk to trigger an error in the - # postinst script . If the config file doesn't exist (RETVAL=3), the user is - # probably using the source install script so let's consider the postinst script - # did its job and exist zero, otherwise, if the file exists but it's wrong we - # have to return a non zero exit status so that the system (and the user) are - # notified the installation went wrong. - /etc/init.d/datadog-agent configcheck - RETVAL=$? - if [ $RETVAL -eq 0 ]; then - echo "(Re)starting datadog-agent now..." - if command -v invoke-rc.d >/dev/null 2>&1; then - invoke-rc.d datadog-agent restart - else - /etc/init.d/datadog-agent restart - fi - fi - - if [ $RETVAL -ne 0 ]; then - if [ $RETVAL -eq 3 ]; then - # No datadog.conf file is present. The user is probably following - # the step-by-step instructions and will add the config file next. 
- exit 0 - else - exit $RETVAL - fi - fi -# OSX installation -elif [ "$DISTRIBUTION" = "Darwin" ]; then - # OSX specific variables - OPT_APP_DIR="$INSTALL_DIR/Datadog Agent.app" - APP_DIR="/Applications/Datadog Agent.app" - CONF_DIR=$INSTALL_DIR/etc - RUN_DIR=$INSTALL_DIR/run - - # Let's log the standard outputs of this script - LOG_FILE="$LOG_DIR/postinstall.log" - mkdir -vp $LOG_DIR - exec > $LOG_FILE 2>&1 - - # Let's talk to our user installing the Agent a bit - echo "# State at the beginning" - echo "## Agent version" - grep AGENT_VERSION $INSTALL_DIR/agent/cofnig.py || echo "No config file" - echo "## $INSTALL_DIR" - ls -al $INSTALL_DIR || "No agent installed" - echo "## $APP_DIR/Contents/Resources" - ls -al "$APP_DIR/Contents/Resources" || echo "No app installed" - - # Determine current user if he is using the Graphical installer - INSTALL_USER=$(ps aux | grep "CoreServices/Installer" | grep -v grep | awk '{print $1;}') - - # Otherwise, we hope he is using the install script and try to this user - # If it fails, no choice but to use root :'( - if [ -z "$INSTALL_USER" ] || [ "$INSTALL_USER" == "root" ]; then - SCRIPT_INSTALL="yes" - INSTALL_USER=`cat /tmp/datadog-install-user || echo 'root'` - rm -v /tmp/datadog-install-user || true - fi - echo "INSTALL_USER: $INSTALL_USER" - - echo "# Prepareing log dir" - chown -vR $INSTALL_USER:admin $LOG_DIR - chmod -v 755 $LOG_DIR - - echo "# Installing the app" - mv -v "$OPT_APP_DIR" /Applications || echo "App already installed" - - # Set the run directory for the agent - mkdir -vp "$RUN_DIR" - chown -vR $INSTALL_USER:admin "$RUN_DIR" - chmod -v 755 "$RUN_DIR" - - echo "# Creating default plist" - sed "s|USER_NAME|$INSTALL_USER|" "$CONF_DIR/com.datadoghq.Agent.plist.example" > "$CONF_DIR/com.datadoghq.Agent.plist" - - echo "# Copying conf" - mkdir -vp $CONF_DIR/checks.d - - if [ -e "/tmp/datadog.conf" ]; then - mv -vf /tmp/datadog.conf $CONF_DIR - mv -vf /tmp/conf.d/* $CONF_DIR/conf.d - cp -vn /tmp/checks.d/* $CONF_DIR/checks.d - rm -vrf /tmp/datadog.conf /tmp/conf.d /tmp/checks.d - # Or copying default - else - sed -E 's/^api_key:$/api_key: APIKEY/' $CONF_DIR/datadog.conf.example > $CONF_DIR/datadog.conf - fi - - echo "# Setting correct rights on conf" - chown -v $INSTALL_USER:admin $CONF_DIR/datadog.conf - chown -vR $INSTALL_USER:admin $CONF_DIR/conf.d $CONF_DIR/checks.d - - # `datadog-agent` command line - mkdir -vp /usr/local/bin - ln -vs $INSTALL_DIR/bin/datadog-agent /usr/local/bin/datadog-agent - - # Link for conf files (let's ease the user's life) - USER_HOME=`sudo -Hu $INSTALL_USER sh -c 'echo $HOME'` - sudo -Hu $INSTALL_USER mkdir -vp "$USER_HOME/.datadog-agent" - rm -vf "$USER_HOME/.datadog-agent/conf.d" "$USER_HOME/.datadog-agent/datadog.conf" "$USER_HOME/.datadog-agent/checks.d" - sudo -Hu $INSTALL_USER ln -vs $CONF_DIR/conf.d "$USER_HOME/.datadog-agent/conf.d" - sudo -Hu $INSTALL_USER ln -vs $CONF_DIR/datadog.conf "$USER_HOME/.datadog-agent/datadog.conf" - sudo -Hu $INSTALL_USER ln -vs $CONF_DIR/checks.d "$USER_HOME/.datadog-agent/checks.d" - - # Error if app not properly installed or root - if [ "$INSTALL_USER" == "root" ]; then - echo 'INSTALL_USER is set to root, Datadog Agent app has been installed' - echo 'but is not configured. Running Datadog Agent as root is not advised!' - exit 1 - fi - - if [ ! 
-e "$CONF_DIR/datadog.conf" ]; then - exit 1 - fi - - # Start the app only if it's not a script install - if [ -z "$SCRIPT_INSTALL" ]; then - echo "# Starting the app" - # -a for application, -F for fresh, do not restore old app - export TMPDIR=`sudo -u $INSTALL_USER getconf DARWIN_USER_TEMP_DIR` - sudo -u $INSTALL_USER open -Fa 'Datadog Agent' - fi - - echo "# Configuring the login launch of the app" - sudo -u $INSTALL_USER osascript -e 'tell application "System Events" to delete every login item whose name is "Datadog Agent"' - sudo -u $INSTALL_USER osascript -e 'tell application "System Events" to make login item at end with properties {path:"/Applications/Datadog Agent.app", name:"Datadog Agent", hidden:false}' - - # A little debriefing won't hurt - echo "# State at the end" - echo "## AGent version" - grep AGENT_VERSION $INSTALL_DIR/agent/config.py || echo "No config file" - echo "## $INSTALL_DIR" - ls -al $INSTALL_DIR || echo "No agent installed :(" - echo "## $APP_DIR/Contents/Resources" - ls -al "$APP_DIR/Contents/Resources" || echo "No app installed ;-(" -fi - -exit 0 diff --git a/omnibus/package-scripts/datadog-agent/postrm b/omnibus/package-scripts/datadog-agent/postrm deleted file mode 100755 index aa2ffe4fec18..000000000000 --- a/omnibus/package-scripts/datadog-agent/postrm +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/sh -LINUX_DISTRIBUTION=$(grep -Eo "(Debian|Ubuntu|RedHat|CentOS|openSUSE|Amazon)" /etc/issue) - -if [ -f "/etc/debian_version" ] || [ "$LINUX_DISTRIBUTION" == "Debian" ] || [ "$LINUX_DISTRIBUTION" == "Ubuntu" ]; then - set -e - - if [ "$1" = purge ]; then - deluser dd-agent - rm -rf /opt/datadog-agent - rm -rf /var/log/datadog - rm -rf /etc/dd-agent - rm -rf /var/log/datadog - fi -elif [ -f "/etc/redhat-release" ] || [ "$LINUX_DISTRIBUTION" == "RedHat" ] || [ "$LINUX_DISTRIBUTION" == "CentOS" ] || [ "$LINUX_DISTRIBUTION" == "openSUSE" ] || [ "$LINUX_DISTRIBUTION" == "Amazon" ]; then - case "$*" in - 0) - # We're uninstalling. - getent passwd dd-agent > /dev/null && userdel dd-agent - getent group dd-agent >/dev/null && groupdel dd-agent - ;; - 1) - # We're upgrading. - ;; - *) - ;; - esac -else - echo "[ ${Red}FAILED ${RCol}]\tYour system is currently not supported by this script."; - exit 1; -fi -exit 0 diff --git a/omnibus/package-scripts/datadog-agent/posttrans b/omnibus/package-scripts/datadog-agent/posttrans deleted file mode 100755 index 55cbf0840ee0..000000000000 --- a/omnibus/package-scripts/datadog-agent/posttrans +++ /dev/null @@ -1,6 +0,0 @@ -getent group dd-agent >/dev/null || groupadd -r dd-agent -getent passwd dd-agent >/dev/null || \ - useradd -r -M -g dd-agent -d /usr/share/datadog/agent -s /bin/sh \ - -c "Datadog Agent" dd-agent -/etc/init.d/datadog-agent restart -exit 0 diff --git a/omnibus/package-scripts/datadog-agent/preinst b/omnibus/package-scripts/datadog-agent/preinst deleted file mode 100755 index 35fdf004965e..000000000000 --- a/omnibus/package-scripts/datadog-agent/preinst +++ /dev/null @@ -1,110 +0,0 @@ -#! 
/bin/sh - -INSTALL_DIR=/opt/datadog-agent -LOG_DIR=/var/log/datadog -mkdir -p $LOG_DIR - -DISTRIBUTION=$(grep -Eo "(Debian|Ubuntu|RedHat|CentOS|openSUSE|Amazon)" /etc/issue 2>/dev/null || uname -s) - -# Linux installation -if [ "$DISTRIBUTION" != "Darwin" ]; then - if [ -f "/etc/debian_version" ] || [ "$DISTRIBUTION" == "Debian" ] || [ "$DISTRIBUTION" == "Ubuntu" ]; then - set -e - if [ -f "/etc/init.d/datadog-agent" ]; then - if command -v invoke-rc.d >/dev/null 2>&1; then - invoke-rc.d datadog-agent stop || true - else - /etc/init.d/datadog-agent stop || true - fi - fi - - # Since we now package our own supervisor config, we no longer want - # the old config to be loaded. Since supervisor automatically loads - # configs in conf.d, we have to delete the old config file. - if [ -f "/etc/supervisor/conf.d/ddagent.conf" ]; then - echo "Removing old configuration from system supervisord" - rm /etc/supervisor/conf.d/ddagent.conf - fi - - # Previous versions of dd-agent created this file but didn't do - # anything with it. - if [ -f "/etc/dd-agent/supervisor_ddagent.conf" ]; then - rm /etc/dd-agent/supervisor_ddagent.conf - fi - - #DEBHELPER# - - elif [ -f "/etc/redhat-release" ] || [ -f "/etc/system-release" ] || [ "$DISTRIBUTION" == "RedHat" ] || [ "$DISTRIBUTION" == "CentOS" ] || [ "$DISTRIBUTION" == "openSUSE" ] || [ "$DISTRIBUTION" == "Amazon" ]; then - getent group dd-agent >/dev/null || groupadd -r dd-agent - getent passwd dd-agent >/dev/null || \ - useradd -r -M -g dd-agent -d $INSTALL_DIR -s /bin/sh \ - -c "Datadog Agent" dd-agent - # Stop the old agent before installing - if [ -f "/etc/init.d/datadog-agent" ]; then - /etc/init.d/datadog-agent stop || true - fi - else - echo "[ ${Red}FAILED ${RCol}]\tYour system is currently not supported by this script."; - exit 1; - fi - - # Delete .pyc files - # FIXME: it shouldn't be done there, but only in prerm (see 6.6 - # of https://www.debian.org/doc/debian-policy/ch-maintainerscripts.html) - # It is also here because version < 5.4 didn't delete .pyc, - # so we need to be sure to clean them here (if a file is deleted for instance) - find $INSTALL_DIR/agent -name '*.py[co]' -type f -delete || true - - # FIXME: remove when CentOS5 support is dropped (03/31/2017) or when everybody - # has stopped using dd-agent 5.3 (and older versions ofc) - rm -f $INSTALL_DIR/agent/checks/utils.py - -elif [ "$DISTRIBUTION" = "Darwin" ]; then - DD_COMMAND="$INSTALL_DIR/bin/datadog-agent" - CONF_DIR="$INSTALL_DIR/etc" - APP_DIR="/Applications/Datadog Agent.app" - - LOG_FILE="$LOG_DIR/preinstall.log" - exec > $LOG_FILE 2>&1 - - if [ -e "$CONF_DIR/datadog.conf" ]; then - echo "# State at the beginning" - echo "## Agent version" - grep AGENT_VERSION $INSTALL_DIR/agent/config.py || echo "No config file" - echo "## $INSTALL_DIR" - ls -al $INSTALL_DIR || echo "No agent installed" - echo "## $APP_DIR/Contents/Resources" - ls -al "$APP_DIR/Contents/Resources" || echo "No app installed" - - echo '# Stop old agent' - $DD_COMMAND stop || true - kill `ps aux | grep 'Datadog Agent.app' | grep -v grep | cut -d ' ' -f 4` || true - - echo '# Stop old GUI'okill `ps aux | grep 'Datadog Agent.app' | grep -v grep | awk '{ print $2 }'` || true - - # Save old conf - mkdir -vp /tmp/{conf,checks}.d - rm -rvf /tmp/{checks,conf}.d/* /tmp/datadog.conf - cp -vf $CONF_DIR/datadog.conf /tmp - cp -vf $CONF_DIR/conf.d/*.yaml /tmp/conf.d - cp -vfR $CONF_DIR/checks.d/* /tmp/checks.d - fi - - echo '# Deleting old datadog-agent link' - rm -vf /usr/local/bin/datadog-agent - - echo '# Deleting old 
datadog-agent files' - rm -rf $INSTALL_DIR/agent || true - - # Debriefing time - echo "# State at the end" - echo "## Agent version" - grep AGENT_VERSION $INSTALL_DIR/agent/config.py || echo "No config file" - echo "## $INSTALL_DIR" - ls -al $INSTALL_DIR || echo "No agent installed" - echo "## $APP_DIR/Contents/Resources" - ls -al "$APP_DIR/Contents/Resources" || echo "No app installed" -fi - - -exit 0 diff --git a/omnibus/package-scripts/datadog-agent/prerm b/omnibus/package-scripts/datadog-agent/prerm deleted file mode 100755 index e4687c3103a8..000000000000 --- a/omnibus/package-scripts/datadog-agent/prerm +++ /dev/null @@ -1,35 +0,0 @@ -#! /bin/sh -LINUX_DISTRIBUTION=$(grep -Eo "(Debian|Ubuntu|RedHat|CentOS|openSUSE|Amazon)" /etc/issue) - -if [ -f "/etc/debian_version" ] || [ "$LINUX_DISTRIBUTION" == "Debian" ] || [ "$LINUX_DISTRIBUTION" == "Ubuntu" ]; then - if command -v invoke-rc.d >/dev/null 2>&1; then - invoke-rc.d datadog-agent stop || true - - # Removing the service form startup (since it's not there anymore) - update-rc.d -f datadog-agent disable >/dev/null 2>&1 - update-rc.d -f datadog-agent remove - else - /etc/init.d/datadog-agent stop || true - fi -elif [ -f "/etc/redhat-release" ] || [ "$LINUX_DISTRIBUTION" == "RedHat" ] || [ "$LINUX_DISTRIBUTION" == "CentOS" ] || [ "$LINUX_DISTRIBUTION" == "openSUSE" ] || [ "$LINUX_DISTRIBUTION" == "Amazon" ]; then - case "$*" in - 0) - # We're uninstalling. - /etc/init.d/datadog-agent stop - ;; - 1) - # We're upgrading. Do nothing. - ;; - *) - ;; - esac -else - echo "[ ${Red}FAILED ${RCol}]\tYour system is currently not supported by this script."; - exit 1; -fi - -# Delete all.pyc files -find /opt/datadog-agent/agent -name '*.py[co]' -type f -delete || echo 'Unable to delete .pyc files' - -exit 0 - diff --git a/omnibus/resources/datadog-agent/dmg/background.png b/omnibus/resources/datadog-agent/dmg/background.png deleted file mode 100644 index 5f67c9e6a4d9..000000000000 Binary files a/omnibus/resources/datadog-agent/dmg/background.png and /dev/null differ diff --git a/omnibus/resources/datadog-agent/dmg/icon.png b/omnibus/resources/datadog-agent/dmg/icon.png deleted file mode 100644 index de0c34718ece..000000000000 Binary files a/omnibus/resources/datadog-agent/dmg/icon.png and /dev/null differ diff --git a/omnibus/resources/datadog-agent/pkg/background.png b/omnibus/resources/datadog-agent/pkg/background.png deleted file mode 100644 index f920a50c3d55..000000000000 Binary files a/omnibus/resources/datadog-agent/pkg/background.png and /dev/null differ diff --git a/omnibus/resources/datadog-agent/pkg/license.html.erb b/omnibus/resources/datadog-agent/pkg/license.html.erb deleted file mode 100644 index 4b6a58eb076f..000000000000 --- a/omnibus/resources/datadog-agent/pkg/license.html.erb +++ /dev/null @@ -1,31 +0,0 @@ -Simplified BSD License - -Copyright (c) 2009, Boxed Ice -Copyright (c) 2010-2015, Datadog -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. 
- * Neither the name of Boxed Ice nor the names of its contributors - may be used to endorse or promote products derived from this software - without specific prior written permission. - * Neither the name of Datadog nor the names of its contributors - may be used to endorse or promote products derived from this software - without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/omnibus/resources/datadog-agent/pkg/welcome.html.erb b/omnibus/resources/datadog-agent/pkg/welcome.html.erb deleted file mode 100644 index c45028d369cd..000000000000 --- a/omnibus/resources/datadog-agent/pkg/welcome.html.erb +++ /dev/null @@ -1,4 +0,0 @@ -The installer will help you install <%= friendly_name %> on your Mac. - --------------------------------------------------- -You will be guided through the steps necessary to install this software. diff --git a/omnibus/rpm-sign b/omnibus/rpm-sign deleted file mode 100755 index 227d36aa3fd5..000000000000 --- a/omnibus/rpm-sign +++ /dev/null @@ -1,20 +0,0 @@ -#!/usr/bin/expect -f -# -# rpmsign-batch.expect : expect powered rpm signing command -# - -proc usage {} { - send_user "Usage: rpm-sign gpgname passphrase rpmfile\n\n" - exit -} - -if {[llength $argv]!=3} usage - -set gpgname [lindex $argv 0] -set passphrase [lindex $argv 1] -set rpmfile [lindex $argv 2] - -spawn rpm --addsign -D "\"_signature gpg\"" -D "\"_gpg_name $gpgname\"" $rpmfile -expect -exact "Enter pass phrase: " -send -- "$passphrase\r" -expect eof \ No newline at end of file diff --git a/checks/common.go b/pkg/checks/common.go similarity index 100% rename from checks/common.go rename to pkg/checks/common.go diff --git a/checks/common_test.go b/pkg/checks/common_test.go similarity index 100% rename from checks/common_test.go rename to pkg/checks/common_test.go diff --git a/checks/system/memory.go b/pkg/checks/system/memory.go similarity index 100% rename from checks/system/memory.go rename to pkg/checks/system/memory.go diff --git a/aggregator/api.c b/pkg/py/api.c similarity index 100% rename from aggregator/api.c rename to pkg/py/api.c diff --git a/aggregator/api.go b/pkg/py/api.go similarity index 78% rename from aggregator/api.go rename to pkg/py/api.go index 9169357426ce..5a8fba1cbffb 100644 --- a/aggregator/api.go +++ b/pkg/py/api.go @@ -1,16 +1,20 @@ -package aggregator +package py -import "fmt" +import ( + "fmt" + + "github.com/DataDog/datadog-agent/aggregator" +) // #cgo pkg-config: python2 // #include "api.h" import "C" -var _aggregator Aggregator - //export SubmitData func SubmitData(check *C.PyObject, mt C.MetricType, name *C.char, value C.float, tags *C.PyObject) *C.PyObject { + agg := aggregator.Get() + // TODO: cleanup memory, C.stuff is going to stay there!!! 
_name := C.GoString(name) @@ -33,20 +37,16 @@ func SubmitData(check *C.PyObject, mt C.MetricType, name *C.char, value C.float, fallthrough case C.GAUGE: fmt.Println("Submitting Gauge to the aggregator...", _name, _value, _tags) - _aggregator.Gauge(_name, _value, "", _tags) + agg.Gauge(_name, _value, "", _tags) case C.HISTOGRAM: fmt.Println("Submitting Histogram to the aggregator...", _name, _value, _tags) - _aggregator.Histogram(_name, _value, "", _tags) + agg.Histogram(_name, _value, "", _tags) } return C._none() } -func Get() Aggregator { - return _aggregator -} - -func InitApi(aggregatorInstance Aggregator) { - _aggregator = aggregatorInstance +func InitApi(aggregatorInstance aggregator.Aggregator) { + aggregator.Set(aggregatorInstance) C.initaggregator() } diff --git a/aggregator/api.h b/pkg/py/api.h similarity index 100% rename from aggregator/api.h rename to pkg/py/api.h diff --git a/py/check.go b/pkg/py/check.go similarity index 98% rename from py/check.go rename to pkg/py/check.go index c514dce4728c..35149433769e 100644 --- a/py/check.go +++ b/pkg/py/check.go @@ -4,7 +4,7 @@ import ( "errors" "runtime" - "github.com/DataDog/datadog-agent/checks" + "github.com/DataDog/datadog-agent/pkg/checks" "github.com/op/go-logging" "github.com/sbinet/go-python" ) diff --git a/py/check_test.go b/pkg/py/check_test.go similarity index 100% rename from py/check_test.go rename to pkg/py/check_test.go diff --git a/py/config.go b/pkg/py/config.go similarity index 100% rename from py/config.go rename to pkg/py/config.go diff --git a/py/config_test.go b/pkg/py/config_test.go similarity index 100% rename from py/config_test.go rename to pkg/py/config_test.go diff --git a/py/checks/__init__.py b/pkg/py/dist/checks/__init__.py similarity index 100% rename from py/checks/__init__.py rename to pkg/py/dist/checks/__init__.py diff --git a/py/checks/go_expvar.py b/pkg/py/dist/checks/go_expvar.py similarity index 100% rename from py/checks/go_expvar.py rename to pkg/py/dist/checks/go_expvar.py diff --git a/py/conf.d/go_expvar.yaml b/pkg/py/dist/conf.d/go_expvar.yaml similarity index 100% rename from py/conf.d/go_expvar.yaml rename to pkg/py/dist/conf.d/go_expvar.yaml diff --git a/py/config.py b/pkg/py/dist/config.py similarity index 100% rename from py/config.py rename to pkg/py/dist/config.py diff --git a/py/util.py b/pkg/py/dist/util.py similarity index 100% rename from py/util.py rename to pkg/py/dist/util.py diff --git a/py/tests/__init__.py b/pkg/py/tests/__init__.py similarity index 100% rename from py/tests/__init__.py rename to pkg/py/tests/__init__.py diff --git a/py/tests/bad.yaml b/pkg/py/tests/bad.yaml similarity index 100% rename from py/tests/bad.yaml rename to pkg/py/tests/bad.yaml diff --git a/py/tests/bar.py b/pkg/py/tests/bar.py similarity index 100% rename from py/tests/bar.py rename to pkg/py/tests/bar.py diff --git a/py/tests/complex.py b/pkg/py/tests/complex.py similarity index 100% rename from py/tests/complex.py rename to pkg/py/tests/complex.py diff --git a/py/tests/complex.yaml b/pkg/py/tests/complex.yaml similarity index 100% rename from py/tests/complex.yaml rename to pkg/py/tests/complex.yaml diff --git a/py/tests/foo.py b/pkg/py/tests/foo.py similarity index 100% rename from py/tests/foo.py rename to pkg/py/tests/foo.py diff --git a/py/tests/testcheck.py b/pkg/py/tests/testcheck.py similarity index 100% rename from py/tests/testcheck.py rename to pkg/py/tests/testcheck.py diff --git a/py/tests/testcheck.yaml b/pkg/py/tests/testcheck.yaml similarity index 100% rename from 
py/tests/testcheck.yaml rename to pkg/py/tests/testcheck.yaml diff --git a/py/tests/testcheck2.py b/pkg/py/tests/testcheck2.py similarity index 100% rename from py/tests/testcheck2.py rename to pkg/py/tests/testcheck2.py diff --git a/py/utils.go b/pkg/py/utils.go similarity index 100% rename from py/utils.go rename to pkg/py/utils.go diff --git a/py/utils_test.go b/pkg/py/utils_test.go similarity index 95% rename from py/utils_test.go rename to pkg/py/utils_test.go index 2401ae884a1e..b974a0daf7e4 100644 --- a/py/utils_test.go +++ b/pkg/py/utils_test.go @@ -17,12 +17,13 @@ func TestMain(m *testing.M) { // Set the PYTHONPATH path := python.PySys_GetObject("path") python.PyList_Append(path, python.PyString_FromString(".")) + python.PyList_Append(path, python.PyString_FromString("dist")) // Initialize acquires the GIL but we don't need it, release it state := python.PyEval_SaveThread() // for now, only Python needs it, build and pass it on the fly - aggregator.InitApi(aggregator.NewUnbufferedAggregator()) + InitApi(aggregator.NewUnbufferedAggregator()) ret := m.Run() diff --git a/py/checks/activemq_xml.py b/py/checks/activemq_xml.py deleted file mode 100644 index f583d4d2dc0f..000000000000 --- a/py/checks/activemq_xml.py +++ /dev/null @@ -1,156 +0,0 @@ -# stdlib -from xml.etree import ElementTree - -# third party -import requests - -# project -from checks import AgentCheck - -QUEUE_URL = "/admin/xml/queues.jsp" -TOPIC_URL = "/admin/xml/topics.jsp" -SUBSCRIBER_URL = "/admin/xml/subscribers.jsp" - -TOPIC_QUEUE_METRICS = { - "consumerCount": "consumer_count", - "dequeueCount": "dequeue_count", - "enqueueCount": "enqueue_count", - "size": "size" -} - -SUBSCRIBER_TAGS = [ - "connectionId", - "subscriptionName", - "destinationName", - "selector", - "active", -] - -MAX_ELEMENTS = 300 - - -class ActiveMQXML(AgentCheck): - - def check(self, instance): - url = instance.get("url") - username = instance.get("username") - password = instance.get("password") - custom_tags = instance.get('tags', []) - max_queues = int(instance.get("max_queues", MAX_ELEMENTS)) - max_topics = int(instance.get("max_topics", MAX_ELEMENTS)) - max_subscribers = int(instance.get("max_subscribers", MAX_ELEMENTS)) - detailed_queues = instance.get("detailed_queues", []) - detailed_topics = instance.get("detailed_topics", []) - detailed_subscribers = instance.get("detailed_subscribers", []) - - tags = custom_tags + ["url:{0}".format(url)] - - self.log.debug("Processing ActiveMQ data for %s" % url) - data = self._fetch_data(url, QUEUE_URL, username, password) - self._process_data(data, "queue", tags, max_queues, detailed_queues) - - data = self._fetch_data(url, TOPIC_URL, username, password) - self._process_data(data, "topic", tags, max_topics, detailed_topics) - - data = self._fetch_data(url, SUBSCRIBER_URL, username, password) - self._process_subscriber_data(data, tags, max_subscribers, detailed_subscribers) - - def _fetch_data(self, base_url, xml_url, username, password): - auth = None - if username and password: - auth = (username, password) - url = "%s%s" % (base_url, xml_url) - self.log.debug("ActiveMQ Fetching queue data from: %s" % url) - r = requests.get(url, auth=auth) - r.raise_for_status() - return r.text - - def _process_data(self, data, el_type, tags, max_elements, detailed_elements): - root = ElementTree.fromstring(data) - # if list provided in config, only send those metrics - if detailed_elements: - elements = [e for e in root.findall(el_type) if e.get('name') in detailed_elements] - else: - elements = [e for e in 
root.findall(el_type) if e.get('name')] - count = len(elements) - - if count > max_elements: - if not detailed_elements: - self.warning("Number of {0} is too high ({1} > {2}). " - "Please use the detailed_{0}s parameter" - " to list the {0} you want to monitor.".format(el_type, - count, - max_elements)) - - for el in elements[:max_elements]: - name = el.get("name") - stats = el.find("stats") - if stats is None: - continue - - el_tags = tags + ["{0}:{1}".format(el_type, name)] - for attr_name, alias in TOPIC_QUEUE_METRICS.iteritems(): - metric_name = "activemq.{0}.{1}".format(el_type, alias) - value = stats.get(attr_name, 0) - self.gauge(metric_name, value, tags=el_tags) - - self.log.debug("ActiveMQ {0} count: {1}".format(el_type, count)) - self.gauge("activemq.{0}.count".format(el_type), count, tags=tags) - - def _process_subscriber_data(self, data, tags, max_subscribers, detailed_subscribers): - root = ElementTree.fromstring(data) - # if subscribers list provided in config, only send those metrics - if detailed_subscribers: - subscribers = [s for s in root.findall("subscriber") if s.get("clientId") in detailed_subscribers] - else: - subscribers = [s for s in root.findall("subscriber") if s.get("clientId")] - - count = len(subscribers) - if count > max_subscribers: - if not detailed_subscribers: - self.warning("Number of subscribers is too high ({0} > {1})." - "Please use the detailed_subscribers parameter " - "to list the {0} you want to monitor.".format(count, - max_subscribers)) - - for subscriber in subscribers[:max_subscribers]: - clientId = subscriber.get("clientId") - if not clientId: - continue - subscribers.append(clientId) - stats = subscriber.find("stats") - if stats is None: - continue - - el_tags = tags + ["clientId:{0}".format(clientId)] - - for name in SUBSCRIBER_TAGS: - value = subscriber.get(name) - if value is not None: - el_tags.append("%s:%s" % (name, value)) - - pending_queue_size = stats.get("pendingQueueSize", 0) - dequeue_counter = stats.get("dequeueCounter", 0) - enqueue_counter = stats.get("enqueueCounter", 0) - dispatched_queue_size = stats.get("dispatchedQueueSize", 0) - dispatched_counter = stats.get("dispatchedCounter", 0) - - self.log.debug( - "ActiveMQ Subscriber %s: %s %s %s %s %s" % ( - clientId, pending_queue_size, dequeue_counter, - enqueue_counter, dispatched_queue_size, dispatched_counter - ) - ) - self.gauge("activemq.subscriber.pending_queue_size", - pending_queue_size, tags=el_tags) - self.gauge("activemq.subscriber.dequeue_counter", - dequeue_counter, tags=el_tags) - self.gauge("activemq.subscriber.enqueue_counter", - enqueue_counter, tags=el_tags) - self.gauge("activemq.subscriber.dispatched_queue_size", - dispatched_queue_size, tags=el_tags) - self.gauge("activemq.subscriber.dispatched_counter", - dispatched_counter, tags=el_tags) - - self.log.debug("ActiveMQ Subscriber Count: {0}".format(count)) - self.gauge("activemq.subscriber.count", count, tags=tags) diff --git a/py/checks/agent_metrics.py b/py/checks/agent_metrics.py deleted file mode 100644 index ed79bfe1e9f7..000000000000 --- a/py/checks/agent_metrics.py +++ /dev/null @@ -1,151 +0,0 @@ -# stdlib -import threading - -# 3p -try: - import psutil -except ImportError: - psutil = None - -# project -from checks import AgentCheck -from checks.metric_types import MetricTypes -from config import _is_affirmative - -MAX_THREADS_COUNT = 50 -MAX_COLLECTION_TIME = 30 -MAX_EMIT_TIME = 5 -MAX_CPU_PCT = 10 - - -class UnsupportedMetricType(Exception): - """ - Raised by :class:`AgentMetrics` when a metric 
type outside outside of AgentMetrics.ALLOWED_METRIC_TYPES - is requested for measurement of a particular statistic - """ - def __init__(self, metric_name, metric_type): - message = 'Unsupported Metric Type for {0} : {1}'.format(metric_name, metric_type) - Exception.__init__(self, message) - -class AgentMetrics(AgentCheck): - """ - New-style version of `CollectorMetrics` - Gets information about agent performance on every collector loop - """ - - def __init__(self, *args, **kwargs): - AgentCheck.__init__(self, *args, **kwargs) - self._collector_payload = {} - self._metric_context = {} - - def _psutil_config_to_stats(self, instance): - """ - Reads `init_config` for `psutil` methods to call on the current process - Calls those methods and stores the raw output - - :returns a dictionary of statistic_name: value - """ - process_metrics = instance.get('process_metrics', self.init_config.get('process_metrics', None)) - if not process_metrics: - self.log.error('No metrics configured for AgentMetrics check!') - return {} - - methods, metric_types = zip( - *[(p['name'], p.get('type', MetricTypes.GAUGE)) - for p in process_metrics if _is_affirmative(p.get('active'))] - ) - - names_to_metric_types = {} - for i, m in enumerate(methods): - names_to_metric_types[AgentCheck._get_statistic_name_from_method(m)] = metric_types[i] - - stats = AgentCheck._collect_internal_stats(methods) - return stats, names_to_metric_types - - def _send_single_metric(self, metric_name, metric_value, metric_type): - if metric_type == MetricTypes.GAUGE: - self.gauge(metric_name, metric_value) - elif metric_type == MetricTypes.RATE: - self.rate(metric_name, metric_value) - else: - raise UnsupportedMetricType(metric_name, metric_type) - - def _register_psutil_metrics(self, stats, names_to_metric_types): - """ - Saves sample metrics from psutil - - :param stats: a dictionary that looks like: - { - 'memory_info': OrderedDict([('rss', 24395776), ('vms', 144666624)]), - 'io_counters': OrderedDict([('read_count', 4536), - ('write_count', 100), - ('read_bytes', 0), - ('write_bytes', 61440)]) - ... - } - - This creates a metric like `datadog.agent.collector.{key_1}.{key_2}` where key_1 is a top-level - key in `stats`, and key_2 is a nested key. - E.g. 
datadog.agent.collector.memory_info.rss - """ - - base_metric = 'datadog.agent.collector.{0}.{1}' - # TODO: May have to call self.normalize(metric_name) to get a compliant name - for k, v in stats.iteritems(): - metric_type = names_to_metric_types[k] - if isinstance(v, dict): - for _k, _v in v.iteritems(): - full_metric_name = base_metric.format(k, _k) - self._send_single_metric(full_metric_name, _v, metric_type) - else: - full_metric_name = 'datadog.agent.collector.{0}'.format(k) - self._send_single_metric(full_metric_name, v, metric_type) - - def set_metric_context(self, payload, context): - self._collector_payload = payload - self._metric_context = context - - def get_metric_context(self): - return self._collector_payload, self._metric_context - - def check(self, instance): - if self.in_developer_mode: - stats, names_to_metric_types = self._psutil_config_to_stats(instance) - self._register_psutil_metrics(stats, names_to_metric_types) - - payload, context = self.get_metric_context() - collection_time = context.get('collection_time', None) - emit_time = context.get('emit_time', None) - cpu_time = context.get('cpu_time', None) - - if threading.activeCount() > MAX_THREADS_COUNT: - self.gauge('datadog.agent.collector.threads.count', threading.activeCount()) - self.log.info("Thread count is high: %d" % threading.activeCount()) - - collect_time_exceeds_threshold = collection_time > MAX_COLLECTION_TIME - if collection_time is not None and \ - (collect_time_exceeds_threshold or self.in_developer_mode): - - self.gauge('datadog.agent.collector.collection.time', collection_time) - if collect_time_exceeds_threshold: - self.log.info("Collection time (s) is high: %.1f, metrics count: %d, events count: %d", - collection_time, len(payload['metrics']), len(payload['events'])) - - emit_time_exceeds_threshold = emit_time > MAX_EMIT_TIME - if emit_time is not None and \ - (emit_time_exceeds_threshold or self.in_developer_mode): - self.gauge('datadog.agent.emitter.emit.time', emit_time) - if emit_time_exceeds_threshold: - self.log.info("Emit time (s) is high: %.1f, metrics count: %d, events count: %d", - emit_time, len(payload['metrics']), len(payload['events'])) - - if cpu_time is not None: - try: - cpu_used_pct = 100.0 * float(cpu_time)/float(collection_time) - if cpu_used_pct > MAX_CPU_PCT: - self.gauge('datadog.agent.collector.cpu.used', cpu_used_pct) - self.log.info("CPU consumed (%%) is high: %.1f, metrics count: %d, events count: %d", - cpu_used_pct, len(payload['metrics']), len(payload['events'])) - except Exception, e: - self.log.debug("Couldn't compute cpu used by collector with values %s %s %s", - cpu_time, collection_time, str(e)) diff --git a/py/checks/apache.py b/py/checks/apache.py deleted file mode 100644 index 26ea7a580ee4..000000000000 --- a/py/checks/apache.py +++ /dev/null @@ -1,99 +0,0 @@ -# stdlib -import urlparse - -# 3rd party -import requests - -# project -from checks import AgentCheck -from util import headers - - -class Apache(AgentCheck): - """Tracks basic connection/requests/workers metrics - - See http://httpd.apache.org/docs/2.2/mod/mod_status.html for more details - """ - GAUGES = { - 'IdleWorkers': 'apache.performance.idle_workers', - 'BusyWorkers': 'apache.performance.busy_workers', - 'CPULoad': 'apache.performance.cpu_load', - 'Uptime': 'apache.performance.uptime', - 'Total kBytes': 'apache.net.bytes', - 'Total Accesses': 'apache.net.hits', - } - - RATES = { - 'Total kBytes': 'apache.net.bytes_per_s', - 'Total Accesses': 'apache.net.request_per_s' - } - - def __init__(self, 
name, init_config, agentConfig, instances=None): - AgentCheck.__init__(self, name, init_config, agentConfig, instances) - self.assumed_url = {} - - def check(self, instance): - if 'apache_status_url' not in instance: - raise Exception("Missing 'apache_status_url' in Apache config") - - url = self.assumed_url.get(instance['apache_status_url'], instance['apache_status_url']) - - tags = instance.get('tags', []) - - auth = None - if 'apache_user' in instance and 'apache_password' in instance: - auth = (instance['apache_user'], instance['apache_password']) - - # Submit a service check for status page availability. - parsed_url = urlparse.urlparse(url) - apache_host = parsed_url.hostname - apache_port = parsed_url.port or 80 - service_check_name = 'apache.can_connect' - service_check_tags = ['host:%s' % apache_host, 'port:%s' % apache_port] - try: - r = requests.get(url, auth=auth, headers=headers(self.agentConfig)) - r.raise_for_status() - - except Exception: - self.service_check(service_check_name, AgentCheck.CRITICAL, - tags=service_check_tags) - raise - else: - self.service_check(service_check_name, AgentCheck.OK, - tags=service_check_tags) - - response = r.content - metric_count = 0 - # Loop through and extract the numerical values - for line in response.splitlines(): - values = line.split(': ') - if len(values) == 2: # match - metric, value = values - try: - value = float(value) - except ValueError: - continue - - # Special case: kBytes => bytes - if metric == 'Total kBytes': - value = value * 1024 - - # Send metric as a gauge, if applicable - if metric in self.GAUGES: - metric_count += 1 - metric_name = self.GAUGES[metric] - self.gauge(metric_name, value, tags=tags) - - # Send metric as a rate, if applicable - if metric in self.RATES: - metric_count += 1 - metric_name = self.RATES[metric] - self.rate(metric_name, value, tags=tags) - - if metric_count == 0: - if self.assumed_url.get(instance['apache_status_url'], None) is None and url[-5:] != '?auto': - self.assumed_url[instance['apache_status_url']] = '%s?auto' % url - self.warning("Assuming url was not correct. Trying to add ?auto suffix to the url") - self.check(instance) - else: - raise Exception("No metrics were fetched for this instance. Make sure that %s is the proper url." 
% instance['apache_status_url']) diff --git a/py/checks/btrfs.py b/py/checks/btrfs.py deleted file mode 100644 index 25a1cdccaa7c..000000000000 --- a/py/checks/btrfs.py +++ /dev/null @@ -1,140 +0,0 @@ -# stdlib -import array -from collections import defaultdict -import fcntl -import itertools -import os -import struct - -# 3rd party -import psutil - -# project -from checks import AgentCheck - -MIXED = "mixed" -DATA = "data" -METADATA = "metadata" -SYSTEM = "system" -SINGLE = "single" -RAID0 = "raid0" -RAID1 = "raid1" -RAID10 = "raid10" -DUP = "dup" -UNKNOWN = "unknown" - -FLAGS_MAPPER = defaultdict(lambda: (SINGLE, UNKNOWN),{ - 1: (SINGLE, DATA), - 2: (SINGLE, SYSTEM), - 4: (SINGLE, METADATA), - 5: (SINGLE, MIXED), - 9: (RAID0, DATA), - 10: (RAID0, SYSTEM), - 12: (RAID0, METADATA), - 13: (RAID0, MIXED), - 17: (RAID1, DATA), - 18: (RAID1, SYSTEM), - 20: (RAID1, METADATA), - 21: (RAID1, MIXED), - 33: (DUP, DATA), - 34: (DUP, SYSTEM), - 36: (DUP, METADATA), - 37: (DUP, MIXED), - 65: (RAID10, DATA), - 66: (RAID10, SYSTEM), - 68: (RAID10, METADATA), - 69: (RAID10, MIXED), - -}) - -BTRFS_IOC_SPACE_INFO = 0xc0109414 - -TWO_LONGS_STRUCT = struct.Struct("=2Q") # 2 Longs -THREE_LONGS_STRUCT = struct.Struct("=3Q") # 3 Longs - - -def sized_array(count): - return array.array("B", itertools.repeat(0, count)) - - -class FileDescriptor(object): - - def __init__(self, mountpoint): - self.fd = os.open(mountpoint, os.O_DIRECTORY) - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, traceback): - os.close(self.fd) - - def fileno(self): - return self.fd - - def open(self, dir): - return self.fd - - -class BTRFS(AgentCheck): - - def __init__(self, name, init_config, agentConfig, instances=None): - AgentCheck.__init__(self, name, init_config, agentConfig, instances=instances) - if instances is not None and len(instances) > 1: - raise Exception("BTRFS check only supports one configured instance.") - - def get_usage(self, mountpoint): - results = [] - - with FileDescriptor(mountpoint) as fd: - - # Get the struct size needed - # https://github.com/spotify/linux/blob/master/fs/btrfs/ioctl.h#L46-L50 - ret = sized_array(TWO_LONGS_STRUCT.size) - fcntl.ioctl(fd, BTRFS_IOC_SPACE_INFO, ret) - _, total_spaces = TWO_LONGS_STRUCT.unpack(ret) - - # Allocate it - buffer_size = (TWO_LONGS_STRUCT.size - + total_spaces * THREE_LONGS_STRUCT.size) - - data = sized_array(buffer_size) - TWO_LONGS_STRUCT.pack_into(data, 0, total_spaces, 0) - fcntl.ioctl(fd, BTRFS_IOC_SPACE_INFO, data) - - _, total_spaces = TWO_LONGS_STRUCT.unpack_from(ret, 0) - for offset in xrange(TWO_LONGS_STRUCT.size, - buffer_size, - THREE_LONGS_STRUCT.size): - - # https://github.com/spotify/linux/blob/master/fs/btrfs/ioctl.h#L40-L44 - flags, total_bytes, used_bytes = THREE_LONGS_STRUCT.unpack_from(data, offset) - results.append((flags, total_bytes, used_bytes)) - - return results - - def check(self, instance): - btrfs_devices = {} - excluded_devices = instance.get('excluded_devices', []) - for p in psutil.disk_partitions(): - if (p.fstype == 'btrfs' and p.device not in btrfs_devices - and p.device not in excluded_devices): - btrfs_devices[p.device] = p.mountpoint - - if len(btrfs_devices) == 0: - raise Exception("No btrfs device found") - - for device, mountpoint in btrfs_devices.iteritems(): - for flags, total_bytes, used_bytes in self.get_usage(mountpoint): - replication_type, usage_type = FLAGS_MAPPER[flags] - tags = [ - 'usage_type:{0}'.format(usage_type), - 'replication_type:{0}'.format(replication_type), - ] - - free = 
total_bytes - used_bytes - usage = float(used_bytes) / float(total_bytes) - - self.gauge('system.disk.btrfs.total', total_bytes, tags=tags, device_name=device) - self.gauge('system.disk.btrfs.used', used_bytes, tags=tags, device_name=device) - self.gauge('system.disk.btrfs.free', free, tags=tags, device_name=device) - self.gauge('system.disk.btrfs.usage', usage, tags=tags, device_name=device) diff --git a/py/checks/cacti.py b/py/checks/cacti.py deleted file mode 100644 index 70f04dd8d4d0..000000000000 --- a/py/checks/cacti.py +++ /dev/null @@ -1,239 +0,0 @@ -# stdlib -from collections import namedtuple -from fnmatch import fnmatch -import os -import time - -# 3rd party -try: - import rrdtool -except ImportError: - rrdtool = None -import pymysql - -# project -from checks import AgentCheck - -CFUNC_TO_AGGR = { - 'AVERAGE': 'avg', - 'MAXIMUM': 'max', - 'MINIMUM': 'min' -} - -CACTI_TO_DD = { - 'hdd_free': 'system.disk.free', - 'hdd_used': 'system.disk.used', - 'swap_free': 'system.swap.free', - 'load_1min': 'system.load.1', - 'load_5min': 'system.load.5', - 'load_15min': 'system.load.15', - 'mem_buffers': 'system.mem.buffered', - 'proc': 'system.proc.running', - 'users': 'system.users.current', - 'mem_swap': 'system.swap.free', - 'ping': 'system.ping.latency' -} - -class Cacti(AgentCheck): - def __init__(self, name, init_config, agentConfig): - AgentCheck.__init__(self, name, init_config, agentConfig) - self.last_ts = {} - - def get_library_versions(self): - if rrdtool is not None: - return {"rrdtool": rrdtool.__version__} - return {"rrdtool": "Not Found"} - - def check(self, instance): - if rrdtool is None: - raise Exception("Unable to import python rrdtool module") - - # Load the instance config - config = self._get_config(instance) - - connection = pymysql.connect(config.host, config.user, config.password, config.db) - - self.log.debug("Connected to MySQL to fetch Cacti metadata") - - # Get whitelist patterns, if available - patterns = self._get_whitelist_patterns(config.whitelist) - - # Fetch the RRD metadata from MySQL - rrd_meta = self._fetch_rrd_meta(connection, config.rrd_path, patterns, config.field_names) - - # Load the metrics from each RRD, tracking the count as we go - metric_count = 0 - for hostname, device_name, rrd_path in rrd_meta: - m_count = self._read_rrd(rrd_path, hostname, device_name) - metric_count += m_count - - self.gauge('cacti.metrics.count', metric_count) - - def _get_whitelist_patterns(self, whitelist): - patterns = [] - if whitelist: - if not os.path.isfile(whitelist) or not os.access(whitelist, os.R_OK): - # Don't run the check if the whitelist is unavailable - self.log.exception("Unable to read whitelist file at %s" % (whitelist)) - - wl = open(whitelist) - for line in wl: - patterns.append(line.strip()) - wl.close() - - return patterns - - - def _get_config(self, instance): - required = ['mysql_host', 'mysql_user', 'rrd_path'] - for param in required: - if not instance.get(param): - raise Exception("Cacti instance missing %s. Skipping." 
% (param)) - - host = instance.get('mysql_host') - user = instance.get('mysql_user') - password = instance.get('mysql_password', '') or '' - db = instance.get('mysql_db', 'cacti') - rrd_path = instance.get('rrd_path') - whitelist = instance.get('rrd_whitelist') - field_names = instance.get('field_names', ['ifName', 'dskDevice']) - - Config = namedtuple('Config', [ - 'host', - 'user', - 'password', - 'db', - 'rrd_path', - 'whitelist', - 'field_names'] - ) - - return Config(host, user, password, db, rrd_path, whitelist, field_names) - - def _read_rrd(self, rrd_path, hostname, device_name): - ''' Main metric fetching method ''' - metric_count = 0 - - try: - info = rrdtool.info(rrd_path) - except Exception: - # Unable to read RRD file, ignore it - self.log.exception("Unable to read RRD file at %s" % rrd_path) - return metric_count - - # Find the consolidation functions for the RRD metrics - c_funcs = set([v for k,v in info.items() if k.endswith('.cf')]) - - for c in list(c_funcs): - last_ts_key = '%s.%s' % (rrd_path, c) - if last_ts_key not in self.last_ts: - self.last_ts[last_ts_key] = int(time.time()) - continue - - start = self.last_ts[last_ts_key] - last_ts = start - - try: - fetched = rrdtool.fetch(rrd_path, c, '--start', str(start)) - except rrdtool.error: - # Start time was out of range, skip this RRD - self.log.warn("Time %s out of range for %s" % (rrd_path, start)) - return metric_count - - # Extract the data - (start_ts, end_ts, interval) = fetched[0] - metric_names = fetched[1] - points = fetched[2] - for k, m_name in enumerate(metric_names): - m_name = self._format_metric_name(m_name, c) - for i, p in enumerate(points): - ts = start_ts + (i * interval) - - if p[k] is None: - continue - - # Save this metric as a gauge - val = self._transform_metric(m_name, p[k]) - self.gauge(m_name, val, hostname=hostname, - device_name=device_name, timestamp=ts) - metric_count += 1 - last_ts = (ts + interval) - - # Update the last timestamp based on the last valid metric - self.last_ts[last_ts_key] = last_ts - return metric_count - - def _fetch_rrd_meta(self, connection, rrd_path_root, whitelist, field_names): - ''' Fetch metadata about each RRD in this Cacti DB, returning a list of - tuples of (hostname, device_name, rrd_path) - ''' - def _in_whitelist(rrd): - path = rrd.replace('/','') - for p in whitelist: - if fnmatch(path, p): - return True - return False - - c = connection.cursor() - - and_parameters = " OR ".join(["hsc.field_name = '%s'" % field_name for field_name in field_names]) - - # Check for the existence of the `host_snmp_cache` table - rrd_query = """ - SELECT - h.hostname as hostname, - hsc.field_value as device_name, - dt.data_source_path as rrd_path - FROM data_local dl - JOIN host h on dl.host_id = h.id - JOIN data_template_data dt on dt.local_data_id = dl.id - LEFT JOIN host_snmp_cache hsc on h.id = hsc.host_id - AND dl.snmp_index = hsc.snmp_index - WHERE dt.data_source_path IS NOT NULL - AND dt.data_source_path != '' - AND (%s OR hsc.field_name is NULL) """ % and_parameters - - c.execute(rrd_query) - res = [] - for hostname, device_name, rrd_path in c.fetchall(): - if not whitelist or _in_whitelist(rrd_path): - if hostname in ('localhost', '127.0.0.1'): - hostname = self.hostname - rrd_path = rrd_path.replace('', rrd_path_root) - device_name = device_name or None - res.append((hostname, device_name, rrd_path)) - - # Collect stats - num_hosts = len(set([r[0] for r in res])) - self.gauge('cacti.rrd.count', len(res)) - self.gauge('cacti.hosts.count', num_hosts) - - return res - - 
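# Illustrative aside, not part of the original check: a minimal sketch of what
# the name-formatting method below does, using the CFUNC_TO_AGGR and
# CACTI_TO_DD tables defined near the top of this file. The helper name and
# the 'traffic_in' metric are hypothetical, for demonstration only.
def _example_format(m_name, cfunc):
    # 'MAXIMUM' -> 'max'; unknown consolidation functions are just lowercased
    aggr = CFUNC_TO_AGGR.get(cfunc, cfunc.lower())
    if m_name in CACTI_TO_DD:
        # known Cacti names map to system.* metrics; non-avg aggregations get a suffix
        dd_name = CACTI_TO_DD[m_name]
        return dd_name if aggr == 'avg' else '%s.%s' % (dd_name, aggr)
    # anything else falls back to a cacti.* namespaced metric
    return 'cacti.%s.%s' % (m_name.lower(), aggr)

# _example_format('load_1min', 'MAXIMUM')  -> 'system.load.1.max'
# _example_format('load_1min', 'AVERAGE')  -> 'system.load.1'
# _example_format('traffic_in', 'AVERAGE') -> 'cacti.traffic_in.avg'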
def _format_metric_name(self, m_name, cfunc): - ''' Format a cacti metric name into a Datadog-friendly name ''' - try: - aggr = CFUNC_TO_AGGR[cfunc] - except KeyError: - aggr = cfunc.lower() - - try: - m_name = CACTI_TO_DD[m_name] - if aggr != 'avg': - m_name += '.%s' % (aggr) - return m_name - except KeyError: - return "cacti.%s.%s" % (m_name.lower(), aggr) - - def _transform_metric(self, m_name, val): - ''' Add any special case transformations here ''' - # Report memory in MB - if m_name[0:11] in ('system.mem.', 'system.disk'): - return val / 1024 - return val - - - ''' - For backwards compatability with pre-checks.d configuration. - Convert old-style config to new-style config. - ''' diff --git a/py/checks/consul.py b/py/checks/consul.py deleted file mode 100644 index 7e444255e274..000000000000 --- a/py/checks/consul.py +++ /dev/null @@ -1,320 +0,0 @@ -# stdlib -from collections import defaultdict -from datetime import datetime, timedelta -from itertools import islice -from urlparse import urljoin - -# project -from checks import AgentCheck - -# 3p -import requests - - -class ConsulCheck(AgentCheck): - CONSUL_CHECK = 'consul.up' - HEALTH_CHECK = 'consul.check' - - CONSUL_CATALOG_CHECK = 'consul.catalog' - - SOURCE_TYPE_NAME = 'consul' - - MAX_CONFIG_TTL = 300 # seconds - MAX_SERVICES = 50 # cap on distinct Consul ServiceIDs to interrogate - - STATUS_SC = { - 'up': AgentCheck.OK, - 'passing': AgentCheck.OK, - 'warning': AgentCheck.WARNING, - 'critical': AgentCheck.CRITICAL, - } - - def __init__(self, name, init_config, agentConfig, instances=None): - AgentCheck.__init__(self, name, init_config, agentConfig, instances) - if instances is not None and len(instances) > 1: - raise Exception("Consul check only supports one configured instance.") - - self._local_config = None - self._last_config_fetch_time = None - self._last_known_leader = None - - def consul_request(self, instance, endpoint): - url = urljoin(instance.get('url'), endpoint) - try: - - clientcertfile = instance.get('client_cert_file', self.init_config.get('client_cert_file', False)) - privatekeyfile = instance.get('private_key_file', self.init_config.get('private_key_file', False)) - cabundlefile = instance.get('ca_bundle_file', self.init_config.get('ca_bundle_file', True)) - - if clientcertfile: - if privatekeyfile: - resp = requests.get(url, cert=(clientcertfile,privatekeyfile), verify=cabundlefile) - else: - resp = requests.get(url, cert=clientcertfile, verify=cabundlefile) - else: - resp = requests.get(url, verify=cabundlefile) - - except requests.exceptions.Timeout: - self.log.exception('Consul request to {0} timed out'.format(url)) - raise - - resp.raise_for_status() - return resp.json() - - ### Consul Config Accessors - def _get_local_config(self, instance): - if not self._local_config or datetime.now() - self._last_config_fetch_time > timedelta(seconds=self.MAX_CONFIG_TTL): - self._local_config = self.consul_request(instance, '/v1/agent/self') - self._last_config_fetch_time = datetime.now() - - return self._local_config - - def _get_cluster_leader(self, instance): - return self.consul_request(instance, '/v1/status/leader') - - def _get_agent_url(self, instance): - self.log.debug("Starting _get_agent_url") - local_config = self._get_local_config(instance) - agent_addr = local_config.get('Config', {}).get('AdvertiseAddr') - agent_port = local_config.get('Config', {}).get('Ports', {}).get('Server') - agent_url = "{0}:{1}".format(agent_addr, agent_port) - self.log.debug("Agent url is %s" % agent_url) - return agent_url - - 
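# Illustrative aside, not part of the original check: a minimal, uncached sketch
# of the leader-detection flow used by _is_instance_leader above. The function
# name and the instance_url value are hypothetical; the endpoints and the
# Config/AdvertiseAddr/Ports.Server fields come from the code above.
from urlparse import urljoin

import requests

def _example_is_leader(instance_url='http://localhost:8500'):
    self_cfg = requests.get(urljoin(instance_url, '/v1/agent/self')).json()
    leader = requests.get(urljoin(instance_url, '/v1/status/leader')).json()
    agent_url = '{0}:{1}'.format(self_cfg['Config']['AdvertiseAddr'],
                                 self_cfg['Config']['Ports']['Server'])
    # the leader endpoint returns an "addr:port" string, so a plain comparison works
    return agent_url == leader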
def _get_agent_datacenter(self, instance): - local_config = self._get_local_config(instance) - agent_dc = local_config.get('Config', {}).get('Datacenter') - return agent_dc - - ### Consul Leader Checks - def _is_instance_leader(self, instance): - try: - agent_url = self._get_agent_url(instance) - leader = self._last_known_leader or self._get_cluster_leader(instance) - self.log.debug("Consul agent lives at %s . Consul Leader lives at %s" % (agent_url,leader)) - return agent_url == leader - - except Exception as e: - return False - - def _check_for_leader_change(self, instance): - agent_dc = self._get_agent_datacenter(instance) - leader = self._get_cluster_leader(instance) - - if not leader: - # A few things could be happening here. - # 1. Consul Agent is Down - # 2. The cluster is in the midst of a leader election - # 3. The Datadog agent is not able to reach the Consul instance (network partition et al.) - self.log.warn('Consul Leader information is not available!') - return - - if not self._last_known_leader: - # We have no state preserved, store some and return - self._last_known_leader = leader - return - - if leader != self._last_known_leader: - self.log.info(('Leader change from {0} to {1}. Sending new leader event').format( - self._last_known_leader, leader)) - - self.event({ - "timestamp": int(datetime.now().strftime("%s")), - "event_type": "consul.new_leader", - "source_type_name": self.SOURCE_TYPE_NAME, - "msg_title": "New Consul Leader Elected in consul_datacenter:{0}".format(agent_dc), - "aggregation_key": "consul.new_leader", - "msg_text": "The Node at {0} is the new leader of the consul datacenter {1}".format( - leader, - agent_dc - ), - "tags": ["prev_consul_leader:{0}".format(self._last_known_leader), - "curr_consul_leader:{0}".format(leader), - "consul_datacenter:{0}".format(agent_dc)] - }) - - self._last_known_leader = leader - - ### Consul Catalog Accessors - def get_peers_in_cluster(self, instance): - return self.consul_request(instance, '/v1/status/peers') - - def get_services_in_cluster(self, instance): - return self.consul_request(instance, '/v1/catalog/services') - - def get_nodes_with_service(self, instance, service): - consul_request_url = '/v1/health/service/{0}'.format(service) - - return self.consul_request(instance, consul_request_url) - - def _cull_services_list(self, services, service_whitelist): - if service_whitelist: - if len(service_whitelist) > self.MAX_SERVICES: - self.warning('More than %d services in whitelist. Service list will be truncated.' % self.MAX_SERVICES) - - services = [s for s in services if s in service_whitelist][:self.MAX_SERVICES] - else: - if len(services) <= self.MAX_SERVICES: - self.warning('Consul service whitelist not defined. Agent will poll for all %d services found' % len(services)) - else: - self.warning('Consul service whitelist not defined. 
Agent will poll for at most %d services' % self.MAX_SERVICES) - services = list(islice(services.iterkeys(), 0, self.MAX_SERVICES)) - - return services - - def check(self, instance): - perform_new_leader_checks = instance.get('new_leader_checks', - self.init_config.get('new_leader_checks', False)) - if perform_new_leader_checks: - self._check_for_leader_change(instance) - - peers = self.get_peers_in_cluster(instance) - main_tags = [] - agent_dc = self._get_agent_datacenter(instance) - - if agent_dc is not None: - main_tags.append('consul_datacenter:{0}'.format(agent_dc)) - - if not self._is_instance_leader(instance): - self.gauge("consul.peers", len(peers), tags=main_tags + ["mode:follower"]) - self.log.debug("This consul agent is not the cluster leader." + - "Skipping service and catalog checks for this instance") - return - else: - self.gauge("consul.peers", len(peers), tags=main_tags + ["mode:leader"]) - - service_check_tags = ['consul_url:{0}'.format(instance.get('url'))] - perform_catalog_checks = instance.get('catalog_checks', - self.init_config.get('catalog_checks')) - - try: - # Make service checks from health checks for all services in catalog - health_state = self.consul_request(instance, '/v1/health/state/any') - - for check in health_state: - status = self.STATUS_SC.get(check['Status']) - if status is None: - continue - - tags = ["check:{0}".format(check["CheckID"])] - if check["ServiceName"]: - tags.append("service:{0}".format(check["ServiceName"])) - if check["ServiceID"]: - tags.append("consul_service_id:{0}".format(check["ServiceID"])) - - self.service_check(self.HEALTH_CHECK, status, tags=main_tags+tags) - - except Exception as e: - self.service_check(self.CONSUL_CHECK, AgentCheck.CRITICAL, - tags=service_check_tags) - else: - self.service_check(self.CONSUL_CHECK, AgentCheck.OK, - tags=service_check_tags) - - if perform_catalog_checks: - # Collect node by service, and service by node counts for a whitelist of services - - services = self.get_services_in_cluster(instance) - service_whitelist = instance.get('service_whitelist', - self.init_config.get('service_whitelist', [])) - - services = self._cull_services_list(services, service_whitelist) - - # {node_id: {"up: 0, "passing": 0, "warning": 0, "critical": 0} - nodes_to_service_status = defaultdict(lambda: defaultdict(int)) - - for service in services: - # For every service in the cluster, - # Gauge the following: - # `consul.catalog.nodes_up` : # of Nodes registered with that service - # `consul.catalog.nodes_passing` : # of Nodes with service status `passing` from those registered - # `consul.catalog.nodes_warning` : # of Nodes with service status `warning` from those registered - # `consul.catalog.nodes_critical` : # of Nodes with service status `critical` from those registered - - service_tags = ['consul_service_id:{0}'.format(service)] - - nodes_with_service = self.get_nodes_with_service(instance, service) - - # {'up': 0, 'passing': 0, 'warning': 0, 'critical': 0} - node_status = defaultdict(int) - - for node in nodes_with_service: - # The node_id is n['Node']['Node'] - node_id = node.get('Node', {}).get("Node") - - # An additional service is registered on this node. 
Bump up the counter - nodes_to_service_status[node_id]["up"] += 1 - - # If there is no Check for the node then Consul and dd-agent consider it up - if 'Checks' not in node: - node_status['passing'] += 1 - node_status['up'] += 1 - else: - found_critical = False - found_warning = False - found_serf_health = False - - for check in node['Checks']: - if check['CheckID'] == 'serfHealth': - found_serf_health = True - - # For backwards compatibility, the "up" node_status is computed - # based on the total # of nodes 'running' as part of the service. - - # If the serfHealth is `critical` it means the Consul agent isn't even responding, - # and we don't register the node as `up` - if check['Status'] != 'critical': - node_status["up"] += 1 - continue - - if check['Status'] == 'critical': - found_critical = True - break - elif check['Status'] == 'warning': - found_warning = True - # Keep looping in case there is a critical status - - # Increment the counters based on what was found in Checks - # `critical` checks override `warning`s, and if neither are found, register the node as `passing` - if found_critical: - node_status['critical'] += 1 - nodes_to_service_status[node_id]["critical"] += 1 - elif found_warning: - node_status['warning'] += 1 - nodes_to_service_status[node_id]["warning"] += 1 - else: - if not found_serf_health: - # We have not found a serfHealth check for this node, which is unexpected - # If we get here assume this node's status is "up", since we register it as 'passing' - node_status['up'] += 1 - - node_status['passing'] += 1 - nodes_to_service_status[node_id]["passing"] += 1 - - for status_key in self.STATUS_SC: - status_value = node_status[status_key] - self.gauge( - '{0}.nodes_{1}'.format(self.CONSUL_CATALOG_CHECK, status_key), - status_value, - tags=main_tags+service_tags - ) - - for node, service_status in nodes_to_service_status.iteritems(): - # For every node discovered for whitelisted services, gauge the following: - # `consul.catalog.services_up` : Total services registered on node - # `consul.catalog.services_passing` : Total passing services on node - # `consul.catalog.services_warning` : Total warning services on node - # `consul.catalog.services_critical` : Total critical services on node - - node_tags = ['consul_node_id:{0}'.format(node)] - self.gauge('{0}.services_up'.format(self.CONSUL_CATALOG_CHECK), - len(services), - tags=main_tags+node_tags) - - for status_key in self.STATUS_SC: - status_value = service_status[status_key] - self.gauge( - '{0}.services_{1}'.format(self.CONSUL_CATALOG_CHECK, status_key), - status_value, - tags=main_tags+node_tags - ) diff --git a/py/checks/couch.py b/py/checks/couch.py deleted file mode 100644 index c26efb3ec61f..000000000000 --- a/py/checks/couch.py +++ /dev/null @@ -1,130 +0,0 @@ -# stdlib -from urlparse import urljoin - -# 3rd party -import requests - -# project -from checks import AgentCheck -from util import headers - - -class CouchDb(AgentCheck): - """Extracts stats from CouchDB via its REST API - http://wiki.apache.org/couchdb/Runtime_Statistics - """ - - MAX_DB = 50 - SERVICE_CHECK_NAME = 'couchdb.can_connect' - SOURCE_TYPE_NAME = 'couchdb' - TIMEOUT = 5 - - def __init__(self, name, init_config, agentConfig, instances=None): - AgentCheck.__init__(self, name, init_config, agentConfig, instances) - self.db_blacklist = {} - - def _create_metric(self, data, tags=None): - overall_stats = data.get('stats', {}) - for key, stats in overall_stats.items(): - for metric, val in stats.items(): - if val['current'] is not None: - 
metric_name = '.'.join(['couchdb', key, metric]) - self.gauge(metric_name, val['current'], tags=tags) - - for db_name, db_stats in data.get('databases', {}).items(): - for name, val in db_stats.items(): - if name in ['doc_count', 'disk_size'] and val is not None: - metric_name = '.'.join(['couchdb', 'by_db', name]) - metric_tags = list(tags) - metric_tags.append('db:%s' % db_name) - self.gauge(metric_name, val, tags=metric_tags, device_name=db_name) - - def _get_stats(self, url, instance): - "Hit a given URL and return the parsed json" - self.log.debug('Fetching Couchdb stats at url: %s' % url) - - auth = None - if 'user' in instance and 'password' in instance: - auth = (instance['user'], instance['password']) - # Override Accept request header so that failures are not redirected to the Futon web-ui - request_headers = headers(self.agentConfig) - request_headers['Accept'] = 'text/json' - r = requests.get(url, auth=auth, headers=request_headers, - timeout=int(instance.get('timeout', self.TIMEOUT))) - r.raise_for_status() - return r.json() - - def check(self, instance): - server = instance.get('server', None) - if server is None: - raise Exception("A server must be specified") - data = self.get_data(server, instance) - self._create_metric(data, tags=['instance:%s' % server]) - - def get_data(self, server, instance): - # The dictionary to be returned. - couchdb = {'stats': None, 'databases': {}} - - # First, get overall statistics. - endpoint = '/_stats/' - - url = urljoin(server, endpoint) - - # Fetch initial stats and capture a service check based on response. - service_check_tags = ['instance:%s' % server] - try: - overall_stats = self._get_stats(url, instance) - except requests.exceptions.Timeout as e: - self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, - tags=service_check_tags, message="Request timeout: {0}, {1}".format(url, e)) - raise - except requests.exceptions.HTTPError as e: - self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, - tags=service_check_tags, message=str(e.message)) - raise - except Exception as e: - self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, - tags=service_check_tags, message=str(e)) - raise - else: - self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.OK, - tags=service_check_tags, - message='Connection to %s was successful' % url) - - # No overall stats? bail out now - if overall_stats is None: - raise Exception("No stats could be retrieved from %s" % url) - - couchdb['stats'] = overall_stats - - # Next, get all database names. - endpoint = '/_all_dbs/' - - url = urljoin(server, endpoint) - - # Get the list of whitelisted databases. - db_whitelist = instance.get('db_whitelist') - self.db_blacklist.setdefault(server,[]) - self.db_blacklist[server].extend(instance.get('db_blacklist',[])) - whitelist = set(db_whitelist) if db_whitelist else None - databases = set(self._get_stats(url, instance)) - set(self.db_blacklist[server]) - databases = databases.intersection(whitelist) if whitelist else databases - - if len(databases) > self.MAX_DB: - self.warning('Too many databases, only the first %s will be checked.' 
% self.MAX_DB) - databases = list(databases)[:self.MAX_DB] - - for dbName in databases: - url = urljoin(server, dbName) - try: - db_stats = self._get_stats(url, instance) - except requests.exceptions.HTTPError as e: - couchdb['databases'][dbName] = None - if (e.response.status_code == 403) or (e.response.status_code == 401): - self.db_blacklist[server].append(dbName) - self.warning('Database %s is not readable by the configured user. It will be added to the blacklist. Please restart the agent to clear.' % dbName) - del couchdb['databases'][dbName] - continue - if db_stats is not None: - couchdb['databases'][dbName] = db_stats - return couchdb diff --git a/py/checks/couchbase.py b/py/checks/couchbase.py deleted file mode 100644 index fcdfbb68e9c1..000000000000 --- a/py/checks/couchbase.py +++ /dev/null @@ -1,238 +0,0 @@ -# stdlib -import re - -# 3rd party -import requests - -# project -from checks import AgentCheck -from util import headers - -# Constants -COUCHBASE_STATS_PATH = '/pools/default' -DEFAULT_TIMEOUT = 10 - - -class Couchbase(AgentCheck): - """Extracts stats from Couchbase via its REST API - http://docs.couchbase.com/couchbase-manual-2.0/#using-the-rest-api - """ - SERVICE_CHECK_NAME = 'couchbase.can_connect' - - # Selected metrics to send amongst all the bucket stats, after name normalization - BUCKET_STATS = set([ - 'avg_bg_wait_time', - 'avg_disk_commit_time', - 'bytes_read', - 'bytes_written', - 'cas_hits', - 'cas_misses', - 'cmd_get', - 'cmd_set', - 'couch_docs_actual_disk_size', - 'couch_docs_data_size', - 'couch_docs_disk_size', - 'couch_docs_fragmentation', - 'couch_total_disk_size', - 'couch_views_fragmentation', - 'couch_views_ops', - 'cpu_idle_ms', - 'cpu_utilization_rate', - 'curr_connections', - 'curr_items', - 'curr_items_tot', - 'decr_hits', - 'decr_misses', - 'delete_hits', - 'delete_misses', - 'disk_commit_count', - 'disk_update_count', - 'disk_write_queue', - 'ep_bg_fetched', - 'ep_cache_miss_rate', - 'ep_cache_miss_ratio', - 'ep_diskqueue_drain', - 'ep_diskqueue_fill', - 'ep_flusher_todo', - 'ep_item_commit_failed', - 'ep_max_size', - 'ep_mem_high_wat', - 'ep_mem_low_wat', - 'ep_num_non_resident', - 'ep_num_value_ejects', - 'ep_oom_errors', - 'ep_ops_create', - 'ep_ops_update', - 'ep_overhead', - 'ep_queue_size', - 'ep_resident_items_rate', - 'ep_tap_replica_queue_drain', - 'ep_tap_total_queue_drain', - 'ep_tap_total_queue_fill', - 'ep_tap_total_total_backlog_size', - 'ep_tmp_oom_errors', - 'evictions', - 'get_hits', - 'get_misses', - 'hit_ratio', - 'incr_hits', - 'incr_misses', - 'mem_free', - 'mem_total', - 'mem_used', - 'misses', - 'ops', - 'page_faults', - 'replication_docs_rep_queue', - 'replication_meta_latency_aggr', - 'vb_active_num', - 'vb_active_queue_drain', - 'vb_active_queue_size', - 'vb_active_resident_items_ratio', - 'vb_avg_total_queue_age', - 'vb_pending_ops_create', - 'vb_pending_queue_fill', - 'vb_replica_curr_items', - 'vb_replica_meta_data_memory', - 'vb_replica_num', - 'vb_replica_queue_size', - 'xdc_ops', - ]) - - def _create_metrics(self, data, tags=None): - storage_totals = data['stats']['storageTotals'] - for key, storage_type in storage_totals.items(): - for metric_name, val in storage_type.items(): - if val is not None: - metric_name = '.'.join(['couchbase', key, self.camel_case_to_joined_lower(metric_name)]) - self.gauge(metric_name, val, tags=tags) - - for bucket_name, bucket_stats in data['buckets'].items(): - for metric_name, val in bucket_stats.items(): - if val is not None: - norm_metric_name = 
self.camel_case_to_joined_lower(metric_name) - if norm_metric_name in self.BUCKET_STATS: - full_metric_name = '.'.join(['couchbase', 'by_bucket', norm_metric_name]) - metric_tags = list(tags) - metric_tags.append('bucket:%s' % bucket_name) - self.gauge(full_metric_name, val[0], tags=metric_tags, device_name=bucket_name) - - for node_name, node_stats in data['nodes'].items(): - for metric_name, val in node_stats['interestingStats'].items(): - if val is not None: - metric_name = '.'.join(['couchbase', 'by_node', self.camel_case_to_joined_lower(metric_name)]) - metric_tags = list(tags) - metric_tags.append('node:%s' % node_name) - self.gauge(metric_name, val, tags=metric_tags, device_name=node_name) - - def _get_stats(self, url, instance): - """ Hit a given URL and return the parsed json. """ - self.log.debug('Fetching Couchbase stats at url: %s' % url) - - timeout = float(instance.get('timeout', DEFAULT_TIMEOUT)) - - auth = None - if 'user' in instance and 'password' in instance: - auth = (instance['user'], instance['password']) - - r = requests.get(url, auth=auth, headers=headers(self.agentConfig), - timeout=timeout) - r.raise_for_status() - return r.json() - - def check(self, instance): - server = instance.get('server', None) - if server is None: - raise Exception("The server must be specified") - tags = instance.get('tags', []) - # Clean up tags in case there was a None entry in the instance - # e.g. if the yaml contains tags: but no actual tags - if tags is None: - tags = [] - else: - tags = list(set(tags)) - tags.append('instance:%s' % server) - data = self.get_data(server, instance) - self._create_metrics(data, tags=list(set(tags))) - - def get_data(self, server, instance): - # The dictionary to be returned. - couchbase = { - 'stats': None, - 'buckets': {}, - 'nodes': {} - } - - # build couchbase stats entry point - url = '%s%s' % (server, COUCHBASE_STATS_PATH) - - # Fetch initial stats and capture a service check based on response. - service_check_tags = ['instance:%s' % server] - try: - overall_stats = self._get_stats(url, instance) - # No overall stats? bail out now - if overall_stats is None: - raise Exception("No data returned from couchbase endpoint: %s" % url) - except requests.exceptions.HTTPError as e: - self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, - tags=service_check_tags, message=str(e.message)) - raise - except Exception as e: - self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, - tags=service_check_tags, message=str(e)) - raise - else: - self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.OK, - tags=service_check_tags) - - couchbase['stats'] = overall_stats - - nodes = overall_stats['nodes'] - - # Next, get all the nodes - if nodes is not None: - for node in nodes: - couchbase['nodes'][node['hostname']] = node - - # Next, get all buckets . 
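# Aside, not from the original file: the block below walks the Couchbase REST
# API in two hops. The /pools/default overview fetched above exposes the bucket
# list through overall_stats['buckets']['uri']; each bucket entry then carries
# its own ['stats']['uri'] with the per-bucket samples, and
# /pools/nodes/buckets/<bucket_name>/stats is used as a fallback when that
# per-bucket URI cannot be fetched.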
- endpoint = overall_stats['buckets']['uri'] - - url = '%s%s' % (server, endpoint) - buckets = self._get_stats(url, instance) - - if buckets is not None: - for bucket in buckets: - bucket_name = bucket['name'] - - # Fetch URI for the stats bucket - endpoint = bucket['stats']['uri'] - url = '%s%s' % (server, endpoint) - - try: - bucket_stats = self._get_stats(url, instance) - except requests.exceptions.HTTPError: - url_backup = '%s/pools/nodes/buckets/%s/stats' % (server, bucket_name) - bucket_stats = self._get_stats(url_backup, instance) - - bucket_samples = bucket_stats['op']['samples'] - if bucket_samples is not None: - couchbase['buckets'][bucket['name']] = bucket_samples - - return couchbase - - # Takes a camelCased variable and returns a joined_lower equivalent. - # Returns input if non-camelCase variable is detected. - def camel_case_to_joined_lower(self, variable): - # replace non-word with _ - converted_variable = re.sub('\W+', '_', variable) - - # insert _ in front of capital letters and lowercase the string - converted_variable = re.sub('([A-Z])', '_\g<1>', converted_variable).lower() - - # remove duplicate _ - converted_variable = re.sub('_+', '_', converted_variable) - - # handle special case of starting/ending underscores - converted_variable = re.sub('^_|_$', '', converted_variable) - - return converted_variable diff --git a/py/checks/directory.py b/py/checks/directory.py deleted file mode 100644 index d64ac3e4aebb..000000000000 --- a/py/checks/directory.py +++ /dev/null @@ -1,93 +0,0 @@ -# stdlib -from fnmatch import fnmatch -from os import stat, walk -from os.path import abspath, exists, join -import time - -# project -from checks import AgentCheck - - -def _is_affirmative(s): - # int or real bool - if isinstance(s, int): - return bool(s) - # try string cast - return s.lower() in ('yes', 'true', '1') - - -class DirectoryCheck(AgentCheck): - """This check is for monitoring and reporting metrics on the files for a provided directory - - WARNING: the user/group that dd-agent runs as must have access to stat the files in the desired directory - - Config options: - "directory" - string, the directory to gather stats for. required - "name" - string, the name to use when tagging the metrics. defaults to the "directory" - "dirtagname" - string, the name of the tag used for the directory. defaults to "name" - "filetagname" - string, the name of the tag used for each file. defaults to "filename" - "filegauges" - boolean, when true stats will be an individual gauge per file (max. 20 files!) and not a histogram of the whole directory. default False - "pattern" - string, the `fnmatch` pattern to use when reading the "directory"'s files. default "*" - "recursive" - boolean, when true the stats will recurse into directories. 
default False - """ - - SOURCE_TYPE_NAME = 'system' - - def check(self, instance): - if "directory" not in instance: - raise Exception('DirectoryCheck: missing "directory" in config') - - directory = instance["directory"] - abs_directory = abspath(directory) - name = instance.get("name", directory) - pattern = instance.get("pattern", "*") - recursive = _is_affirmative(instance.get("recursive", False)) - dirtagname = instance.get("dirtagname", "name") - filetagname = instance.get("filetagname", "filename") - filegauges = _is_affirmative(instance.get("filegauges", False)) - - if not exists(abs_directory): - raise Exception("DirectoryCheck: the directory (%s) does not exist" % abs_directory) - - self._get_stats(abs_directory, name, dirtagname, filetagname, filegauges, pattern, recursive) - - def _get_stats(self, directory, name, dirtagname, filetagname, filegauges, pattern, recursive): - dirtags = [dirtagname + ":%s" % name] - directory_bytes = 0 - directory_files = 0 - for root, dirs, files in walk(directory): - for filename in files: - filename = join(root, filename) - # check if it passes our filter - if not fnmatch(filename, pattern): - continue - try: - file_stat = stat(filename) - - except OSError, ose: - self.warning("DirectoryCheck: could not stat file %s - %s" % (filename, ose)) - else: - # file specific metrics - directory_files += 1 - directory_bytes += file_stat.st_size - if filegauges and directory_files <= 20: - filetags = list(dirtags) - filetags.append(filetagname + ":%s" % filename) - self.gauge("system.disk.directory.file.bytes", file_stat.st_size, tags=filetags) - self.gauge("system.disk.directory.file.modified_sec_ago", time.time() - file_stat.st_mtime, tags=filetags) - self.gauge("system.disk.directory.file.created_sec_ago", time.time() - file_stat.st_ctime, tags=filetags) - elif not filegauges: - self.histogram("system.disk.directory.file.bytes", file_stat.st_size, tags=dirtags) - self.histogram("system.disk.directory.file.modified_sec_ago", time.time() - file_stat.st_mtime, tags=dirtags) - self.histogram("system.disk.directory.file.created_sec_ago", time.time() - file_stat.st_ctime, tags=dirtags) - - # os.walk gives us all sub-directories and their files - # if we do not want to do this recursively and just want - # the top level directory we gave it, then break - if not recursive: - break - - # number of files - self.gauge("system.disk.directory.files", directory_files, tags=dirtags) - # total file size - self.gauge("system.disk.directory.bytes", directory_bytes, tags=dirtags) diff --git a/py/checks/disk.py b/py/checks/disk.py deleted file mode 100644 index 2af44cc793b9..000000000000 --- a/py/checks/disk.py +++ /dev/null @@ -1,249 +0,0 @@ -# stdlib -import os -import re - -# 3p -try: - import psutil -except ImportError: - psutil = None - -# project -from checks import AgentCheck -from config import _is_affirmative -from util import Platform -from utils.subprocess_output import get_subprocess_output - - -class Disk(AgentCheck): - """ Collects metrics about the machine's disks. 
""" - # -T for filesystem info - DF_COMMAND = ['df', '-T'] - METRIC_DISK = 'system.disk.{0}' - METRIC_INODE = 'system.fs.inodes.{0}' - - def __init__(self, name, init_config, agentConfig, instances=None): - if instances is not None and len(instances) > 1: - raise Exception("Disk check only supports one configured instance.") - AgentCheck.__init__(self, name, init_config, - agentConfig, instances=instances) - # Get the configuration once for all - self._load_conf(instances[0]) - - def check(self, instance): - """Get disk space/inode stats""" - # Windows and Mac will always have psutil - # (we have packaged for both of them) - if self._psutil(): - self.collect_metrics_psutil() - else: - # FIXME: implement all_partitions (df -a) - self.collect_metrics_manually() - - @classmethod - def _psutil(cls): - return psutil is not None - - def _load_conf(self, instance): - self._excluded_filesystems = instance.get('excluded_filesystems', []) - self._excluded_disks = instance.get('excluded_disks', []) - self._tag_by_filesystem = _is_affirmative( - instance.get('tag_by_filesystem', False)) - self._all_partitions = _is_affirmative( - instance.get('all_partitions', False)) - - # Force exclusion of CDROM (iso9660) from disk check - self._excluded_filesystems.append('iso9660') - - # FIXME: 6.x, drop use_mount option in datadog.conf - self._load_legacy_option(instance, 'use_mount', False, - operation=_is_affirmative) - # FIXME: 6.x, drop device_blacklist_re option in datadog.conf - self._load_legacy_option(instance, 'excluded_disk_re', '^$', - legacy_name='device_blacklist_re', - operation=re.compile) - - def _load_legacy_option(self, instance, option, default, - legacy_name=None, operation=lambda l: l): - value = instance.get(option, default) - legacy_name = legacy_name or option - - if value == default and legacy_name in self.agentConfig: - self.log.warn( - "Using `{0}` in datadog.conf has been deprecated" - " in favor of `{1}` in disk.yaml".format(legacy_name, option) - ) - value = self.agentConfig.get(legacy_name) or default - setattr(self, '_{0}'.format(option), operation(value)) - - def collect_metrics_psutil(self): - self._valid_disks = {} - for part in psutil.disk_partitions(all=True): - # we check all exclude conditions - if self._exclude_disk_psutil(part): - continue - # Get disk metrics here to be able to exclude on total usage - try: - disk_usage = psutil.disk_usage(part.mountpoint) - except Exception, e: - self.log.debug("Unable to get disk metrics for %s: %s", - part.mountpoint, e) - continue - # Exclude disks with total disk size 0 - if disk_usage.total == 0: - continue - # For later, latency metrics - self._valid_disks[part.device] = (part.fstype, part.mountpoint) - self.log.debug('Passed: {0}'.format(part.device)) - - tags = [part.fstype] if self._tag_by_filesystem else [] - device_name = part.mountpoint if self._use_mount else part.device - - # Note: psutil (0.3.0 to at least 3.1.1) calculates in_use as (used / total) - # The problem here is that total includes reserved space the user - # doesn't have access to. This causes psutil to calculate a misleadng - # percentage for in_use; a lower percentage than df shows. - - # Calculate in_use w/o reserved space; consistent w/ df's Use% metric. 
- pmets = self._collect_part_metrics(part, disk_usage) - used = 'system.disk.used' - free = 'system.disk.free' - pmets['system.disk.in_use'] = pmets[used] / (pmets[used] + pmets[free]) - - # legacy check names c: vs psutil name C:\\ - if Platform.is_win32(): - device_name = device_name.strip('\\').lower() - for metric_name, metric_value in pmets.iteritems(): - self.gauge(metric_name, metric_value, - tags=tags, device_name=device_name) - # And finally, latency metrics, a legacy gift from the old Windows Check - if Platform.is_win32(): - self.collect_latency_metrics() - - def _exclude_disk_psutil(self, part): - # skip cd-rom drives with no disk in it; they may raise - # ENOENT, pop-up a Windows GUI error for a non-ready - # partition or just hang; - # and all the other excluded disks - return ((Platform.is_win32() and ('cdrom' in part.opts or - part.fstype == '')) or - self._exclude_disk(part.device, part.fstype)) - - # We don't want all those incorrect devices - def _exclude_disk(self, name, filesystem): - return (((not name or name == 'none') and not self._all_partitions) or - name in self._excluded_disks or - self._excluded_disk_re.match(name) or - filesystem in self._excluded_filesystems) - - def _collect_part_metrics(self, part, usage): - metrics = {} - for name in ['total', 'used', 'free']: - # For legacy reasons, the standard unit it kB - metrics[self.METRIC_DISK.format(name)] = getattr(usage, name) / 1024.0 - # FIXME: 6.x, use percent, a lot more logical than in_use - metrics[self.METRIC_DISK.format('in_use')] = usage.percent / 100.0 - if Platform.is_unix(): - metrics.update(self._collect_inodes_metrics(part.mountpoint)) - - return metrics - - def _collect_inodes_metrics(self, mountpoint): - metrics = {} - inodes = os.statvfs(mountpoint) - if inodes.f_files != 0: - total = inodes.f_files - free = inodes.f_ffree - metrics[self.METRIC_INODE.format('total')] = total - metrics[self.METRIC_INODE.format('free')] = free - metrics[self.METRIC_INODE.format('used')] = total - free - # FIXME: 6.x, use percent, a lot more logical than in_use - metrics[self.METRIC_INODE.format('in_use')] = \ - (total - free) / float(total) - return metrics - - def collect_latency_metrics(self): - for disk_name, disk in psutil.disk_io_counters(True).iteritems(): - self.log.debug('IO Counters: {0} -> {1}'.format(disk_name, disk)) - # x100 to have it as a percentage, - # /1000 as psutil returns the value in ms - read_time_pct = disk.read_time * 100.0 / 1000.0 - write_time_pct = disk.write_time * 100.0 / 1000.0 - self.rate(self.METRIC_DISK.format('read_time_pct'), - read_time_pct, device_name=disk_name) - self.rate(self.METRIC_DISK.format('write_time_pct'), - write_time_pct, device_name=disk_name) - - # no psutil, let's use df - def collect_metrics_manually(self): - df_out, _, _ = get_subprocess_output(self.DF_COMMAND + ['-k'], self.log) - self.log.debug(df_out) - for device in self._list_devices(df_out): - self.log.debug("Passed: {0}".format(device)) - tags = [device[1]] if self._tag_by_filesystem else [] - device_name = device[-1] if self._use_mount else device[0] - for metric_name, value in self._collect_metrics_manually(device).iteritems(): - self.gauge(metric_name, value, tags=tags, - device_name=device_name) - - def _collect_metrics_manually(self, device): - result = {} - - used = float(device[3]) - free = float(device[4]) - - # device is - # ["/dev/sda1", "ext4", 524288, 171642, 352646, "33%", "/"] - result[self.METRIC_DISK.format('total')] = float(device[2]) - result[self.METRIC_DISK.format('used')] = used - 
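For context, here is a self-contained sketch of how one whitespace-split `df -T -k` row maps onto these metrics; the sample values are made up, not taken from a real host:

```python
# One row of `df -T -k` output, already split on whitespace:
# [device, fstype, 1024-blocks, used, available, use%, mountpoint]
row = ["/dev/sda1", "ext4", "524288", "171642", "352646", "33%", "/"]

total_kb = float(row[2])
used_kb = float(row[3])
free_kb = float(row[4])

metrics = {
    "system.disk.total": total_kb,
    "system.disk.used": used_kb,
    "system.disk.free": free_kb,
    # computed rather than read from the use% column, for extra precision
    "system.disk.in_use": used_kb / (used_kb + free_kb),
}
print(metrics)
```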
result[self.METRIC_DISK.format('free')] = free - - # Rather than grabbing in_use, let's calculate it to be more precise - result[self.METRIC_DISK.format('in_use')] = used / (used + free) - - result.update(self._collect_inodes_metrics(device[-1])) - return result - - def _keep_device(self, device): - # device is for Unix - # [/dev/disk0s2, ext4, 244277768, 88767396, 155254372, 37%, /] - # First, skip empty lines. - # then filter our fake hosts like 'map -hosts'. - # Filesystem Type 1024-blocks Used Available Capacity Mounted on - # /dev/disk0s2 ext4 244277768 88767396 155254372 37% / - # map -hosts tmpfs 0 0 0 100% /net - # and finally filter out fake devices - return (device and len(device) > 1 and - device[2].isdigit() and - not self._exclude_disk(device[0], device[1])) - - def _flatten_devices(self, devices): - # Some volumes are stored on their own line. Rejoin them here. - previous = None - for parts in devices: - if len(parts) == 1: - previous = parts[0] - elif previous and self._is_number(parts[0]): - # collate with previous line - parts.insert(0, previous) - previous = None - else: - previous = None - return devices - - def _list_devices(self, df_output): - """ - Given raw output for the df command, transform it into a normalized - list devices. A 'device' is a list with fields corresponding to the - output of df output on each platform. - """ - all_devices = [l.strip().split() for l in df_output.splitlines()] - - # Skip the header row and empty lines. - raw_devices = [l for l in all_devices[1:] if l] - - # Flatten the disks that appear in the mulitple lines. - flattened_devices = self._flatten_devices(raw_devices) - - # Filter fake or unwanteddisks. - return [d for d in flattened_devices if self._keep_device(d)] diff --git a/py/checks/docker.py b/py/checks/docker.py deleted file mode 100644 index 1386f565369d..000000000000 --- a/py/checks/docker.py +++ /dev/null @@ -1,471 +0,0 @@ -# stdlib -import urllib2 -import urllib -import httplib -import socket -import os -import re -import time -from urlparse import urlsplit -from util import json -from collections import defaultdict - -# project -from checks import AgentCheck -from config import _is_affirmative - -EVENT_TYPE = SOURCE_TYPE_NAME = 'docker' - -CGROUP_METRICS = [ - { - "cgroup": "memory", - "file": "memory.stat", - "metrics": { - # Default metrics - "cache": ("docker.mem.cache", "gauge", True), - "rss": ("docker.mem.rss", "gauge", True), - "swap": ("docker.mem.swap", "gauge", True), - # Optional metrics - "active_anon": ("docker.mem.active_anon", "gauge", False), - "active_file": ("docker.mem.active_file", "gauge", False), - "inactive_anon": ("docker.mem.inactive_anon", "gauge", False), - "inactive_file": ("docker.mem.inactive_file", "gauge", False), - "mapped_file": ("docker.mem.mapped_file", "gauge", False), - "pgfault": ("docker.mem.pgfault", "rate", False), - "pgmajfault": ("docker.mem.pgmajfault", "rate", False), - "pgpgin": ("docker.mem.pgpgin", "rate", False), - "pgpgout": ("docker.mem.pgpgout", "rate", False), - "unevictable": ("docker.mem.unevictable", "gauge", False), - } - }, - { - "cgroup": "cpuacct", - "file": "cpuacct.stat", - "metrics": { - "user": ("docker.cpu.user", "rate", True), - "system": ("docker.cpu.system", "rate", True), - }, - }, -] - -DOCKER_METRICS = { - "SizeRw": ("docker.disk.size", "gauge"), -} - -DOCKER_TAGS = [ - "Command", - "Image", -] - -NEW_TAGS_MAP = { - "name": "container_name", - "image": "docker_image", - "command": "container_command", -} - -DEFAULT_SOCKET_TIMEOUT = 5 - -class 
DockerJSONDecodeError(Exception): - """ Raised when there is trouble parsing the API response sent by Docker Remote API """ - pass - -class UnixHTTPConnection(httplib.HTTPConnection): - """Class used in conjuction with UnixSocketHandler to make urllib2 - compatible with Unix sockets.""" - - socket_timeout = DEFAULT_SOCKET_TIMEOUT - - def __init__(self, unix_socket): - self._unix_socket = unix_socket - - def connect(self): - sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) - sock.connect(self._unix_socket) - sock.settimeout(self.socket_timeout) - self.sock = sock - - def __call__(self, *args, **kwargs): - httplib.HTTPConnection.__init__(self, *args, **kwargs) - return self - - -class UnixSocketHandler(urllib2.AbstractHTTPHandler): - """Class that makes Unix sockets work with urllib2 without any additional - dependencies.""" - def unix_open(self, req): - full_path = "%s%s" % urlsplit(req.get_full_url())[1:3] - path = os.path.sep - for part in full_path.split("/"): - path = os.path.join(path, part) - if not os.path.exists(path): - break - unix_socket = path - # add a host or else urllib2 complains - url = req.get_full_url().replace(unix_socket, "/localhost") - new_req = urllib2.Request(url, req.get_data(), dict(req.header_items())) - new_req.timeout = req.timeout - return self.do_open(UnixHTTPConnection(unix_socket), new_req) - - unix_request = urllib2.AbstractHTTPHandler.do_request_ - - -class Docker(AgentCheck): - """Collect metrics and events from Docker API and cgroups""" - - def __init__(self, name, init_config, agentConfig, instances=None): - AgentCheck.__init__(self, name, init_config, agentConfig, instances) - - # Initialize a HTTP opener with Unix socket support - socket_timeout = int(init_config.get('socket_timeout', 0)) or DEFAULT_SOCKET_TIMEOUT - UnixHTTPConnection.socket_timeout = socket_timeout - self.url_opener = urllib2.build_opener(UnixSocketHandler()) - - # Locate cgroups directories - self._mountpoints = {} - self._cgroup_filename_pattern = None - docker_root = init_config.get('docker_root', '/') - for metric in CGROUP_METRICS: - self._mountpoints[metric["cgroup"]] = self._find_cgroup(metric["cgroup"], docker_root) - - self._last_event_collection_ts = defaultdict(lambda: None) - - def check(self, instance): - # Report image metrics - self.warning('Using the "docker" check is deprecated and will be removed' - ' in a future version of the agent. Please use the "docker_daemon" one instead') - if _is_affirmative(instance.get('collect_images_stats', True)): - self._count_images(instance) - - # Get the list of containers and the index of their names - containers, ids_to_names = self._get_and_count_containers(instance) - - # Report container metrics from cgroups - skipped_container_ids = self._report_containers_metrics(containers, instance) - - # Send events from Docker API - if _is_affirmative(instance.get('collect_events', True)): - self._process_events(instance, ids_to_names, skipped_container_ids) - - - # Containers - - def _count_images(self, instance): - # It's not an important metric, keep going if it fails - try: - tags = instance.get("tags", []) - active_images = len(self._get_images(instance, get_all=False)) - all_images = len(self._get_images(instance, get_all=True)) - - self.gauge("docker.images.available", active_images, tags=tags) - self.gauge("docker.images.intermediate", (all_images - active_images), tags=tags) - except Exception, e: - self.warning("Failed to count Docker images. 
Exception: {0}".format(e)) - - def _get_and_count_containers(self, instance): - tags = instance.get("tags", []) - with_size = _is_affirmative(instance.get('collect_container_size', False)) - - service_check_name = 'docker.service_up' - try: - running_containers = self._get_containers(instance, with_size=with_size) - all_containers = self._get_containers(instance, get_all=True) - except (socket.timeout, urllib2.URLError), e: - self.service_check(service_check_name, AgentCheck.CRITICAL, - message="Unable to list Docker containers: {0}".format(e)) - raise Exception("Failed to collect the list of containers. Exception: {0}".format(e)) - self.service_check(service_check_name, AgentCheck.OK) - - running_containers_ids = set([container['Id'] for container in running_containers]) - - for container in all_containers: - container_tags = list(tags) - for key in DOCKER_TAGS: - tag = self._make_tag(key, container[key], instance) - if tag: - container_tags.append(tag) - if container['Id'] in running_containers_ids: - self.set("docker.containers.running", container['Id'], tags=container_tags) - else: - self.set("docker.containers.stopped", container['Id'], tags=container_tags) - - # The index of the names is used to generate and format events - ids_to_names = {} - for container in all_containers: - ids_to_names[container['Id']] = self._get_container_name(container) - - return running_containers, ids_to_names - - def _get_container_name(self, container): - # Use either the first container name or the container ID to name the container in our events - if container.get('Names', []): - return container['Names'][0].lstrip("/") - return container['Id'][:11] - - def _prepare_filters(self, instance): - # The reasoning is to check exclude first, so we can skip if there is no exclude - if not instance.get("exclude"): - return False - - # Compile regex - instance["exclude_patterns"] = [re.compile(rule) for rule in instance.get("exclude", [])] - instance["include_patterns"] = [re.compile(rule) for rule in instance.get("include", [])] - - return True - - def _is_container_excluded(self, instance, tags): - if self._tags_match_patterns(tags, instance.get("exclude_patterns")): - if self._tags_match_patterns(tags, instance.get("include_patterns")): - return False - return True - return False - - def _tags_match_patterns(self, tags, filters): - for rule in filters: - for tag in tags: - if re.match(rule, tag): - return True - return False - - def _report_containers_metrics(self, containers, instance): - skipped_container_ids = [] - collect_uncommon_metrics = _is_affirmative(instance.get("collect_all_metrics", False)) - tags = instance.get("tags", []) - - # Pre-compile regex to include/exclude containers - use_filters = self._prepare_filters(instance) - - for container in containers: - container_tags = list(tags) - for name in container["Names"]: - container_tags.append(self._make_tag("name", name.lstrip("/"), instance)) - for key in DOCKER_TAGS: - tag = self._make_tag(key, container[key], instance) - if tag: - container_tags.append(tag) - - # Check if the container is included/excluded via its tags - if use_filters and self._is_container_excluded(instance, container_tags): - skipped_container_ids.append(container['Id']) - continue - - for key, (dd_key, metric_type) in DOCKER_METRICS.iteritems(): - if key in container: - getattr(self, metric_type)(dd_key, int(container[key]), tags=container_tags) - for cgroup in CGROUP_METRICS: - stat_file = self._get_cgroup_file(cgroup["cgroup"], container['Id'], cgroup['file']) - stats = 
self._parse_cgroup_file(stat_file) - if stats: - for key, (dd_key, metric_type, common_metric) in cgroup['metrics'].iteritems(): - if key in stats and (common_metric or collect_uncommon_metrics): - getattr(self, metric_type)(dd_key, int(stats[key]), tags=container_tags) - if use_filters: - self.log.debug("List of excluded containers: {0}".format(skipped_container_ids)) - - return skipped_container_ids - - def _make_tag(self, key, value, instance): - tag_name = key.lower() - if tag_name == "command" and not instance.get("tag_by_command", False): - return None - if instance.get("new_tag_names", False): - tag_name = self._new_tags_conversion(tag_name) - - return "%s:%s" % (tag_name, value.strip()) - - def _new_tags_conversion(self, tag): - # Prefix tags to avoid conflict with AWS tags - return NEW_TAGS_MAP.get(tag, tag) - - - # Events - - def _process_events(self, instance, ids_to_names, skipped_container_ids): - try: - api_events = self._get_events(instance) - aggregated_events = self._pre_aggregate_events(api_events, skipped_container_ids) - events = self._format_events(aggregated_events, ids_to_names) - self._report_events(events) - except (socket.timeout, urllib2.URLError): - self.warning('Timeout during socket connection. Events will be missing.') - - def _pre_aggregate_events(self, api_events, skipped_container_ids): - # Aggregate events, one per image. Put newer events first. - events = defaultdict(list) - for event in api_events: - # Skip events related to filtered containers - if event['id'] in skipped_container_ids: - self.log.debug("Excluded event: container {0} status changed to {1}".format( - event['id'], event['status'])) - continue - # Known bug: from may be missing - if 'from' in event: - events[event['from']].insert(0, event) - - return events - - def _format_events(self, aggregated_events, ids_to_names): - events = [] - for image_name, event_group in aggregated_events.iteritems(): - max_timestamp = 0 - status = defaultdict(int) - status_change = [] - for event in event_group: - max_timestamp = max(max_timestamp, int(event['time'])) - status[event['status']] += 1 - container_name = event['id'][:12] - if event['id'] in ids_to_names: - container_name = "%s %s" % (container_name, ids_to_names[event['id']]) - status_change.append([container_name, event['status']]) - - status_text = ", ".join(["%d %s" % (count, st) for st, count in status.iteritems()]) - msg_title = "%s %s on %s" % (image_name, status_text, self.hostname) - msg_body = ("%%%\n" - "{image_name} {status} on {hostname}\n" - "```\n{status_changes}\n```\n" - "%%%").format( - image_name=image_name, - status=status_text, - hostname=self.hostname, - status_changes="\n".join( - ["%s \t%s" % (change[1].upper(), change[0]) for change in status_change]) - ) - events.append({ - 'timestamp': max_timestamp, - 'host': self.hostname, - 'event_type': EVENT_TYPE, - 'msg_title': msg_title, - 'msg_text': msg_body, - 'source_type_name': EVENT_TYPE, - 'event_object': 'docker:%s' % image_name, - }) - - return events - - def _report_events(self, events): - for ev in events: - self.log.debug("Creating event: %s" % ev['msg_title']) - self.event(ev) - - - # Docker API - - def _get_containers(self, instance, with_size=False, get_all=False): - """Gets the list of running/all containers in Docker.""" - return self._get_json("%(url)s/containers/json" % instance, params={'size': with_size, 'all': get_all}) - - def _get_images(self, instance, with_size=True, get_all=False): - """Gets the list of images in Docker.""" - return 
self._get_json("%(url)s/images/json" % instance, params={'all': get_all}) - - def _get_events(self, instance): - """Get the list of events """ - now = int(time.time()) - try: - result = self._get_json( - "%s/events" % instance["url"], - params={ - "until": now, - "since": self._last_event_collection_ts[instance["url"]] or now - 60, - }, - multi=True - ) - self._last_event_collection_ts[instance["url"]] = now - if type(result) == dict: - result = [result] - return result - except DockerJSONDecodeError: - return [] - - def _get_json(self, uri, params=None, multi=False): - """Utility method to get and parse JSON streams.""" - if params: - uri = "%s?%s" % (uri, urllib.urlencode(params)) - self.log.debug("Connecting to Docker API at: %s" % uri) - req = urllib2.Request(uri, None) - - try: - request = self.url_opener.open(req) - except urllib2.URLError, e: - if "Errno 13" in str(e): - raise Exception("Unable to connect to socket. dd-agent user must be part of the 'docker' group") - raise - - response = request.read() - response = response.replace('\n', '') # Some Docker API versions occassionally send newlines in responses - self.log.debug('Docker API response: %s', response) - if multi and "}{" in response: # docker api sometimes returns juxtaposed json dictionaries - response = "[{0}]".format(response.replace("}{", "},{")) - - if not response: - return [] - - try: - return json.loads(response) - except Exception as e: - self.log.error('Failed to parse Docker API response: %s', response) - raise DockerJSONDecodeError - - # Cgroups - - def _find_cgroup_filename_pattern(self): - if self._mountpoints: - # We try with different cgroups so that it works even if only one is properly working - for mountpoint in self._mountpoints.values(): - stat_file_path_lxc = os.path.join(mountpoint, "lxc") - stat_file_path_docker = os.path.join(mountpoint, "docker") - stat_file_path_coreos = os.path.join(mountpoint, "system.slice") - - if os.path.exists(stat_file_path_lxc): - return os.path.join('%(mountpoint)s/lxc/%(id)s/%(file)s') - elif os.path.exists(stat_file_path_docker): - return os.path.join('%(mountpoint)s/docker/%(id)s/%(file)s') - elif os.path.exists(stat_file_path_coreos): - return os.path.join('%(mountpoint)s/system.slice/docker-%(id)s.scope/%(file)s') - - raise Exception("Cannot find Docker cgroup directory. Be sure your system is supported.") - - def _get_cgroup_file(self, cgroup, container_id, filename): - # This can't be initialized at startup because cgroups may not be mounted yet - if not self._cgroup_filename_pattern: - self._cgroup_filename_pattern = self._find_cgroup_filename_pattern() - - return self._cgroup_filename_pattern % (dict( - mountpoint=self._mountpoints[cgroup], - id=container_id, - file=filename, - )) - - def _find_cgroup(self, hierarchy, docker_root): - """Finds the mount point for a specified cgroup hierarchy. Works with - old style and new style mounts.""" - with open(os.path.join(docker_root, "/proc/mounts"), 'r') as fp: - mounts = map(lambda x: x.split(), fp.read().splitlines()) - - cgroup_mounts = filter(lambda x: x[2] == "cgroup", mounts) - if len(cgroup_mounts) == 0: - raise Exception("Can't find mounted cgroups. 
If you run the Agent inside a container," - " please refer to the documentation.") - # Old cgroup style - if len(cgroup_mounts) == 1: - return os.path.join(docker_root, cgroup_mounts[0][1]) - - candidate = None - for _, mountpoint, _, opts, _, _ in cgroup_mounts: - if hierarchy in opts: - if mountpoint.startswith("/host/"): - return os.path.join(docker_root, mountpoint) - candidate = mountpoint - if candidate is not None: - return os.path.join(docker_root, candidate) - raise Exception("Can't find mounted %s cgroups." % hierarchy) - - - def _parse_cgroup_file(self, stat_file): - """Parses a cgroup pseudo file for key/values.""" - self.log.debug("Opening cgroup file: %s" % stat_file) - try: - with open(stat_file, 'r') as fp: - return dict(map(lambda x: x.split(), fp.read().splitlines())) - except IOError: - # It is possible that the container got stopped between the API call and now - self.log.info("Can't open %s. Metrics for this container are skipped." % stat_file) diff --git a/py/checks/docker_daemon.py b/py/checks/docker_daemon.py deleted file mode 100644 index 028945d18c88..000000000000 --- a/py/checks/docker_daemon.py +++ /dev/null @@ -1,722 +0,0 @@ -# stdlib -import os -import re -import requests -import time -import socket -import urllib2 -from collections import defaultdict, Counter, deque - -# project -from checks import AgentCheck -from config import _is_affirmative -from utils.dockerutil import find_cgroup, find_cgroup_filename_pattern, get_client, MountException, \ - set_docker_settings, image_tag_extractor, container_name_extractor -from utils.kubeutil import get_kube_labels -from utils.platform import Platform - - -EVENT_TYPE = 'docker' -SERVICE_CHECK_NAME = 'docker.service_up' -SIZE_REFRESH_RATE = 5 # Collect container sizes every 5 iterations of the check -MAX_CGROUP_LISTING_RETRIES = 3 -CONTAINER_ID_RE = re.compile('[0-9a-f]{64}') -POD_NAME_LABEL = "io.kubernetes.pod.name" - -GAUGE = AgentCheck.gauge -RATE = AgentCheck.rate -HISTORATE = AgentCheck.generate_historate_func(["container_name"]) -HISTO = AgentCheck.generate_histogram_func(["container_name"]) -FUNC_MAP = { - GAUGE: {True: HISTO, False: GAUGE}, - RATE: {True: HISTORATE, False: RATE} -} - -CGROUP_METRICS = [ - { - "cgroup": "memory", - "file": "memory.stat", - "metrics": { - "cache": ("docker.mem.cache", GAUGE), - "rss": ("docker.mem.rss", GAUGE), - "swap": ("docker.mem.swap", GAUGE), - }, - "to_compute": { - # We only get these metrics if they are properly set, i.e. 
they are a "reasonable" value - "docker.mem.limit": (["hierarchical_memory_limit"], lambda x: float(x) if float(x) < 2 ** 60 else None, GAUGE), - "docker.mem.sw_limit": (["hierarchical_memsw_limit"], lambda x: float(x) if float(x) < 2 ** 60 else None, GAUGE), - "docker.mem.in_use": (["rss", "hierarchical_memory_limit"], lambda x,y: float(x)/float(y) if float(y) < 2 ** 60 else None, GAUGE), - "docker.mem.sw_in_use": (["swap", "rss", "hierarchical_memsw_limit"], lambda x,y,z: float(x + y)/float(z) if float(z) < 2 ** 60 else None, GAUGE) - - } - }, - { - "cgroup": "cpuacct", - "file": "cpuacct.stat", - "metrics": { - "user": ("docker.cpu.user", RATE), - "system": ("docker.cpu.system", RATE), - }, - }, - { - "cgroup": "blkio", - "file": 'blkio.throttle.io_service_bytes', - "metrics": { - "io_read": ("docker.io.read_bytes", RATE), - "io_write": ("docker.io.write_bytes", RATE), - }, - }, -] - -DEFAULT_CONTAINER_TAGS = [ - "docker_image", - "image_name", - "image_tag", -] - -DEFAULT_PERFORMANCE_TAGS = [ - "container_name", - "docker_image", - "image_name", - "image_tag", -] - -DEFAULT_IMAGE_TAGS = [ - 'image_name', - 'image_tag' -] - - -TAG_EXTRACTORS = { - "docker_image": lambda c: [c["Image"]], - "image_name": lambda c: image_tag_extractor(c, 0), - "image_tag": lambda c: image_tag_extractor(c, 1), - "container_command": lambda c: [c["Command"]], - "container_name": container_name_extractor, -} - -CONTAINER = "container" -PERFORMANCE = "performance" -FILTERED = "filtered" -IMAGE = "image" - - -def get_mountpoints(docker_root): - mountpoints = {} - for metric in CGROUP_METRICS: - mountpoints[metric["cgroup"]] = find_cgroup(metric["cgroup"], docker_root) - return mountpoints - -def get_filters(include, exclude): - # The reasoning is to check exclude first, so we can skip if there is no exclude - if not exclude: - return - - filtered_tag_names = [] - exclude_patterns = [] - include_patterns = [] - - # Compile regex - for rule in exclude: - exclude_patterns.append(re.compile(rule)) - filtered_tag_names.append(rule.split(':')[0]) - for rule in include: - include_patterns.append(re.compile(rule)) - filtered_tag_names.append(rule.split(':')[0]) - - return set(exclude_patterns), set(include_patterns), set(filtered_tag_names) - - -class DockerDaemon(AgentCheck): - """Collect metrics and events from Docker API and cgroups.""" - - def __init__(self, name, init_config, agentConfig, instances=None): - if instances is not None and len(instances) > 1: - raise Exception("Docker check only supports one configured instance.") - AgentCheck.__init__(self, name, init_config, - agentConfig, instances=instances) - - self.init_success = False - self.init() - - def is_k8s(self): - return self.is_check_enabled("kubernetes") - - def init(self): - try: - # We configure the check with the right cgroup settings for this host - # Just needs to be done once - instance = self.instances[0] - set_docker_settings(self.init_config, instance) - - self.client = get_client() - self._docker_root = self.init_config.get('docker_root', '/') - self._mountpoints = get_mountpoints(self._docker_root) - self.cgroup_listing_retries = 0 - self._latest_size_query = 0 - self._filtered_containers = set() - self._disable_net_metrics = False - - # At first run we'll just collect the events from the latest 60 secs - self._last_event_collection_ts = int(time.time()) - 60 - - # Set tagging options - self.custom_tags = instance.get("tags", []) - self.collect_labels_as_tags = instance.get("collect_labels_as_tags", []) - self.kube_labels = {} - - 
self.use_histogram = _is_affirmative(instance.get('use_histogram', False)) - performance_tags = instance.get("performance_tags", DEFAULT_PERFORMANCE_TAGS) - - self.tag_names = { - CONTAINER: instance.get("container_tags", DEFAULT_CONTAINER_TAGS), - PERFORMANCE: performance_tags, - IMAGE: instance.get('image_tags', DEFAULT_IMAGE_TAGS) - - } - - # Set filtering settings - if not instance.get("exclude"): - self._filtering_enabled = False - if instance.get("include"): - self.log.warning("You must specify an exclude section to enable filtering") - else: - self._filtering_enabled = True - include = instance.get("include", []) - exclude = instance.get("exclude", []) - self._exclude_patterns, self._include_patterns, _filtered_tag_names = get_filters(include, exclude) - self.tag_names[FILTERED] = _filtered_tag_names - - - # Other options - self.collect_image_stats = _is_affirmative(instance.get('collect_images_stats', False)) - self.collect_container_size = _is_affirmative(instance.get('collect_container_size', False)) - self.collect_events = _is_affirmative(instance.get('collect_events', True)) - self.collect_image_size = _is_affirmative(instance.get('collect_image_size', False)) - self.collect_ecs_tags = _is_affirmative(instance.get('ecs_tags', True)) and Platform.is_ecs_instance() - - self.ecs_tags = {} - - except Exception, e: - self.log.critical(e) - self.warning("Initialization failed. Will retry at next iteration") - else: - self.init_success = True - - def check(self, instance): - """Run the Docker check for one instance.""" - if not self.init_success: - # Initialization can fail if cgroups are not ready. So we retry if needed - # https://github.com/DataDog/dd-agent/issues/1896 - self.init() - if not self.init_success: - # Initialization failed, will try later - return - - # Report image metrics - if self.collect_image_stats: - self._count_and_weigh_images() - - if self.collect_ecs_tags: - self.refresh_ecs_tags() - - if self.is_k8s(): - self.kube_labels = get_kube_labels() - - # Get the list of containers and the index of their names - containers_by_id = self._get_and_count_containers() - containers_by_id = self._crawl_container_pids(containers_by_id) - - # Report performance container metrics (cpu, mem, net, io) - self._report_performance_metrics(containers_by_id) - - if self.collect_container_size: - self._report_container_size(containers_by_id) - - # Send events from Docker API - if self.collect_events: - self._process_events(containers_by_id) - - def _count_and_weigh_images(self): - try: - tags = self._get_tags() - active_images = self.client.images(all=False) - active_images_len = len(active_images) - all_images_len = len(self.client.images(quiet=True, all=True)) - self.gauge("docker.images.available", active_images_len, tags=tags) - self.gauge("docker.images.intermediate", (all_images_len - active_images_len), tags=tags) - - if self.collect_image_size: - self._report_image_size(active_images) - - except Exception, e: - # It's not an important metric, keep going if it fails - self.warning("Failed to count Docker images. 
Exception: {0}".format(e)) - - def _get_and_count_containers(self): - """List all the containers from the API, filter and count them.""" - - # Querying the size of containers is slow, we don't do it at each run - must_query_size = self.collect_container_size and self._latest_size_query == 0 - self._latest_size_query = (self._latest_size_query + 1) % SIZE_REFRESH_RATE - - running_containers_count = Counter() - all_containers_count = Counter() - - try: - containers = self.client.containers(all=True, size=must_query_size) - except Exception, e: - message = "Unable to list Docker containers: {0}".format(e) - self.service_check(SERVICE_CHECK_NAME, AgentCheck.CRITICAL, - message=message) - raise Exception(message) - - else: - self.service_check(SERVICE_CHECK_NAME, AgentCheck.OK) - - # Filter containers according to the exclude/include rules - self._filter_containers(containers) - - containers_by_id = {} - - for container in containers: - container_name = container_name_extractor(container)[0] - - container_status_tags = self._get_tags(container, CONTAINER) - - all_containers_count[tuple(sorted(container_status_tags))] += 1 - if self._is_container_running(container): - running_containers_count[tuple(sorted(container_status_tags))] += 1 - - # Check if the container is included/excluded via its tags - if self._is_container_excluded(container): - self.log.debug("Container {0} is excluded".format(container_name)) - continue - - containers_by_id[container['Id']] = container - - for tags, count in running_containers_count.iteritems(): - self.gauge("docker.containers.running", count, tags=list(tags)) - - for tags, count in all_containers_count.iteritems(): - stopped_count = count - running_containers_count[tags] - self.gauge("docker.containers.stopped", stopped_count, tags=list(tags)) - - return containers_by_id - - def _is_container_running(self, container): - """Tell if a container is running, according to its status. - - There is no "nice" API field to figure it out. We just look at the "Status" field, knowing how it is generated. 
- See: https://github.com/docker/docker/blob/v1.6.2/daemon/state.go#L35 - """ - return container["Status"].startswith("Up") or container["Status"].startswith("Restarting") - - def _get_tags(self, entity=None, tag_type=None): - """Generate the tags for a given entity (container or image) according to a list of tag names.""" - # Start with custom tags - tags = list(self.custom_tags) - - # Collect pod names as tags on kubernetes - if self.is_k8s() and POD_NAME_LABEL not in self.collect_labels_as_tags: - self.collect_labels_as_tags.append(POD_NAME_LABEL) - - if entity is not None: - pod_name = None - - # Get labels as tags - labels = entity.get("Labels") - if labels is not None: - for k in self.collect_labels_as_tags: - if k in labels: - v = labels[k] - if k == POD_NAME_LABEL and self.is_k8s(): - pod_name = v - k = "pod_name" - if "-" in pod_name: - replication_controller = "-".join(pod_name.split("-")[:-1]) - if "/" in replication_controller: - namespace, replication_controller = replication_controller.split("/", 1) - tags.append("kube_namespace:%s" % namespace) - - tags.append("kube_replication_controller:%s" % replication_controller) - - if not v: - tags.append(k) - else: - tags.append("%s:%s" % (k,v)) - if k == POD_NAME_LABEL and self.is_k8s() and k not in labels: - tags.append("pod_name:no_pod") - - # Get entity specific tags - if tag_type is not None: - tag_names = self.tag_names[tag_type] - for tag_name in tag_names: - tag_value = self._extract_tag_value(entity, tag_name) - if tag_value is not None: - for t in tag_value: - tags.append('%s:%s' % (tag_name, str(t).strip())) - - # Add ECS tags - if self.collect_ecs_tags: - entity_id = entity.get("Id") - if entity_id in self.ecs_tags: - ecs_tags = self.ecs_tags[entity_id] - tags.extend(ecs_tags) - - # Add kube labels - if self.is_k8s(): - kube_tags = self.kube_labels.get(pod_name) - if kube_tags: - tags.extend(list(kube_tags)) - - - return tags - - def _extract_tag_value(self, entity, tag_name): - """Extra tag information from the API result (containers or images). - Cache extracted tags inside the entity object. 
- """ - if tag_name not in TAG_EXTRACTORS: - self.warning("{0} isn't a supported tag".format(tag_name)) - return - - # Check for already extracted tags - if "_tag_values" not in entity: - entity["_tag_values"] = {} - - if tag_name not in entity["_tag_values"]: - entity["_tag_values"][tag_name] = TAG_EXTRACTORS[tag_name](entity) - - return entity["_tag_values"][tag_name] - - def refresh_ecs_tags(self): - ecs_config = self.client.inspect_container('ecs-agent') - ip = ecs_config.get('NetworkSettings', {}).get('IPAddress') - ports = ecs_config.get('NetworkSettings', {}).get('Ports') - port = ports.keys()[0].split('/')[0] if ports else None - ecs_tags = {} - if ip and port: - tasks = requests.get('http://%s:%s/v1/tasks' % (ip, port)).json() - for task in tasks.get('Tasks', []): - for container in task.get('Containers', []): - tags = ['task_name:%s' % task['Family'], 'task_version:%s' % task['Version']] - ecs_tags[container['DockerId']] = tags - - self.ecs_tags = ecs_tags - - def _filter_containers(self, containers): - if not self._filtering_enabled: - return - - self._filtered_containers = set() - for container in containers: - container_tags = self._get_tags(container, FILTERED) - if self._are_tags_filtered(container_tags): - container_name = container_name_extractor(container)[0] - self._filtered_containers.add(container_name) - self.log.debug("Container {0} is filtered".format(container["Names"][0])) - - - def _are_tags_filtered(self, tags): - if self._tags_match_patterns(tags, self._exclude_patterns): - if self._tags_match_patterns(tags, self._include_patterns): - return False - return True - return False - - def _tags_match_patterns(self, tags, filters): - for rule in filters: - for tag in tags: - if re.match(rule, tag): - return True - return False - - def _is_container_excluded(self, container): - """Check if a container is excluded according to the filter rules. - - Requires _filter_containers to run first. - """ - container_name = container_name_extractor(container)[0] - return container_name in self._filtered_containers - - def _report_container_size(self, containers_by_id): - container_list_with_size = None - for container in containers_by_id.itervalues(): - if self._is_container_excluded(container): - continue - - tags = self._get_tags(container, PERFORMANCE) - m_func = FUNC_MAP[GAUGE][self.use_histogram] - if "SizeRw" in container: - - m_func(self, 'docker.container.size_rw', container['SizeRw'], - tags=tags) - if "SizeRootFs" in container: - m_func( - self, 'docker.container.size_rootfs', container['SizeRootFs'], - tags=tags) - - def _report_image_size(self, images): - for image in images: - tags = self._get_tags(image, IMAGE) - if 'VirtualSize' in image: - self.gauge('docker.image.virtual_size', image['VirtualSize'], tags=tags) - if 'Size' in image: - self.gauge('docker.image.size', image['Size'], tags=tags) - - # Performance metrics - - def _report_performance_metrics(self, containers_by_id): - - containers_without_proc_root = [] - for container in containers_by_id.itervalues(): - if self._is_container_excluded(container) or not self._is_container_running(container): - continue - - tags = self._get_tags(container, PERFORMANCE) - self._report_cgroup_metrics(container, tags) - if "_proc_root" not in container: - containers_without_proc_root.append(container_name_extractor(container)[0]) - continue - self._report_net_metrics(container, tags) - - if containers_without_proc_root: - message = "Couldn't find pid directory for container: {0}. 
They'll be missing network metrics".format( - ",".join(containers_without_proc_root)) - if not self.is_k8s(): - self.warning(message) - else: - # On kubernetes, this is kind of expected. Network metrics will be collected by the kubernetes integration anyway - self.log.debug(message) - - - def _report_cgroup_metrics(self, container, tags): - try: - for cgroup in CGROUP_METRICS: - stat_file = self._get_cgroup_file(cgroup["cgroup"], container['Id'], cgroup['file']) - stats = self._parse_cgroup_file(stat_file) - if stats: - for key, (dd_key, metric_func) in cgroup['metrics'].iteritems(): - metric_func = FUNC_MAP[metric_func][self.use_histogram] - if key in stats: - metric_func(self, dd_key, int(stats[key]), tags=tags) - - # Computed metrics - for mname, (key_list, fct, metric_func) in cgroup.get('to_compute', {}).iteritems(): - values = [stats[key] for key in key_list if key in stats] - if len(values) != len(key_list): - self.log.debug("Couldn't compute {0}, some keys were missing.".format(mname)) - continue - value = fct(*values) - metric_func = FUNC_MAP[metric_func][self.use_histogram] - if value is not None: - metric_func(self, mname, value, tags=tags) - - except MountException as ex: - if self.cgroup_listing_retries > MAX_CGROUP_LISTING_RETRIES: - raise ex - else: - self.warning("Couldn't find the cgroup files. Skipping the CGROUP_METRICS for now." - "Will retry {0} times before failing.".format(MAX_CGROUP_LISTING_RETRIES - self.cgroup_listing_retries)) - self.cgroup_listing_retries += 1 - else: - self.cgroup_listing_retries = 0 - - def _report_net_metrics(self, container, tags): - """Find container network metrics by looking at /proc/$PID/net/dev of the container process.""" - if self._disable_net_metrics: - self.log.debug("Network metrics are disabled. Skipping") - return - - proc_net_file = os.path.join(container['_proc_root'], 'net/dev') - try: - with open(proc_net_file, 'r') as fp: - lines = fp.readlines() - """Two first lines are headers: - Inter-| Receive | Transmit - face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed - """ - for l in lines[2:]: - cols = l.split(':', 1) - interface_name = str(cols[0]).strip() - if interface_name == 'eth0': - x = cols[1].split() - m_func = FUNC_MAP[RATE][self.use_histogram] - m_func(self, "docker.net.bytes_rcvd", long(x[0]), tags) - m_func(self, "docker.net.bytes_sent", long(x[8]), tags) - break - except Exception, e: - # It is possible that the container got stopped between the API call and now - self.warning("Failed to report IO metrics from file {0}. Exception: {1}".format(proc_net_file, e)) - - def _process_events(self, containers_by_id): - try: - api_events = self._get_events() - aggregated_events = self._pre_aggregate_events(api_events, containers_by_id) - events = self._format_events(aggregated_events, containers_by_id) - except (socket.timeout, urllib2.URLError): - self.warning('Timeout when collecting events. Events will be missing.') - return - except Exception, e: - self.warning("Unexpected exception when collecting events: {0}. 
" - "Events will be missing".format(e)) - return - - for ev in events: - self.log.debug("Creating event: %s" % ev['msg_title']) - self.event(ev) - - def _get_events(self): - """Get the list of events.""" - now = int(time.time()) - events = [] - event_generator = self.client.events(since=self._last_event_collection_ts, - until=now, decode=True) - for event in event_generator: - if event != '': - events.append(event) - self._last_event_collection_ts = now - return events - - def _pre_aggregate_events(self, api_events, containers_by_id): - # Aggregate events, one per image. Put newer events first. - events = defaultdict(deque) - for event in api_events: - # Skip events related to filtered containers - container = containers_by_id.get(event['id']) - if container is not None and self._is_container_excluded(container): - self.log.debug("Excluded event: container {0} status changed to {1}".format( - event['id'], event['status'])) - continue - # Known bug: from may be missing - if 'from' in event: - events[event['from']].appendleft(event) - return events - - def _format_events(self, aggregated_events, containers_by_id): - events = [] - for image_name, event_group in aggregated_events.iteritems(): - max_timestamp = 0 - status = defaultdict(int) - status_change = [] - container_names = set() - for event in event_group: - max_timestamp = max(max_timestamp, int(event['time'])) - status[event['status']] += 1 - container_name = event['id'][:11] - if event['id'] in containers_by_id: - container_name = container_name_extractor(containers_by_id[event['id']])[0] - - container_names.add(container_name) - status_change.append([container_name, event['status']]) - - status_text = ", ".join(["%d %s" % (count, st) for st, count in status.iteritems()]) - msg_title = "%s %s on %s" % (image_name, status_text, self.hostname) - msg_body = ( - "%%%\n" - "{image_name} {status} on {hostname}\n" - "```\n{status_changes}\n```\n" - "%%%" - ).format( - image_name=image_name, - status=status_text, - hostname=self.hostname, - status_changes="\n".join( - ["%s \t%s" % (change[1].upper(), change[0]) for change in status_change]) - ) - events.append({ - 'timestamp': max_timestamp, - 'host': self.hostname, - 'event_type': EVENT_TYPE, - 'msg_title': msg_title, - 'msg_text': msg_body, - 'source_type_name': EVENT_TYPE, - 'event_object': 'docker:%s' % image_name, - 'tags': ['container_name:%s' % c_name for c_name in container_names] - }) - - return events - - # Cgroups - - def _get_cgroup_file(self, cgroup, container_id, filename): - """Find a specific cgroup file, containing metrics to extract.""" - params = { - "mountpoint": self._mountpoints[cgroup], - "id": container_id, - "file": filename, - } - - return find_cgroup_filename_pattern(self._mountpoints, container_id) % (params) - - def _parse_cgroup_file(self, stat_file): - """Parse a cgroup pseudo file for key/values.""" - self.log.debug("Opening cgroup file: %s" % stat_file) - try: - with open(stat_file, 'r') as fp: - if 'blkio' in stat_file: - return self._parse_blkio_metrics(fp.read().splitlines()) - else: - return dict(map(lambda x: x.split(' ', 1), fp.read().splitlines())) - except IOError: - # It is possible that the container got stopped between the API call and now - self.log.info("Can't open %s. Metrics for this container are skipped." 
% stat_file) - - def _parse_blkio_metrics(self, stats): - """Parse the blkio metrics.""" - metrics = { - 'io_read': 0, - 'io_write': 0, - } - for line in stats: - if 'Read' in line: - metrics['io_read'] += int(line.split()[2]) - if 'Write' in line: - metrics['io_write'] += int(line.split()[2]) - return metrics - - # proc files - def _crawl_container_pids(self, container_dict): - """Crawl `/proc` to find container PIDs and add them to `containers_by_id`.""" - proc_path = os.path.join(self._docker_root, 'proc') - pid_dirs = [_dir for _dir in os.listdir(proc_path) if _dir.isdigit()] - - if len(pid_dirs) == 0: - self.warning("Unable to find any pid directory in {0}. " - "If you are running the agent in a container, make sure to " - 'share the volume properly: "/proc:/host/proc:ro". ' - "See https://github.com/DataDog/docker-dd-agent/blob/master/README.md for more information. " - "Network metrics will be missing".format(proc_path)) - self._disable_net_metrics = True - return container_dict - - self._disable_net_metrics = False - - for folder in pid_dirs: - - try: - path = os.path.join(proc_path, folder, 'cgroup') - with open(path, 'r') as f: - content = [line.strip().split(':') for line in f.readlines()] - except Exception, e: - self.warning("Cannot read %s : %s" % (path, str(e))) - continue - - try: - for line in content: - if line[1] in ('cpu,cpuacct', 'cpuacct,cpu', 'cpuacct') and 'docker' in line[2]: - cpuacct = line[2] - break - else: - continue - - match = CONTAINER_ID_RE.search(cpuacct) - if match: - container_id = match.group(0) - container_dict[container_id]['_pid'] = folder - container_dict[container_id]['_proc_root'] = os.path.join(proc_path, folder) - except Exception, e: - self.warning("Cannot parse %s content: %s" % (path, str(e))) - continue - return container_dict diff --git a/py/checks/elastic.py b/py/checks/elastic.py deleted file mode 100644 index c2fda307eca8..000000000000 --- a/py/checks/elastic.py +++ /dev/null @@ -1,537 +0,0 @@ -# stdlib -from collections import defaultdict, namedtuple -import time -import urlparse - -# 3p -import requests - -# project -from checks import AgentCheck -from config import _is_affirmative -from util import headers - - -class NodeNotFound(Exception): - pass - - -ESInstanceConfig = namedtuple( - 'ESInstanceConfig', [ - 'pshard_stats', - 'cluster_stats', - 'password', - 'service_check_tags', - 'tags', - 'timeout', - 'url', - 'username', - ]) - - -class ESCheck(AgentCheck): - SERVICE_CHECK_CONNECT_NAME = 'elasticsearch.can_connect' - SERVICE_CHECK_CLUSTER_STATUS = 'elasticsearch.cluster_health' - - DEFAULT_TIMEOUT = 5 - - # Clusterwise metrics, pre aggregated on ES, compatible with all ES versions - PRIMARY_SHARD_METRICS = { - "elasticsearch.primaries.docs.count": ("gauge", "_all.primaries.docs.count"), - "elasticsearch.primaries.docs.deleted": ("gauge", "_all.primaries.docs.deleted"), - "elasticsearch.primaries.store.size": ("gauge", "_all.primaries.store.size_in_bytes"), - "elasticsearch.primaries.indexing.index.total": ("gauge", "_all.primaries.indexing.index_total"), - "elasticsearch.primaries.indexing.index.time": ("gauge", "_all.primaries.indexing.index_time_in_millis", lambda v: float(v)/1000), - "elasticsearch.primaries.indexing.index.current": ("gauge", "_all.primaries.indexing.index_current"), - "elasticsearch.primaries.indexing.delete.total": ("gauge", "_all.primaries.indexing.delete_total"), - "elasticsearch.primaries.indexing.delete.time": ("gauge", "_all.primaries.indexing.delete_time_in_millis", lambda v: float(v)/1000), - 
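Each entry in these metric tables pairs a submission type with a dotted path into the stats JSON, plus an optional conversion (usually milliseconds to seconds). The sketch below shows how such a table can be consumed; the `dig` helper and the response fragment are illustrative only, not the check's own code:

```python
def dig(data, dotted_path):
    """Walk a nested dict following a dot-separated path; return None if absent."""
    value = data
    for key in dotted_path.split("."):
        if not isinstance(value, dict) or key not in value:
            return None
        value = value[key]
    return value

metrics = {
    "elasticsearch.primaries.get.time": (
        "gauge", "_all.primaries.get.time_in_millis", lambda v: float(v) / 1000),
    "elasticsearch.primaries.get.current": (
        "gauge", "_all.primaries.get.current"),
}

# Hypothetical fragment of an /_stats response.
stats_json = {"_all": {"primaries": {"get": {"time_in_millis": 1500, "current": 2}}}}

for name, definition in metrics.items():
    metric_type, path = definition[0], definition[1]
    raw = dig(stats_json, path)
    if raw is None:
        continue
    value = definition[2](raw) if len(definition) > 2 else raw
    print(metric_type, name, value)  # e.g. gauge elasticsearch.primaries.get.time 1.5
```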
"elasticsearch.primaries.indexing.delete.current": ("gauge", "_all.primaries.indexing.delete_current"), - "elasticsearch.primaries.get.total": ("gauge", "_all.primaries.get.total"), - "elasticsearch.primaries.get.time": ("gauge", "_all.primaries.get.time_in_millis", lambda v: float(v)/1000), - "elasticsearch.primaries.get.current": ("gauge", "_all.primaries.get.current"), - "elasticsearch.primaries.get.exists.total": ("gauge", "_all.primaries.get.exists_total"), - "elasticsearch.primaries.get.exists.time": ("gauge", "_all.primaries.get.exists_time_in_millis", lambda v: float(v)/1000), - "elasticsearch.primaries.get.missing.total": ("gauge", "_all.primaries.get.missing_total"), - "elasticsearch.primaries.get.missing.time": ("gauge", "_all.primaries.get.missing_time_in_millis", lambda v: float(v)/1000), - "elasticsearch.primaries.search.query.total": ("gauge", "_all.primaries.search.query_total"), - "elasticsearch.primaries.search.query.time": ("gauge", "_all.primaries.search.query_time_in_millis", lambda v: float(v)/1000), - "elasticsearch.primaries.search.query.current": ("gauge", "_all.primaries.search.query_current"), - "elasticsearch.primaries.search.fetch.total": ("gauge", "_all.primaries.search.fetch_total"), - "elasticsearch.primaries.search.fetch.time": ("gauge", "_all.primaries.search.fetch_time_in_millis", lambda v: float(v)/1000), - "elasticsearch.primaries.search.fetch.current": ("gauge", "_all.primaries.search.fetch_current") - } - - PRIMARY_SHARD_METRICS_POST_1_0 = { - "elasticsearch.primaries.merges.current": ("gauge", "_all.primaries.merges.current"), - "elasticsearch.primaries.merges.current.docs": ("gauge", "_all.primaries.merges.current_docs"), - "elasticsearch.primaries.merges.current.size": ("gauge", "_all.primaries.merges.current_size_in_bytes"), - "elasticsearch.primaries.merges.total": ("gauge", "_all.primaries.merges.total"), - "elasticsearch.primaries.merges.total.time": ("gauge", "_all.primaries.merges.total_time_in_millis", lambda v: float(v)/1000), - "elasticsearch.primaries.merges.total.docs": ("gauge", "_all.primaries.merges.total_docs"), - "elasticsearch.primaries.merges.total.size": ("gauge", "_all.primaries.merges.total_size_in_bytes"), - "elasticsearch.primaries.refresh.total": ("gauge", "_all.primaries.refresh.total"), - "elasticsearch.primaries.refresh.total.time": ("gauge", "_all.primaries.refresh.total_time_in_millis", lambda v: float(v)/1000), - "elasticsearch.primaries.flush.total": ("gauge", "_all.primaries.flush.total"), - "elasticsearch.primaries.flush.total.time": ("gauge", "_all.primaries.flush.total_time_in_millis", lambda v: float(v)/1000) - } - - STATS_METRICS = { # Metrics that are common to all Elasticsearch versions - "elasticsearch.docs.count": ("gauge", "indices.docs.count"), - "elasticsearch.docs.deleted": ("gauge", "indices.docs.deleted"), - "elasticsearch.store.size": ("gauge", "indices.store.size_in_bytes"), - "elasticsearch.indexing.index.total": ("gauge", "indices.indexing.index_total"), - "elasticsearch.indexing.index.time": ("gauge", "indices.indexing.index_time_in_millis", lambda v: float(v)/1000), - "elasticsearch.indexing.index.current": ("gauge", "indices.indexing.index_current"), - "elasticsearch.indexing.delete.total": ("gauge", "indices.indexing.delete_total"), - "elasticsearch.indexing.delete.time": ("gauge", "indices.indexing.delete_time_in_millis", lambda v: float(v)/1000), - "elasticsearch.indexing.delete.current": ("gauge", "indices.indexing.delete_current"), - "elasticsearch.get.total": ("gauge", 
"indices.get.total"), - "elasticsearch.get.time": ("gauge", "indices.get.time_in_millis", lambda v: float(v)/1000), - "elasticsearch.get.current": ("gauge", "indices.get.current"), - "elasticsearch.get.exists.total": ("gauge", "indices.get.exists_total"), - "elasticsearch.get.exists.time": ("gauge", "indices.get.exists_time_in_millis", lambda v: float(v)/1000), - "elasticsearch.get.missing.total": ("gauge", "indices.get.missing_total"), - "elasticsearch.get.missing.time": ("gauge", "indices.get.missing_time_in_millis", lambda v: float(v)/1000), - "elasticsearch.search.query.total": ("gauge", "indices.search.query_total"), - "elasticsearch.search.query.time": ("gauge", "indices.search.query_time_in_millis", lambda v: float(v)/1000), - "elasticsearch.search.query.current": ("gauge", "indices.search.query_current"), - "elasticsearch.search.fetch.total": ("gauge", "indices.search.fetch_total"), - "elasticsearch.search.fetch.time": ("gauge", "indices.search.fetch_time_in_millis", lambda v: float(v)/1000), - "elasticsearch.search.fetch.current": ("gauge", "indices.search.fetch_current"), - "elasticsearch.merges.current": ("gauge", "indices.merges.current"), - "elasticsearch.merges.current.docs": ("gauge", "indices.merges.current_docs"), - "elasticsearch.merges.current.size": ("gauge", "indices.merges.current_size_in_bytes"), - "elasticsearch.merges.total": ("gauge", "indices.merges.total"), - "elasticsearch.merges.total.time": ("gauge", "indices.merges.total_time_in_millis", lambda v: float(v)/1000), - "elasticsearch.merges.total.docs": ("gauge", "indices.merges.total_docs"), - "elasticsearch.merges.total.size": ("gauge", "indices.merges.total_size_in_bytes"), - "elasticsearch.refresh.total": ("gauge", "indices.refresh.total"), - "elasticsearch.refresh.total.time": ("gauge", "indices.refresh.total_time_in_millis", lambda v: float(v)/1000), - "elasticsearch.flush.total": ("gauge", "indices.flush.total"), - "elasticsearch.flush.total.time": ("gauge", "indices.flush.total_time_in_millis", lambda v: float(v)/1000), - "elasticsearch.process.open_fd": ("gauge", "process.open_file_descriptors"), - "elasticsearch.transport.rx_count": ("gauge", "transport.rx_count"), - "elasticsearch.transport.tx_count": ("gauge", "transport.tx_count"), - "elasticsearch.transport.rx_size": ("gauge", "transport.rx_size_in_bytes"), - "elasticsearch.transport.tx_size": ("gauge", "transport.tx_size_in_bytes"), - "elasticsearch.transport.server_open": ("gauge", "transport.server_open"), - "elasticsearch.thread_pool.bulk.active": ("gauge", "thread_pool.bulk.active"), - "elasticsearch.thread_pool.bulk.threads": ("gauge", "thread_pool.bulk.threads"), - "elasticsearch.thread_pool.bulk.queue": ("gauge", "thread_pool.bulk.queue"), - "elasticsearch.thread_pool.flush.active": ("gauge", "thread_pool.flush.active"), - "elasticsearch.thread_pool.flush.threads": ("gauge", "thread_pool.flush.threads"), - "elasticsearch.thread_pool.flush.queue": ("gauge", "thread_pool.flush.queue"), - "elasticsearch.thread_pool.generic.active": ("gauge", "thread_pool.generic.active"), - "elasticsearch.thread_pool.generic.threads": ("gauge", "thread_pool.generic.threads"), - "elasticsearch.thread_pool.generic.queue": ("gauge", "thread_pool.generic.queue"), - "elasticsearch.thread_pool.get.active": ("gauge", "thread_pool.get.active"), - "elasticsearch.thread_pool.get.threads": ("gauge", "thread_pool.get.threads"), - "elasticsearch.thread_pool.get.queue": ("gauge", "thread_pool.get.queue"), - "elasticsearch.thread_pool.index.active": ("gauge", 
"thread_pool.index.active"), - "elasticsearch.thread_pool.index.threads": ("gauge", "thread_pool.index.threads"), - "elasticsearch.thread_pool.index.queue": ("gauge", "thread_pool.index.queue"), - "elasticsearch.thread_pool.management.active": ("gauge", "thread_pool.management.active"), - "elasticsearch.thread_pool.management.threads": ("gauge", "thread_pool.management.threads"), - "elasticsearch.thread_pool.management.queue": ("gauge", "thread_pool.management.queue"), - "elasticsearch.thread_pool.merge.active": ("gauge", "thread_pool.merge.active"), - "elasticsearch.thread_pool.merge.threads": ("gauge", "thread_pool.merge.threads"), - "elasticsearch.thread_pool.merge.queue": ("gauge", "thread_pool.merge.queue"), - "elasticsearch.thread_pool.percolate.active": ("gauge", "thread_pool.percolate.active"), - "elasticsearch.thread_pool.percolate.threads": ("gauge", "thread_pool.percolate.threads"), - "elasticsearch.thread_pool.percolate.queue": ("gauge", "thread_pool.percolate.queue"), - "elasticsearch.thread_pool.refresh.active": ("gauge", "thread_pool.refresh.active"), - "elasticsearch.thread_pool.refresh.threads": ("gauge", "thread_pool.refresh.threads"), - "elasticsearch.thread_pool.refresh.queue": ("gauge", "thread_pool.refresh.queue"), - "elasticsearch.thread_pool.search.active": ("gauge", "thread_pool.search.active"), - "elasticsearch.thread_pool.search.threads": ("gauge", "thread_pool.search.threads"), - "elasticsearch.thread_pool.search.queue": ("gauge", "thread_pool.search.queue"), - "elasticsearch.thread_pool.snapshot.active": ("gauge", "thread_pool.snapshot.active"), - "elasticsearch.thread_pool.snapshot.threads": ("gauge", "thread_pool.snapshot.threads"), - "elasticsearch.thread_pool.snapshot.queue": ("gauge", "thread_pool.snapshot.queue"), - "elasticsearch.http.current_open": ("gauge", "http.current_open"), - "elasticsearch.http.total_opened": ("gauge", "http.total_opened"), - "jvm.mem.heap_committed": ("gauge", "jvm.mem.heap_committed_in_bytes"), - "jvm.mem.heap_used": ("gauge", "jvm.mem.heap_used_in_bytes"), - "jvm.mem.heap_in_use": ("gauge", "jvm.mem.heap_used_percent"), - "jvm.mem.heap_max": ("gauge", "jvm.mem.heap_max_in_bytes"), - "jvm.mem.non_heap_committed": ("gauge", "jvm.mem.non_heap_committed_in_bytes"), - "jvm.mem.non_heap_used": ("gauge", "jvm.mem.non_heap_used_in_bytes"), - "jvm.threads.count": ("gauge", "jvm.threads.count"), - "jvm.threads.peak_count": ("gauge", "jvm.threads.peak_count"), - } - - JVM_METRICS_POST_0_90_10 = { - "jvm.gc.collectors.young.count": ("gauge", "jvm.gc.collectors.young.collection_count"), - "jvm.gc.collectors.young.collection_time": ("gauge", "jvm.gc.collectors.young.collection_time_in_millis", lambda v: float(v)/1000), - "jvm.gc.collectors.old.count": ("gauge", "jvm.gc.collectors.old.collection_count"), - "jvm.gc.collectors.old.collection_time": ("gauge", "jvm.gc.collectors.old.collection_time_in_millis", lambda v: float(v)/1000) - } - - JVM_METRICS_PRE_0_90_10 = { - "jvm.gc.concurrent_mark_sweep.count": ("gauge", "jvm.gc.collectors.ConcurrentMarkSweep.collection_count"), - "jvm.gc.concurrent_mark_sweep.collection_time": ("gauge", "jvm.gc.collectors.ConcurrentMarkSweep.collection_time_in_millis", lambda v: float(v)/1000), - "jvm.gc.par_new.count": ("gauge", "jvm.gc.collectors.ParNew.collection_count"), - "jvm.gc.par_new.collection_time": ("gauge", "jvm.gc.collectors.ParNew.collection_time_in_millis", lambda v: float(v)/1000), - "jvm.gc.collection_count": ("gauge", "jvm.gc.collection_count"), - "jvm.gc.collection_time": ("gauge", 
"jvm.gc.collection_time_in_millis", lambda v: float(v)/1000), - } - - ADDITIONAL_METRICS_POST_0_90_5 = { - "elasticsearch.search.fetch.open_contexts": ("gauge", "indices.search.open_contexts"), - "elasticsearch.cache.filter.evictions": ("gauge", "indices.filter_cache.evictions"), - "elasticsearch.cache.filter.size": ("gauge", "indices.filter_cache.memory_size_in_bytes"), - "elasticsearch.id_cache.size": ("gauge", "indices.id_cache.memory_size_in_bytes"), - "elasticsearch.fielddata.size": ("gauge", "indices.fielddata.memory_size_in_bytes"), - "elasticsearch.fielddata.evictions": ("gauge", "indices.fielddata.evictions"), - } - - ADDITIONAL_METRICS_PRE_0_90_5 = { - "elasticsearch.cache.field.evictions": ("gauge", "indices.cache.field_evictions"), - "elasticsearch.cache.field.size": ("gauge", "indices.cache.field_size_in_bytes"), - "elasticsearch.cache.filter.count": ("gauge", "indices.cache.filter_count"), - "elasticsearch.cache.filter.evictions": ("gauge", "indices.cache.filter_evictions"), - "elasticsearch.cache.filter.size": ("gauge", "indices.cache.filter_size_in_bytes"), - } - - CLUSTER_HEALTH_METRICS = { - "elasticsearch.number_of_nodes": ("gauge", "number_of_nodes"), - "elasticsearch.number_of_data_nodes": ("gauge", "number_of_data_nodes"), - "elasticsearch.active_primary_shards": ("gauge", "active_primary_shards"), - "elasticsearch.active_shards": ("gauge", "active_shards"), - "elasticsearch.relocating_shards": ("gauge", "relocating_shards"), - "elasticsearch.initializing_shards": ("gauge", "initializing_shards"), - "elasticsearch.unassigned_shards": ("gauge", "unassigned_shards"), - "elasticsearch.cluster_status": ("gauge", "status", lambda v: {"red": 0, "yellow": 1, "green": 2}.get(v, -1)), - } - - CLUSTER_PENDING_TASKS = { - "elasticsearch.pending_tasks_total": ("gauge", "pending_task_total"), - "elasticsearch.pending_tasks_priority_high": ("gauge", "pending_tasks_priority_high"), - "elasticsearch.pending_tasks_priority_urgent": ("gauge", "pending_tasks_priority_urgent") - } - - SOURCE_TYPE_NAME = 'elasticsearch' - - def __init__(self, name, init_config, agentConfig, instances=None): - AgentCheck.__init__(self, name, init_config, agentConfig, instances) - - # Host status needs to persist across all checks - self.cluster_status = {} - - def get_instance_config(self, instance): - url = instance.get('url') - if url is None: - raise Exception("An url must be specified in the instance") - - pshard_stats = _is_affirmative(instance.get('pshard_stats', False)) - - cluster_stats = _is_affirmative(instance.get('cluster_stats', False)) - if 'is_external' in instance: - cluster_stats = _is_affirmative(instance.get('is_external', False)) - - # Support URLs that have a path in them from the config, for - # backwards-compatibility. 
- parsed = urlparse.urlparse(url) - if parsed[2] != "": - url = "%s://%s" % (parsed[0], parsed[1]) - port = parsed.port - host = parsed.hostname - service_check_tags = [ - 'host:%s' % host, - 'port:%s' % port - ] - - # Tag by URL so we can differentiate the metrics - # from multiple instances - tags = ['url:%s' % url] - tags.extend(instance.get('tags', [])) - - timeout = instance.get('timeout') or self.DEFAULT_TIMEOUT - - config = ESInstanceConfig( - pshard_stats=pshard_stats, - cluster_stats=cluster_stats, - password=instance.get('password'), - service_check_tags=service_check_tags, - tags=tags, - timeout=timeout, - url=url, - username=instance.get('username') - ) - return config - - def check(self, instance): - config = self.get_instance_config(instance) - - # Check ES version for this instance and define parameters - # (URLs and metrics) accordingly - version = self._get_es_version(config) - - health_url, nodes_url, stats_url, pshard_stats_url, pending_tasks_url, stats_metrics, \ - pshard_stats_metrics = self._define_params(version, config.cluster_stats) - - # Load clusterwise data - if config.pshard_stats: - pshard_stats_url = urlparse.urljoin(config.url, pshard_stats_url) - pshard_stats_data = self._get_data(pshard_stats_url, config) - self._process_pshard_stats_data(pshard_stats_data, config, pshard_stats_metrics) - - # Load stats data. - stats_url = urlparse.urljoin(config.url, stats_url) - stats_data = self._get_data(stats_url, config) - self._process_stats_data(nodes_url, stats_data, stats_metrics, config) - - # Load the health data. - health_url = urlparse.urljoin(config.url, health_url) - health_data = self._get_data(health_url, config) - self._process_health_data(health_data, config) - - # Load the pending_tasks data. - pending_tasks_url = urlparse.urljoin(config.url, pending_tasks_url) - pending_tasks_data = self._get_data(pending_tasks_url, config) - self._process_pending_tasks_data(pending_tasks_data, config) - - # If we're here we did not have any ES conn issues - self.service_check( - self.SERVICE_CHECK_CONNECT_NAME, - AgentCheck.OK, - tags=config.service_check_tags - ) - - def _get_es_version(self, config): - """ Get the running version of elasticsearch. - """ - try: - data = self._get_data(config.url, config, send_sc=False) - version = map(int, data['version']['number'].split('.')[0:3]) - except Exception, e: - self.warning( - "Error while trying to get Elasticsearch version " - "from %s %s" - % (config.url, str(e)) - ) - version = [1, 0, 0] - - self.service_metadata('version', version) - self.log.debug("Elasticsearch version is %s" % version) - return version - - def _define_params(self, version, cluster_stats): - """ Define the set of URLs and METRICS to use depending on the - running ES version. - """ - - pshard_stats_url = "/_stats" - - if version >= [0, 90, 10]: - # ES versions 0.90.10 and above - health_url = "/_cluster/health?pretty=true" - nodes_url = "/_nodes?network=true" - pending_tasks_url = "/_cluster/pending_tasks?pretty=true" - - # For "external" clusters, we want to collect from all nodes. 
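# --- Illustrative aside, not part of the original check: the `version >= [0, 90, 10]`
# --- gates in this method rely on Python's lexicographic list comparison, since
# --- _get_es_version() returns the version as a list of ints. For example:
#
#   [1, 4, 2]  >= [0, 90, 10]   # True  - 1 > 0, so 1.x clusters take the newer branch
#   [0, 90, 5] >= [0, 90, 10]   # False - equal prefix, 5 < 10, so 0.90.5 falls through
#   [0, 90, 5] >= [0, 90, 5]    # True  - 0.90.5 itself still gets the post-0.90.5 metrics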
- if cluster_stats: - stats_url = "/_nodes/stats?all=true" - else: - stats_url = "/_nodes/_local/stats?all=true" - - additional_metrics = self.JVM_METRICS_POST_0_90_10 - else: - health_url = "/_cluster/health?pretty=true" - nodes_url = "/_cluster/nodes?network=true" - pending_tasks_url = None - if cluster_stats: - stats_url = "/_cluster/nodes/stats?all=true" - else: - stats_url = "/_cluster/nodes/_local/stats?all=true" - - additional_metrics = self.JVM_METRICS_PRE_0_90_10 - - stats_metrics = dict(self.STATS_METRICS) - stats_metrics.update(additional_metrics) - - if version >= [0, 90, 5]: - # ES versions 0.90.5 and above - additional_metrics = self.ADDITIONAL_METRICS_POST_0_90_5 - else: - # ES version 0.90.4 and below - additional_metrics = self.ADDITIONAL_METRICS_PRE_0_90_5 - - stats_metrics.update(additional_metrics) - - # Version specific stats metrics about the primary shards - pshard_stats_metrics = dict(self.PRIMARY_SHARD_METRICS) - - if version >= [1, 0, 0]: - additional_metrics = self.PRIMARY_SHARD_METRICS_POST_1_0 - - pshard_stats_metrics.update(additional_metrics) - - return health_url, nodes_url, stats_url, pshard_stats_url, pending_tasks_url, \ - stats_metrics, pshard_stats_metrics - - def _get_data(self, url, config, send_sc=True): - """ Hit a given URL and return the parsed json - """ - # Load basic authentication configuration, if available. - if config.username and config.password: - auth = (config.username, config.password) - else: - auth = None - - try: - resp = requests.get( - url, - timeout=config.timeout, - headers=headers(self.agentConfig), - auth=auth - ) - resp.raise_for_status() - except Exception as e: - if send_sc: - self.service_check( - self.SERVICE_CHECK_CONNECT_NAME, - AgentCheck.CRITICAL, - message="Error {0} when hitting {1}".format(e, url), - tags=config.service_check_tags - ) - raise - - return resp.json() - - def _process_pending_tasks_data(self, data, config): - p_tasks = defaultdict(int) - - for task in data.get('tasks', []): - p_tasks[task.get('priority')] += 1 - - node_data = { - 'pending_task_total': sum(p_tasks.values()), - 'pending_tasks_priority_high': p_tasks['high'], - 'pending_tasks_priority_urgent': p_tasks['urgent'], - } - - for metric in self.CLUSTER_PENDING_TASKS: - # metric description - desc = self.CLUSTER_PENDING_TASKS[metric] - self._process_metric(node_data, metric, *desc, tags=config.tags) - - def _process_stats_data(self, nodes_url, data, stats_metrics, config): - cluster_stats = config.cluster_stats - for node_name in data['nodes']: - node_data = data['nodes'][node_name] - # On newer version of ES it's "host" not "hostname" - node_hostname = node_data.get( - 'hostname', node_data.get('host', None)) - - # Override the metric hostname if we're hitting an external cluster - metric_hostname = node_hostname if cluster_stats else None - - for metric, desc in stats_metrics.iteritems(): - self._process_metric( - node_data, metric, *desc, tags=config.tags, - hostname=metric_hostname) - - def _process_pshard_stats_data(self, data, config, pshard_stats_metrics): - for metric, desc in pshard_stats_metrics.iteritems(): - self._process_metric(data, metric, *desc, tags=config.tags) - - def _process_metric(self, data, metric, xtype, path, xform=None, - tags=None, hostname=None): - """data: dictionary containing all the stats - metric: datadog metric - path: corresponding path in data, flattened, e.g. 
thread_pool.bulk.queue - xfom: a lambda to apply to the numerical value - """ - value = data - - # Traverse the nested dictionaries - for key in path.split('.'): - if value is not None: - value = value.get(key, None) - else: - break - - if value is not None: - if xform: - value = xform(value) - if xtype == "gauge": - self.gauge(metric, value, tags=tags, hostname=hostname) - else: - self.rate(metric, value, tags=tags, hostname=hostname) - else: - self._metric_not_found(metric, path) - - def _process_health_data(self, data, config): - if self.cluster_status.get(config.url) is None: - self.cluster_status[config.url] = data['status'] - if data['status'] in ["yellow", "red"]: - event = self._create_event(data['status'], tags=config.tags) - self.event(event) - - if data['status'] != self.cluster_status.get(config.url): - self.cluster_status[config.url] = data['status'] - event = self._create_event(data['status'], tags=config.tags) - self.event(event) - - for metric, desc in self.CLUSTER_HEALTH_METRICS.iteritems(): - self._process_metric(data, metric, *desc, tags=config.tags) - - # Process the service check - cluster_status = data['status'] - if cluster_status == 'green': - status = AgentCheck.OK - data['tag'] = "OK" - elif cluster_status == 'yellow': - status = AgentCheck.WARNING - data['tag'] = "WARN" - else: - status = AgentCheck.CRITICAL - data['tag'] = "ALERT" - - msg = "{tag} on cluster \"{cluster_name}\" "\ - "| active_shards={active_shards} "\ - "| initializing_shards={initializing_shards} "\ - "| relocating_shards={relocating_shards} "\ - "| unassigned_shards={unassigned_shards} "\ - "| timed_out={timed_out}" \ - .format(**data) - - self.service_check( - self.SERVICE_CHECK_CLUSTER_STATUS, - status, - message=msg, - tags=config.service_check_tags - ) - - def _metric_not_found(self, metric, path): - self.log.debug("Metric not found: %s -> %s", path, metric) - - def _create_event(self, status, tags=None): - hostname = self.hostname.decode('utf-8') - if status == "red": - alert_type = "error" - msg_title = "%s is %s" % (hostname, status) - - elif status == "yellow": - alert_type = "warning" - msg_title = "%s is %s" % (hostname, status) - - else: - # then it should be green - alert_type = "success" - msg_title = "%s recovered as %s" % (hostname, status) - - msg = "ElasticSearch: %s just reported as %s" % (hostname, status) - - return { - 'timestamp': int(time.time()), - 'event_type': 'elasticsearch', - 'host': hostname, - 'msg_text': msg, - 'msg_title': msg_title, - 'alert_type': alert_type, - 'source_type_name': "elasticsearch", - 'event_object': hostname, - 'tags': tags - } diff --git a/py/checks/etcd.py b/py/checks/etcd.py deleted file mode 100644 index b21266cb11cd..000000000000 --- a/py/checks/etcd.py +++ /dev/null @@ -1,178 +0,0 @@ -# 3rd party -import requests - -# project -from checks import AgentCheck -from config import _is_affirmative -from util import headers - - -class Etcd(AgentCheck): - - DEFAULT_TIMEOUT = 5 - - SERVICE_CHECK_NAME = 'etcd.can_connect' - - STORE_RATES = { - 'getsSuccess': 'etcd.store.gets.success', - 'getsFail': 'etcd.store.gets.fail', - 'setsSuccess': 'etcd.store.sets.success', - 'setsFail': 'etcd.store.sets.fail', - 'deleteSuccess': 'etcd.store.delete.success', - 'deleteFail': 'etcd.store.delete.fail', - 'updateSuccess': 'etcd.store.update.success', - 'updateFail': 'etcd.store.update.fail', - 'createSuccess': 'etcd.store.create.success', - 'createFail': 'etcd.store.create.fail', - 'compareAndSwapSuccess': 'etcd.store.compareandswap.success', - 
'compareAndSwapFail': 'etcd.store.compareandswap.fail', - 'compareAndDeleteSuccess': 'etcd.store.compareanddelete.success', - 'compareAndDeleteFail': 'etcd.store.compareanddelete.fail', - 'expireCount': 'etcd.store.expire.count' - } - - STORE_GAUGES = { - 'watchers': 'etcd.store.watchers' - } - - SELF_GAUGES = { - 'sendPkgRate': 'etcd.self.send.pkgrate', - 'sendBandwidthRate': 'etcd.self.send.bandwidthrate', - 'recvPkgRate': 'etcd.self.recv.pkgrate', - 'recvBandwidthRate': 'etcd.self.recv.bandwidthrate' - } - - SELF_RATES = { - 'recvAppendRequestCnt': 'etcd.self.recv.appendrequest.count', - 'sendAppendRequestCnt': 'etcd.self.send.appendrequest.count' - } - - LEADER_COUNTS = { - # Rates - 'fail': 'etcd.leader.counts.fail', - 'success': 'etcd.leader.counts.success', - } - - LEADER_LATENCY = { - # Gauges - 'current': 'etcd.leader.latency.current', - 'average': 'etcd.leader.latency.avg', - 'minimum': 'etcd.leader.latency.min', - 'maximum': 'etcd.leader.latency.max', - 'standardDeviation': 'etcd.leader.latency.stddev', - } - - def check(self, instance): - if 'url' not in instance: - raise Exception('etcd instance missing "url" value.') - - # Load values from the instance config - url = instance['url'] - instance_tags = instance.get('tags', []) - - # Load the ssl configuration - ssl_params = { - 'ssl_keyfile': instance.get('ssl_keyfile'), - 'ssl_certfile': instance.get('ssl_certfile'), - 'ssl_cert_validation': _is_affirmative(instance.get('ssl_cert_validation', True)), - 'ssl_ca_certs': instance.get('ssl_ca_certs'), - } - - for key, param in ssl_params.items(): - if param is None: - del ssl_params[key] - # Append the instance's URL in case there are more than one, that - # way they can tell the difference! - instance_tags.append("url:{0}".format(url)) - timeout = float(instance.get('timeout', self.DEFAULT_TIMEOUT)) - is_leader = False - - # Gather self metrics - self_response = self._get_self_metrics(url, ssl_params, timeout) - if self_response is not None: - if self_response['state'] == 'StateLeader': - is_leader = True - instance_tags.append('etcd_state:leader') - else: - instance_tags.append('etcd_state:follower') - - for key in self.SELF_RATES: - if key in self_response: - self.rate(self.SELF_RATES[key], self_response[key], tags=instance_tags) - else: - self.log.warn("Missing key {0} in stats.".format(key)) - - for key in self.SELF_GAUGES: - if key in self_response: - self.gauge(self.SELF_GAUGES[key], self_response[key], tags=instance_tags) - else: - self.log.warn("Missing key {0} in stats.".format(key)) - - # Gather store metrics - store_response = self._get_store_metrics(url, ssl_params, timeout) - if store_response is not None: - for key in self.STORE_RATES: - if key in store_response: - self.rate(self.STORE_RATES[key], store_response[key], tags=instance_tags) - else: - self.log.warn("Missing key {0} in stats.".format(key)) - - for key in self.STORE_GAUGES: - if key in store_response: - self.gauge(self.STORE_GAUGES[key], store_response[key], tags=instance_tags) - else: - self.log.warn("Missing key {0} in stats.".format(key)) - - # Gather leader metrics - if is_leader: - leader_response = self._get_leader_metrics(url, ssl_params, timeout) - if leader_response is not None and len(leader_response.get("followers", {})) > 0: - # Get the followers - followers = leader_response.get("followers") - for fol in followers: - # counts - for key in self.LEADER_COUNTS: - self.rate(self.LEADER_COUNTS[key], - followers[fol].get("counts").get(key), - tags=instance_tags + ['follower:{0}'.format(fol)]) - # 
latency - for key in self.LEADER_LATENCY: - self.gauge(self.LEADER_LATENCY[key], - followers[fol].get("latency").get(key), - tags=instance_tags + ['follower:{0}'.format(fol)]) - - # Service check - if self_response is not None and store_response is not None: - self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.OK, - tags=["url:{0}".format(url)]) - - def _get_self_metrics(self, url, ssl_params, timeout): - return self._get_json(url + "/v2/stats/self", ssl_params, timeout) - - def _get_store_metrics(self, url, ssl_params, timeout): - return self._get_json(url + "/v2/stats/store", ssl_params, timeout) - - def _get_leader_metrics(self, url, ssl_params, timeout): - return self._get_json(url + "/v2/stats/leader", ssl_params, timeout) - - def _get_json(self, url, ssl_params, timeout): - try: - certificate = None - if 'ssl_certfile' in ssl_params and 'ssl_keyfile' in ssl_params: - certificate = (ssl_params['ssl_certfile'], ssl_params['ssl_keyfile']) - verify = ssl_params.get('ssl_ca_certs', True) if ssl_params['ssl_cert_validation'] else False - r = requests.get(url, verify=verify, cert=certificate, timeout=timeout, headers=headers(self.agentConfig)) - except requests.exceptions.Timeout: - # If there's a timeout - self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, - message="Timeout when hitting %s" % url, - tags=["url:{0}".format(url)]) - raise - - if r.status_code != 200: - self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, - message="Got %s when hitting %s" % (r.status_code, url), - tags=["url:{0}".format(url)]) - raise Exception("Http status code {0} on url {1}".format(r.status_code, url)) - - return r.json() diff --git a/py/checks/fluentd.py b/py/checks/fluentd.py deleted file mode 100644 index 915eaa491e3b..000000000000 --- a/py/checks/fluentd.py +++ /dev/null @@ -1,61 +0,0 @@ -# stdlib -import urlparse - -# 3rd party -import requests - -# project -from checks import AgentCheck -from util import headers - - -class Fluentd(AgentCheck): - SERVICE_CHECK_NAME = 'fluentd.is_ok' - GAUGES = ['retry_count', 'buffer_total_queued_size', 'buffer_queue_length'] - _AVAILABLE_TAGS = frozenset(['plugin_id', 'type']) - - """Tracks basic fluentd metrics via the monitor_agent plugin - * number of retry_count - * number of buffer_queue_length - * number of buffer_total_queued_size - - $ curl http://localhost:24220/api/plugins.json - {"plugins":[{"type": "monitor_agent", ...}, {"type": "forward", ...}]} - """ - def check(self, instance): - if 'monitor_agent_url' not in instance: - raise Exception('Fluentd instance missing "monitor_agent_url" value.') - - try: - url = instance.get('monitor_agent_url') - plugin_ids = instance.get('plugin_ids', []) - - # Fallback with `tag_by: plugin_id` - tag_by = instance.get('tag_by') - tag_by = tag_by if tag_by in self._AVAILABLE_TAGS else 'plugin_id' - - parsed_url = urlparse.urlparse(url) - monitor_agent_host = parsed_url.hostname - monitor_agent_port = parsed_url.port or 24220 - service_check_tags = ['fluentd_host:%s' % monitor_agent_host, 'fluentd_port:%s' - % monitor_agent_port] - - r = requests.get(url, headers=headers(self.agentConfig)) - r.raise_for_status() - status = r.json() - - for p in status['plugins']: - tag = "%s:%s" % (tag_by, p.get(tag_by)) - for m in self.GAUGES: - if p.get(m) is None: - continue - # Filter unspecified plugins to keep backward compatibility. 
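# --- Illustrative aside, not part of the original check: a typical plugin entry
# --- returned by fluentd's monitor_agent endpoint (field values invented here),
# --- and the gauges the loop below would emit for it with the default tag_by:
#
#   {"plugin_id": "object:3feb368cb0", "type": "forward",
#    "retry_count": 0, "buffer_queue_length": 2, "buffer_total_queued_size": 24}
#
#   fluentd.retry_count               0    ['plugin_id:object:3feb368cb0']
#   fluentd.buffer_total_queued_size  24   ['plugin_id:object:3feb368cb0']
#   fluentd.buffer_queue_length       2    ['plugin_id:object:3feb368cb0']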
- if len(plugin_ids) == 0 or p.get('plugin_id') in plugin_ids: - self.gauge('fluentd.%s' % (m), p.get(m), [tag]) - except Exception, e: - msg = "No stats could be retrieved from %s : %s" % (url, str(e)) - self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, - tags=service_check_tags, message=msg) - raise - else: - self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.OK, tags=service_check_tags) diff --git a/py/checks/gearmand.py b/py/checks/gearmand.py deleted file mode 100644 index 8b88ff61a139..000000000000 --- a/py/checks/gearmand.py +++ /dev/null @@ -1,75 +0,0 @@ -# 3rd party -import gearman - -# project -from checks import AgentCheck - -class Gearman(AgentCheck): - SERVICE_CHECK_NAME = 'gearman.can_connect' - - def get_library_versions(self): - return {"gearman": gearman.__version__} - - def _get_client(self,host,port): - self.log.debug("Connecting to gearman at address %s:%s" % (host, port)) - return gearman.GearmanAdminClient(["%s:%s" % - (host, port)]) - - def _get_metrics(self, client, tags): - data = client.get_status() - running = 0 - queued = 0 - workers = 0 - - for stat in data: - running += stat['running'] - queued += stat['queued'] - workers += stat['workers'] - - unique_tasks = len(data) - - self.gauge("gearman.unique_tasks", unique_tasks, tags=tags) - self.gauge("gearman.running", running, tags=tags) - self.gauge("gearman.queued", queued, tags=tags) - self.gauge("gearman.workers", workers, tags=tags) - - self.log.debug("running %d, queued %d, unique tasks %d, workers: %d" - % (running, queued, unique_tasks, workers)) - - def _get_conf(self, instance): - host = instance.get('server', None) - port = instance.get('port', None) - - if host is None: - self.warning("Host not set, assuming 127.0.0.1") - host = "127.0.0.1" - - if port is None: - self.warning("Port is not set, assuming 4730") - port = 4730 - - tags = instance.get('tags', []) - - return host, port, tags - - def check(self, instance): - self.log.debug("Gearman check start") - - host, port, tags = self._get_conf(instance) - service_check_tags = ["server:{0}".format(host), - "port:{0}".format(port)] - - client = self._get_client(host, port) - self.log.debug("Connected to gearman") - - tags += service_check_tags - - try: - self._get_metrics(client, tags) - self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.OK, - message="Connection to %s:%s succeeded." % (host, port), - tags=service_check_tags) - except Exception as e: - self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, - message=str(e), tags=service_check_tags) - raise diff --git a/py/checks/gunicorn.py b/py/checks/gunicorn.py deleted file mode 100644 index 570d3cd95469..000000000000 --- a/py/checks/gunicorn.py +++ /dev/null @@ -1,123 +0,0 @@ -""" -Collects metrics from the gunicorn web server. - -http://gunicorn.org/ -""" -# stdlib -import time - -# 3rd party -import psutil - -# project -from checks import AgentCheck - - -class GUnicornCheck(AgentCheck): - - # Config - PROC_NAME = 'proc_name' - - # Number of seconds to sleep between cpu time checks. - CPU_SLEEP_SECS = 0.1 - - # Worker state tags. - IDLE_TAGS = ["state:idle"] - WORKING_TAGS = ["state:working"] - SVC_NAME = "gunicorn.is_running" - - def get_library_versions(self): - return {"psutil": psutil.__version__} - - def check(self, instance): - """ Collect metrics for the given gunicorn instance. """ - self.log.debug("Running instance: %s", instance) - - # Validate the config. 
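# --- Illustrative aside, not from the original file: the minimal conf.d entry the
# --- validation below expects. "web1" is a hypothetical app name whose master
# --- process would appear in the process list as "gunicorn: master [web1]".
#
#   instances:
#     - proc_name: web1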
- if not instance or self.PROC_NAME not in instance: - raise GUnicornCheckError("instance must specify: %s" % self.PROC_NAME) - - # Load the gunicorn master procedure. - proc_name = instance.get(self.PROC_NAME) - master_proc = self._get_master_proc_by_name(proc_name) - - # Fetch the worker procs and count their states. - worker_procs = master_proc.get_children() - working, idle = self._count_workers(worker_procs) - - # if no workers are running, alert CRITICAL, otherwise OK - msg = "%s working and %s idle workers for %s" % (working, idle, proc_name) - status = AgentCheck.CRITICAL if working == 0 and idle == 0 else AgentCheck.OK - - self.service_check(self.SVC_NAME, status, tags=['app:' + proc_name], message=msg) - - # Submit the data. - self.log.debug("instance %s procs - working:%s idle:%s" % (proc_name, working, idle)) - self.gauge("gunicorn.workers", working, self.WORKING_TAGS) - self.gauge("gunicorn.workers", idle, self.IDLE_TAGS) - - def _count_workers(self, worker_procs): - working = 0 - idle = 0 - - if not worker_procs: - return working, idle - - # Count how much sleep time is used by the workers. - cpu_time_by_pid = {} - for proc in worker_procs: - # cpu time is the sum of user + system time. - try: - cpu_time_by_pid[proc.pid] = sum(proc.get_cpu_times()) - except psutil.NoSuchProcess: - self.warning('Process %s disappeared while scanning' % proc.name) - continue - - # Let them do a little bit more work. - time.sleep(self.CPU_SLEEP_SECS) - - # Processes which have used more CPU are considered active (this is a very - # naive check, but gunicorn exposes no stats API) - for proc in worker_procs: - if proc.pid not in cpu_time_by_pid: - # The process is not running anymore, we didn't collect initial cpu times - continue - try: - cpu_time = sum(proc.get_cpu_times()) - except Exception: - # couldn't collect cpu time. assume it's dead. - self.log.debug("Couldn't collect cpu time for %s" % proc) - continue - if cpu_time == cpu_time_by_pid[proc.pid]: - idle += 1 - else: - working += 1 - - return working, idle - - def _get_master_proc_by_name(self, name): - """ Return a psutil process for the master gunicorn process with the given name. """ - master_name = GUnicornCheck._get_master_proc_name(name) - master_procs = [p for p in psutil.process_iter() if p.cmdline() and p.cmdline()[0] == master_name] - if len(master_procs) == 0: - # process not found, it's dead. - self.service_check(self.SVC_NAME, AgentCheck.CRITICAL, tags=['app:' + name], - message="No gunicorn process with name %s found" % name) - raise GUnicornCheckError("Found no master process with name: %s" % master_name) - elif len(master_procs) > 1: - raise GUnicornCheckError("Found more than one master process with name: %s" % master_name) - else: - return master_procs[0] - - @staticmethod - def _get_master_proc_name(name): - """ Return the name of the master gunicorn process for the given proc name. """ - # Here's an example of a process list for a gunicorn box with name web1 - # root 22976 0.1 0.1 60364 13424 ? Ss 19:30 0:00 gunicorn: master [web1] - # web 22984 20.7 2.3 521924 176136 ? Sl 19:30 1:58 gunicorn: worker [web1] - # web 22985 26.4 6.1 795288 449596 ? 
Sl 19:30 2:32 gunicorn: worker [web1] - return "gunicorn: master [%s]" % name - - -class GUnicornCheckError(Exception): - pass diff --git a/py/checks/haproxy.py b/py/checks/haproxy.py deleted file mode 100644 index 5ca3ed8ba9c3..000000000000 --- a/py/checks/haproxy.py +++ /dev/null @@ -1,438 +0,0 @@ -# stdlib -from collections import defaultdict -import re -import time - -# 3rd party -import requests - -# project -from checks import AgentCheck -from config import _is_affirmative -from util import headers - -STATS_URL = "/;csv;norefresh" -EVENT_TYPE = SOURCE_TYPE_NAME = 'haproxy' - - -class Services(object): - BACKEND = 'BACKEND' - FRONTEND = 'FRONTEND' - ALL = (BACKEND, FRONTEND) - ALL_STATUSES = ( - 'up', 'open', 'no check', 'down', 'maint', 'nolb' - ) - STATUSES_TO_SERVICE_CHECK = { - 'UP': AgentCheck.OK, - 'DOWN': AgentCheck.CRITICAL, - 'no check': AgentCheck.UNKNOWN, - 'MAINT': AgentCheck.OK, - } - - -class HAProxy(AgentCheck): - def __init__(self, name, init_config, agentConfig, instances=None): - AgentCheck.__init__(self, name, init_config, agentConfig, instances) - - # Host status needs to persist across all checks - self.host_status = defaultdict(lambda: defaultdict(lambda: None)) - - METRICS = { - "qcur": ("gauge", "queue.current"), - "scur": ("gauge", "session.current"), - "slim": ("gauge", "session.limit"), - "spct": ("gauge", "session.pct"), # Calculated as: (scur/slim)*100 - "stot": ("rate", "session.rate"), - "bin": ("rate", "bytes.in_rate"), - "bout": ("rate", "bytes.out_rate"), - "dreq": ("rate", "denied.req_rate"), - "dresp": ("rate", "denied.resp_rate"), - "ereq": ("rate", "errors.req_rate"), - "econ": ("rate", "errors.con_rate"), - "eresp": ("rate", "errors.resp_rate"), - "wretr": ("rate", "warnings.retr_rate"), - "wredis": ("rate", "warnings.redis_rate"), - "req_rate": ("gauge", "requests.rate"), # HA Proxy 1.4 and higher - "hrsp_1xx": ("rate", "response.1xx"), # HA Proxy 1.4 and higher - "hrsp_2xx": ("rate", "response.2xx"), # HA Proxy 1.4 and higher - "hrsp_3xx": ("rate", "response.3xx"), # HA Proxy 1.4 and higher - "hrsp_4xx": ("rate", "response.4xx"), # HA Proxy 1.4 and higher - "hrsp_5xx": ("rate", "response.5xx"), # HA Proxy 1.4 and higher - "hrsp_other": ("rate", "response.other"), # HA Proxy 1.4 and higher - "qtime": ("gauge", "queue.time"), # HA Proxy 1.5 and higher - "ctime": ("gauge", "connect.time"), # HA Proxy 1.5 and higher - "rtime": ("gauge", "response.time"), # HA Proxy 1.5 and higher - "ttime": ("gauge", "session.time"), # HA Proxy 1.5 and higher - } - - SERVICE_CHECK_NAME = 'haproxy.backend_up' - - def check(self, instance): - url = instance.get('url') - username = instance.get('username') - password = instance.get('password') - collect_aggregates_only = _is_affirmative( - instance.get('collect_aggregates_only', True) - ) - collect_status_metrics = _is_affirmative( - instance.get('collect_status_metrics', False) - ) - collect_status_metrics_by_host = _is_affirmative( - instance.get('collect_status_metrics_by_host', False) - ) - tag_service_check_by_host = _is_affirmative( - instance.get('tag_service_check_by_host', False) - ) - services_incl_filter = instance.get('services_include', []) - services_excl_filter = instance.get('services_exclude', []) - - self.log.debug('Processing HAProxy data for %s' % url) - - data = self._fetch_data(url, username, password) - - process_events = instance.get('status_check', self.init_config.get('status_check', False)) - - self._process_data( - data, collect_aggregates_only, process_events, - url=url, 
collect_status_metrics=collect_status_metrics, - collect_status_metrics_by_host=collect_status_metrics_by_host, - tag_service_check_by_host=tag_service_check_by_host, - services_incl_filter=services_incl_filter, - services_excl_filter=services_excl_filter - ) - - def _fetch_data(self, url, username, password): - ''' Hit a given URL and return the parsed json ''' - # Try to fetch data from the stats URL - - auth = (username, password) - url = "%s%s" % (url, STATS_URL) - - self.log.debug("HAProxy Fetching haproxy search data from: %s" % url) - - r = requests.get(url, auth=auth, headers=headers(self.agentConfig)) - r.raise_for_status() - - return r.content.splitlines() - - def _process_data(self, data, collect_aggregates_only, process_events, url=None, - collect_status_metrics=False, collect_status_metrics_by_host=False, - tag_service_check_by_host=False, services_incl_filter=None, - services_excl_filter=None): - ''' Main data-processing loop. For each piece of useful data, we'll - either save a metric, save an event or both. ''' - - # Split the first line into an index of fields - # The line looks like: - # "# pxname,svname,qcur,qmax,scur,smax,slim,stot,bin,bout,dreq,dresp,ereq,econ,eresp,wretr,wredis,status,weight,act,bck,chkfail,chkdown,lastchg,downtime,qlimit,pid,iid,sid,throttle,lbtot,tracked,type,rate,rate_lim,rate_max," - fields = [f.strip() for f in data[0][2:].split(',') if f] - - self.hosts_statuses = defaultdict(int) - - back_or_front = None - - # Skip the first line, go backwards to set back_or_front - for line in data[:0:-1]: - if not line.strip(): - continue - - # Store each line's values in a dictionary - data_dict = self._line_to_dict(fields, line) - - if self._is_aggregate(data_dict): - back_or_front = data_dict['svname'] - - self._update_data_dict(data_dict, back_or_front) - - self._update_hosts_statuses_if_needed( - collect_status_metrics, collect_status_metrics_by_host, - data_dict, self.hosts_statuses - ) - - if self._should_process(data_dict, collect_aggregates_only): - # update status - # Send the list of data to the metric and event callbacks - self._process_metrics( - data_dict, url, - services_incl_filter=services_incl_filter, - services_excl_filter=services_excl_filter - ) - if process_events: - self._process_event( - data_dict, url, - services_incl_filter=services_incl_filter, - services_excl_filter=services_excl_filter - ) - self._process_service_check( - data_dict, url, - tag_by_host=tag_service_check_by_host, - services_incl_filter=services_incl_filter, - services_excl_filter=services_excl_filter - ) - - if collect_status_metrics: - self._process_status_metric( - self.hosts_statuses, collect_status_metrics_by_host, - services_incl_filter=services_incl_filter, - services_excl_filter=services_excl_filter - ) - self._process_backend_hosts_metric( - self.hosts_statuses, - services_incl_filter=services_incl_filter, - services_excl_filter=services_excl_filter - ) - - return data - - def _line_to_dict(self, fields, line): - data_dict = {} - for i, val in enumerate(line.split(',')[:]): - if val: - try: - # Try converting to a long, if failure, just leave it - val = float(val) - except Exception: - pass - data_dict[fields[i]] = val - return data_dict - - def _update_data_dict(self, data_dict, back_or_front): - """ - Adds spct if relevant, adds service - """ - data_dict['back_or_front'] = back_or_front - # The percentage of used sessions based on 'scur' and 'slim' - if 'slim' in data_dict and 'scur' in data_dict: - try: - data_dict['spct'] = (data_dict['scur'] / 
data_dict['slim']) * 100 - except (TypeError, ZeroDivisionError): - pass - - def _is_aggregate(self, data_dict): - return data_dict['svname'] in Services.ALL - - def _update_hosts_statuses_if_needed(self, collect_status_metrics, - collect_status_metrics_by_host, - data_dict, hosts_statuses): - if data_dict['svname'] == Services.BACKEND: - return - if collect_status_metrics and 'status' in data_dict and 'pxname' in data_dict: - if collect_status_metrics_by_host and 'svname' in data_dict: - key = (data_dict['pxname'], data_dict['svname'], data_dict['status']) - else: - key = (data_dict['pxname'], data_dict['status']) - hosts_statuses[key] += 1 - - def _should_process(self, data_dict, collect_aggregates_only): - """ - if collect_aggregates_only, we process only the aggregates - else we process all except Services.BACKEND - """ - if collect_aggregates_only: - if self._is_aggregate(data_dict): - return True - return False - elif data_dict['svname'] == Services.BACKEND: - return False - return True - - def _is_service_excl_filtered(self, service_name, services_incl_filter, - services_excl_filter): - if self._tag_match_patterns(service_name, services_excl_filter): - if self._tag_match_patterns(service_name, services_incl_filter): - return False - return True - return False - - def _tag_match_patterns(self, tag, filters): - if not filters: - return False - for rule in filters: - if re.search(rule, tag): - return True - return False - - def _process_backend_hosts_metric(self, hosts_statuses, services_incl_filter=None, - services_excl_filter=None): - agg_statuses = defaultdict(lambda: {'available': 0, 'unavailable': 0}) - for host_status, count in hosts_statuses.iteritems(): - try: - service, hostname, status = host_status - except Exception: - service, status = host_status - - if self._is_service_excl_filtered(service, services_incl_filter, services_excl_filter): - continue - status = status.lower() - if 'up' in status: - agg_statuses[service]['available'] += count - elif 'down' in status or 'maint' in status or 'nolb' in status: - agg_statuses[service]['unavailable'] += count - else: - # create the entries for this service anyway - agg_statuses[service] - - for service in agg_statuses: - tags = ['service:%s' % service] - self.gauge( - 'haproxy.backend_hosts', - agg_statuses[service]['available'], - tags=tags + ['available:true']) - self.gauge( - 'haproxy.backend_hosts', - agg_statuses[service]['unavailable'], - tags=tags + ['available:false']) - return agg_statuses - - def _process_status_metric(self, hosts_statuses, collect_status_metrics_by_host, - services_incl_filter=None, services_excl_filter=None): - agg_statuses = defaultdict(lambda: {'available': 0, 'unavailable': 0}) - for host_status, count in hosts_statuses.iteritems(): - try: - service, hostname, status = host_status - except Exception: - service, status = host_status - status = status.lower() - - tags = ['service:%s' % service] - if self._is_service_excl_filtered(service, services_incl_filter, services_excl_filter): - continue - - if collect_status_metrics_by_host: - tags.append('backend:%s' % hostname) - self._gauge_all_statuses("haproxy.count_per_status", count, status, tags=tags) - - if 'up' in status or 'open' in status: - agg_statuses[service]['available'] += count - if 'down' in status or 'maint' in status or 'nolb' in status: - agg_statuses[service]['unavailable'] += count - - for service in agg_statuses: - for status, count in agg_statuses[service].iteritems(): - tags = ['status:%s' % status, 'service:%s' % service] - 
self.gauge("haproxy.count_per_status", count, tags=tags) - - def _gauge_all_statuses(self, metric_name, count, status, tags): - self.gauge(metric_name, count, tags + ['status:%s' % status]) - for state in Services.ALL_STATUSES: - if state != status: - self.gauge(metric_name, 0, tags + ['status:%s' % state.replace(" ", "_")]) - - def _process_metrics(self, data, url, services_incl_filter=None, - services_excl_filter=None): - """ - Data is a dictionary related to one host - (one line) extracted from the csv. - It should look like: - {'pxname':'dogweb', 'svname':'i-4562165', 'scur':'42', ...} - """ - hostname = data['svname'] - service_name = data['pxname'] - back_or_front = data['back_or_front'] - tags = ["type:%s" % back_or_front, "instance_url:%s" % url] - tags.append("service:%s" % service_name) - - if self._is_service_excl_filtered(service_name, services_incl_filter, - services_excl_filter): - return - - if back_or_front == Services.BACKEND: - tags.append('backend:%s' % hostname) - - for key, value in data.items(): - if HAProxy.METRICS.get(key): - suffix = HAProxy.METRICS[key][1] - name = "haproxy.%s.%s" % (back_or_front.lower(), suffix) - if HAProxy.METRICS[key][0] == 'rate': - self.rate(name, value, tags=tags) - else: - self.gauge(name, value, tags=tags) - - def _process_event(self, data, url, services_incl_filter=None, - services_excl_filter=None): - ''' - Main event processing loop. An event will be created for a service - status change. - Service checks on the server side can be used to provide the same functionality - ''' - hostname = data['svname'] - service_name = data['pxname'] - key = "%s:%s" % (hostname, service_name) - status = self.host_status[url][key] - - if self._is_service_excl_filtered(service_name, services_incl_filter, - services_excl_filter): - return - - if status is None: - self.host_status[url][key] = data['status'] - return - - if status != data['status'] and data['status'] in ('UP', 'DOWN'): - # If the status of a host has changed, we trigger an event - try: - lastchg = int(data['lastchg']) - except Exception: - lastchg = 0 - - # Create the event object - ev = self._create_event( - data['status'], hostname, lastchg, service_name, - data['back_or_front'] - ) - self.event(ev) - - # Store this host status so we can check against it later - self.host_status[url][key] = data['status'] - - def _create_event(self, status, hostname, lastchg, service_name, back_or_front): - HAProxy_agent = self.hostname.decode('utf-8') - if status == "DOWN": - alert_type = "error" - title = "%s reported %s:%s %s" % (HAProxy_agent, service_name, hostname, status) - else: - if status == "UP": - alert_type = "success" - else: - alert_type = "info" - title = "%s reported %s:%s back and %s" % (HAProxy_agent, service_name, hostname, status) - - tags = ["service:%s" % service_name] - if back_or_front == Services.BACKEND: - tags.append('backend:%s' % hostname) - return { - 'timestamp': int(time.time() - lastchg), - 'event_type': EVENT_TYPE, - 'host': HAProxy_agent, - 'msg_title': title, - 'alert_type': alert_type, - "source_type_name": SOURCE_TYPE_NAME, - "event_object": hostname, - "tags": tags - } - - def _process_service_check(self, data, url, tag_by_host=False, - services_incl_filter=None, services_excl_filter=None): - ''' Report a service check, tagged by the service and the backend. - Statuses are defined in `STATUSES_TO_SERVICE_CHECK` mapping. 
- ''' - service_name = data['pxname'] - status = data['status'] - haproxy_hostname = self.hostname.decode('utf-8') - check_hostname = haproxy_hostname if tag_by_host else '' - - if self._is_service_excl_filtered(service_name, services_incl_filter, - services_excl_filter): - return - - if status in Services.STATUSES_TO_SERVICE_CHECK: - service_check_tags = ["service:%s" % service_name] - hostname = data['svname'] - if data['back_or_front'] == Services.BACKEND: - service_check_tags.append('backend:%s' % hostname) - - status = Services.STATUSES_TO_SERVICE_CHECK[status] - message = "%s reported %s:%s %s" % (haproxy_hostname, service_name, - hostname, status) - self.service_check(self.SERVICE_CHECK_NAME, status, message=message, - hostname=check_hostname, tags=service_check_tags) diff --git a/py/checks/hdfs.py b/py/checks/hdfs.py deleted file mode 100644 index 0d8c6bde394b..000000000000 --- a/py/checks/hdfs.py +++ /dev/null @@ -1,91 +0,0 @@ -# 3rd party -import snakebite.client -import snakebite.version - -# project -from checks import AgentCheck - -# This is only available on snakebite >= 2.2.0 -# but snakebite 2.x is only compatible with hadoop >= 2.2.0 -# So we bundle snakebite 1.3.9 and let the possibility to upgrade to a newer version -# if people want to use HA Mode -try: - # FIXME: Can be remove when we upgrade pylint (drop py 2.6) - # pylint: disable=E0611 - from snakebite.namenode import Namenode -except ImportError: - Namenode = None - - -DEFAULT_PORT = 8020 - - -class HDFSCheck(AgentCheck): - """Report on free space and space used in HDFS. - """ - - def get_client(self, instance): - - if 'namenode' in instance: - # backward compatibility for old style configuration of that check - host, port = instance['namenode'], instance.get('port', DEFAULT_PORT) - return snakebite.client.Client(host, port) - - if type(instance['namenodes']) != list or len(instance['namenodes']) == 0: - raise ValueError('"namenodes parameter should be a list of dictionaries.') - - for namenode in instance['namenodes']: - if type(namenode) != dict: - raise ValueError('"namenodes parameter should be a list of dictionaries.') - - if "url" not in namenode: - raise ValueError('Each namenode should specify a "url" parameter.') - - if len(instance['namenodes']) == 1: - host, port = instance['namenodes'][0]['url'], instance['namenodes'][0].get('port', DEFAULT_PORT) - return snakebite.client.Client(host, port) - - else: - # We are running on HA mode - if Namenode is None: - # We are running snakebite 1.x which is not compatible with the HA mode - # Let's display a warning and use regular mode - self.warning("HA Mode is not available with snakebite < 2.2.0" - "Upgrade to the latest version of snakebiteby running: " - "sudo /opt/datadog-agent/embedded/bin/pip install --upgrade snakebite") - - host, port = instance['namenodes'][0]['url'], instance['namenodes'][0].get('port', DEFAULT_PORT) - return snakebite.client.Client(host, port) - else: - self.log.debug("Running in HA Mode") - nodes = [] - for namenode in instance['namenodes']: - nodes.append(Namenode(namenode['url'], namenode.get('port', DEFAULT_PORT))) - - return snakebite.client.HAClient(nodes) - - def check(self, instance): - if 'namenode' not in instance and 'namenodes' not in instance: - raise ValueError('Missing key \'namenode\' in HDFSCheck config') - - tags = instance.get('tags', None) - - hdfs = self.get_client(instance) - stats = hdfs.df() - # {'used': 2190859321781L, - # 'capacity': 76890897326080L, - # 'under_replicated': 0L, - # 'missing_blocks': 0L, - # 
'filesystem': 'hdfs://hostname:port', - # 'remaining': 71186818453504L, - # 'corrupt_blocks': 0L} - - self.gauge('hdfs.used', stats['used'], tags=tags) - self.gauge('hdfs.free', stats['remaining'], tags=tags) - self.gauge('hdfs.capacity', stats['capacity'], tags=tags) - self.gauge('hdfs.in_use', float(stats['used']) / - float(stats['capacity']), tags=tags) - self.gauge('hdfs.under_replicated', stats['under_replicated'], - tags=tags) - self.gauge('hdfs.missing_blocks', stats['missing_blocks'], tags=tags) - self.gauge('hdfs.corrupt_blocks', stats['corrupt_blocks'], tags=tags) diff --git a/py/checks/http_check.py b/py/checks/http_check.py deleted file mode 100644 index d18a71e49f55..000000000000 --- a/py/checks/http_check.py +++ /dev/null @@ -1,415 +0,0 @@ -# stdlib -from datetime import datetime -import os.path -import re -import socket -import ssl -import time -import warnings -from urlparse import urlparse - -# 3rd party -import requests -import tornado - -from requests.adapters import HTTPAdapter -from requests.packages import urllib3 -from requests.packages.urllib3.util import ssl_ - -from requests.packages.urllib3.exceptions import ( - SecurityWarning, -) -from requests.packages.urllib3.packages.ssl_match_hostname import \ - match_hostname - -# project -from checks.network_checks import EventType, NetworkCheck, Status -from config import _is_affirmative -from util import headers as agent_headers - - -class WeakCiphersHTTPSConnection(urllib3.connection.VerifiedHTTPSConnection): - - SUPPORTED_CIPHERS = ( - 'ECDH+AESGCM:DH+AESGCM:ECDH+AES256:DH+AES256:ECDH+AES128:DH+AES:' - 'ECDH+HIGH:DH+HIGH:ECDH+3DES:DH+3DES:RSA+AESGCM:RSA+AES:RSA+HIGH:' - 'RSA+3DES:ECDH+RC4:DH+RC4:RSA+RC4:!aNULL:!eNULL:!EXP:-MD5:RSA+RC4+MD5' - ) - - def __init__(self, host, port, ciphers=None, **kwargs): - self.ciphers = ciphers if ciphers is not None else self.SUPPORTED_CIPHERS - super(WeakCiphersHTTPSConnection, self).__init__(host, port, **kwargs) - - def connect(self): - # Add certificate verification - conn = self._new_conn() - - resolved_cert_reqs = ssl_.resolve_cert_reqs(self.cert_reqs) - resolved_ssl_version = ssl_.resolve_ssl_version(self.ssl_version) - - hostname = self.host - if getattr(self, '_tunnel_host', None): - # _tunnel_host was added in Python 2.6.3 - # (See: - # http://hg.python.org/cpython/rev/0f57b30a152f) - # - # However this check is still necessary in 2.7.x - - self.sock = conn - # Calls self._set_hostport(), so self.host is - # self._tunnel_host below. - self._tunnel() - # Mark this connection as not reusable - self.auto_open = 0 - - # Override the host with the one we're requesting data from. - hostname = self._tunnel_host - - # Wrap socket using verification with the root certs in trusted_root_certs - self.sock = ssl_.ssl_wrap_socket(conn, self.key_file, self.cert_file, - cert_reqs=resolved_cert_reqs, - ca_certs=self.ca_certs, - server_hostname=hostname, - ssl_version=resolved_ssl_version, - ciphers=self.ciphers) - - if self.assert_fingerprint: - ssl_.assert_fingerprint(self.sock.getpeercert(binary_form=True), self.assert_fingerprint) - elif resolved_cert_reqs != ssl.CERT_NONE \ - and self.assert_hostname is not False: - cert = self.sock.getpeercert() - if not cert.get('subjectAltName', ()): - warnings.warn(( - 'Certificate has no `subjectAltName`, falling back to check for a `commonName` for now. ' - 'This feature is being removed by major browsers and deprecated by RFC 2818. 
' - '(See https://github.com/shazow/urllib3/issues/497 for details.)'), - SecurityWarning - ) - match_hostname(cert, self.assert_hostname or hostname) - - self.is_verified = (resolved_cert_reqs == ssl.CERT_REQUIRED - or self.assert_fingerprint is not None) - - -class WeakCiphersHTTPSConnectionPool(urllib3.connectionpool.HTTPSConnectionPool): - - ConnectionCls = WeakCiphersHTTPSConnection - - -class WeakCiphersPoolManager(urllib3.poolmanager.PoolManager): - - def _new_pool(self, scheme, host, port): - if scheme == 'https': - return WeakCiphersHTTPSConnectionPool(host, port, **(self.connection_pool_kw)) - return super(WeakCiphersPoolManager, self)._new_pool(scheme, host, port) - - -class WeakCiphersAdapter(HTTPAdapter): - """"Transport adapter" that allows us to use TLS_RSA_WITH_RC4_128_MD5.""" - - def init_poolmanager(self, connections, maxsize, block=False, **pool_kwargs): - # Rewrite of the - # requests.adapters.HTTPAdapter.init_poolmanager method - # to use WeakCiphersPoolManager instead of - # urllib3's PoolManager - self._pool_connections = connections - self._pool_maxsize = maxsize - self._pool_block = block - - self.poolmanager = WeakCiphersPoolManager(num_pools=connections, - maxsize=maxsize, block=block, strict=True, **pool_kwargs) - - -def get_ca_certs_path(): - """ - Get a path to the trusted certificates of the system - """ - CA_CERTS = [ - '/opt/datadog-agent/embedded/ssl/certs/cacert.pem', - os.path.join(os.path.dirname(tornado.__file__), 'ca-certificates.crt'), - '/etc/ssl/certs/ca-certificates.crt', - ] - - for f in CA_CERTS: - if os.path.exists(f): - return f - return None - - -class HTTPCheck(NetworkCheck): - SOURCE_TYPE_NAME = 'system' - SC_STATUS = 'http.can_connect' - SC_SSL_CERT = 'http.ssl_cert' - - def __init__(self, name, init_config, agentConfig, instances): - self.ca_certs = init_config.get('ca_certs', get_ca_certs_path()) - NetworkCheck.__init__(self, name, init_config, agentConfig, instances) - - def _load_conf(self, instance): - # Fetches the conf - tags = instance.get('tags', []) - username = instance.get('username') - password = instance.get('password') - http_response_status_code = str(instance.get('http_response_status_code', "(1|2|3)\d\d")) - timeout = int(instance.get('timeout', 10)) - config_headers = instance.get('headers', {}) - headers = agent_headers(self.agentConfig) - headers.update(config_headers) - url = instance.get('url') - content_match = instance.get('content_match') - response_time = _is_affirmative(instance.get('collect_response_time', True)) - if not url: - raise Exception("Bad configuration. 
You must specify a url") - include_content = _is_affirmative(instance.get('include_content', False)) - ssl = _is_affirmative(instance.get('disable_ssl_validation', True)) - ssl_expire = _is_affirmative(instance.get('check_certificate_expiration', True)) - instance_ca_certs = instance.get('ca_certs', self.ca_certs) - weakcipher = _is_affirmative(instance.get('weakciphers', False)) - - return url, username, password, http_response_status_code, timeout, include_content,\ - headers, response_time, content_match, tags, ssl, ssl_expire, instance_ca_certs,\ - weakcipher - - def _check(self, instance): - addr, username, password, http_response_status_code, timeout, include_content, headers,\ - response_time, content_match, tags, disable_ssl_validation,\ - ssl_expire, instance_ca_certs, weakcipher = self._load_conf(instance) - start = time.time() - - service_checks = [] - try: - parsed_uri = urlparse(addr) - self.log.debug("Connecting to %s" % addr) - if disable_ssl_validation and parsed_uri.scheme == "https": - self.warning("Skipping SSL certificate validation for %s based on configuration" - % addr) - - auth = None - if username is not None and password is not None: - auth = (username, password) - - sess = requests.Session() - if weakcipher: - base_addr = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri) - sess.mount(base_addr, WeakCiphersAdapter()) - self.log.debug("Weak Ciphers will be used for {0}. Suppoted Cipherlist: {1}".format( - base_addr, WeakCiphersHTTPSConnection.SUPPORTED_CIPHERS)) - - r = sess.request('GET', addr, auth=auth, timeout=timeout, headers=headers, - verify=False if disable_ssl_validation else instance_ca_certs) - - except (socket.timeout, requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e: - length = int((time.time() - start) * 1000) - self.log.info("%s is DOWN, error: %s. Connection failed after %s ms" - % (addr, str(e), length)) - service_checks.append(( - self.SC_STATUS, - Status.DOWN, - "%s. Connection failed after %s ms" % (str(e), length) - )) - - except socket.error, e: - length = int((time.time() - start) * 1000) - self.log.info("%s is DOWN, error: %s. Connection failed after %s ms" - % (addr, repr(e), length)) - service_checks.append(( - self.SC_STATUS, - Status.DOWN, - "Socket error: %s. Connection failed after %s ms" % (repr(e), length) - )) - - except Exception, e: - length = int((time.time() - start) * 1000) - self.log.error("Unhandled exception %s. Connection failed after %s ms" - % (str(e), length)) - raise - - # Only report this metric if the site is not down - if response_time and not service_checks: - # Stop the timer as early as possible - running_time = time.time() - start - # Store tags in a temporary list so that we don't modify the global tags data structure - tags_list = list(tags) - tags_list.append('url:%s' % addr) - self.gauge('network.http.response_time', running_time, tags=tags_list) - - # Check HTTP response status code - if not (service_checks or re.match(http_response_status_code, str(r.status_code))): - self.log.info("Incorrect HTTP return code. Expected %s, got %s" - % (http_response_status_code, str(r.status_code))) - - service_checks.append(( - self.SC_STATUS, - Status.DOWN, - "Incorrect HTTP return code. 
Expected %s, got %s" - % (http_response_status_code, str(r.status_code)) - )) - - if not service_checks: - # Host is UP - # Check content matching is set - if content_match: - content = r.content - if re.search(content_match, content, re.UNICODE): - self.log.debug("%s is found in return content" % content_match) - service_checks.append(( - self.SC_STATUS, Status.UP, "UP" - )) - else: - self.log.info("%s not found in content" % content_match) - self.log.debug("Content returned:\n%s" % content) - service_checks.append(( - self.SC_STATUS, - Status.DOWN, - 'Content "%s" not found in response' % content_match - )) - else: - self.log.debug("%s is UP" % addr) - service_checks.append(( - self.SC_STATUS, Status.UP, "UP" - )) - - if ssl_expire and parsed_uri.scheme == "https": - status, msg = self.check_cert_expiration(instance, timeout, instance_ca_certs) - service_checks.append(( - self.SC_SSL_CERT, status, msg - )) - - return service_checks - - # FIXME: 5.3 drop this function - def _create_status_event(self, sc_name, status, msg, instance): - # Create only this deprecated event for old check - if sc_name != self.SC_STATUS: - return - # Get the instance settings - url = instance.get('url', None) - name = instance.get('name', None) - nb_failures = self.statuses[name][sc_name].count(Status.DOWN) - nb_tries = len(self.statuses[name][sc_name]) - tags = instance.get('tags', []) - tags_list = [] - tags_list.extend(tags) - tags_list.append('url:%s' % url) - - # Get a custom message that will be displayed in the event - custom_message = instance.get('message', "") - if custom_message: - custom_message += " \n" - - # Let the possibility to override the source type name - instance_source_type_name = instance.get('source_type', None) - if instance_source_type_name is None: - source_type = "%s.%s" % (NetworkCheck.SOURCE_TYPE_NAME, name) - else: - source_type = "%s.%s" % (NetworkCheck.SOURCE_TYPE_NAME, instance_source_type_name) - - # Get the handles you want to notify - notify = instance.get('notify', self.init_config.get('notify', [])) - notify_message = "" - if notify: - notify_list = [] - for handle in notify: - notify_list.append("@%s" % handle.strip()) - notify_message = " ".join(notify_list) + " \n" - - if status == Status.DOWN: - # format the HTTP response body into the event - if isinstance(msg, tuple): - code, reason, content = msg - - # truncate and html-escape content - if len(content) > 200: - content = content[:197] + '...' 
- - msg = u"%d %s\n\n%s" % (code, reason, content) - msg = msg.rstrip() - - title = "[Alert] %s reported that %s is down" % (self.hostname, name) - alert_type = "error" - msg = u"%s %s %s reported that %s (%s) failed %s time(s) within %s last attempt(s)."\ - " Last error: %s" % (notify_message, custom_message, self.hostname, - name, url, nb_failures, nb_tries, msg) - event_type = EventType.DOWN - - else: # Status is UP - title = "[Recovered] %s reported that %s is up" % (self.hostname, name) - alert_type = "success" - msg = u"%s %s %s reported that %s (%s) recovered" \ - % (notify_message, custom_message, self.hostname, name, url) - event_type = EventType.UP - - return { - 'timestamp': int(time.time()), - 'event_type': event_type, - 'host': self.hostname, - 'msg_text': msg, - 'msg_title': title, - 'alert_type': alert_type, - "source_type_name": source_type, - "event_object": name, - "tags": tags_list - } - - def report_as_service_check(self, sc_name, status, instance, msg=None): - instance_name = self.normalize(instance['name']) - url = instance.get('url', None) - sc_tags = ['url:{0}'.format(url), "instance:{0}".format(instance_name)] - custom_tags = instance.get('tags', []) - tags = sc_tags + custom_tags - - if sc_name == self.SC_STATUS: - # format the HTTP response body into the event - if isinstance(msg, tuple): - code, reason, content = msg - - # truncate and html-escape content - if len(content) > 200: - content = content[:197] + '...' - - msg = u"%d %s\n\n%s" % (code, reason, content) - msg = msg.rstrip() - - self.service_check(sc_name, - NetworkCheck.STATUS_TO_SERVICE_CHECK[status], - tags=tags, - message=msg - ) - - def check_cert_expiration(self, instance, timeout, instance_ca_certs): - warning_days = int(instance.get('days_warning', 14)) - critical_days = int(instance.get('days_critical', 7)) - url = instance.get('url') - - o = urlparse(url) - host = o.hostname - - port = o.port or 443 - - try: - sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - sock.settimeout(float(timeout)) - sock.connect((host, port)) - ssl_sock = ssl.wrap_socket(sock, cert_reqs=ssl.CERT_REQUIRED, - ca_certs=instance_ca_certs) - cert = ssl_sock.getpeercert() - - except Exception as e: - return Status.DOWN, "%s" % (str(e)) - - exp_date = datetime.strptime(cert['notAfter'], "%b %d %H:%M:%S %Y %Z") - days_left = exp_date - datetime.utcnow() - - if days_left.days < 0: - return Status.DOWN, "Expired by {0} days".format(days_left.days) - - elif days_left.days < critical_days: - return Status.CRITICAL, "This cert TTL is critical: only {0} days before it expires"\ - .format(days_left.days) - - elif days_left.days < warning_days: - return Status.WARNING, "This cert is almost expired, only {0} days left"\ - .format(days_left.days) - - else: - return Status.UP, "Days left: {0}".format(days_left.days) diff --git a/py/checks/iis.py b/py/checks/iis.py deleted file mode 100644 index 073455633937..000000000000 --- a/py/checks/iis.py +++ /dev/null @@ -1,114 +0,0 @@ -''' -Check the performance counters from IIS -''' -# 3rd party -import wmi - -# project -from checks import AgentCheck - - -class IIS(AgentCheck): - METRICS = [ - ('iis.uptime', 'gauge', 'ServiceUptime'), - - # Network - ('iis.net.bytes_sent', 'rate', 'TotalBytesSent'), - ('iis.net.bytes_rcvd', 'rate', 'TotalBytesReceived'), - ('iis.net.bytes_total', 'rate', 'TotalBytesTransferred'), - ('iis.net.num_connections', 'gauge', 'CurrentConnections'), - ('iis.net.files_sent', 'rate', 'TotalFilesSent'), - ('iis.net.files_rcvd', 'rate', 'TotalFilesReceived'), - 
('iis.net.connection_attempts', 'rate', 'TotalConnectionAttemptsAllInstances'), - - # HTTP Methods - ('iis.httpd_request_method.get', 'rate', 'TotalGetRequests'), - ('iis.httpd_request_method.post', 'rate', 'TotalPostRequests'), - ('iis.httpd_request_method.head', 'rate', 'TotalHeadRequests'), - ('iis.httpd_request_method.put', 'rate', 'TotalPutRequests'), - ('iis.httpd_request_method.delete', 'rate', 'TotalDeleteRequests'), - ('iis.httpd_request_method.options', 'rate', 'TotalOptionsRequests'), - ('iis.httpd_request_method.trace', 'rate', 'TotalTraceRequests'), - - # Errors - ('iis.errors.not_found', 'rate', 'TotalNotFoundErrors'), - ('iis.errors.locked', 'rate', 'TotalLockedErrors'), - - # Users - ('iis.users.anon', 'rate', 'TotalAnonymousUsers'), - ('iis.users.nonanon', 'rate', 'TotalNonAnonymousUsers'), - - # Requests - ('iis.requests.cgi', 'rate', 'TotalCGIRequests'), - ('iis.requests.isapi', 'rate', 'TotalISAPIExtensionRequests'), - ] - SERVICE_CHECK = "iis.site_up" - - def __init__(self, name, init_config, agentConfig, instances): - AgentCheck.__init__(self, name, init_config, agentConfig, instances) - self.wmi_conns = {} - - def _get_wmi_conn(self, host, user, password): - key = "%s:%s:%s" % (host, user, password) - if key not in self.wmi_conns: - self.wmi_conns[key] = wmi.WMI(host, user=user, password=password) - return self.wmi_conns[key] - - def check(self, instance): - # Connect to the WMI provider - host = instance.get('host', None) - user = instance.get('username', None) - password = instance.get('password', None) - instance_tags = instance.get('tags', []) - sites = instance.get('sites', ['_Total']) - w = self._get_wmi_conn(host, user, password) - - try: - wmi_cls = w.Win32_PerfFormattedData_W3SVC_WebService() - if not wmi_cls: - raise Exception('Missing data from Win32_PerfFormattedData_W3SVC_WebService') - except Exception: - self.log.exception('Unable to fetch Win32_PerfFormattedData_W3SVC_WebService class') - return - - expected_sites = set(sites) - # Iterate over every IIS site - for iis_site in wmi_cls: - # Skip any sites we don't specifically want. - if iis_site.Name not in sites: - continue - - # Tag with the site name if we're not using the aggregate - if iis_site.Name != '_Total': - tags = instance_tags + ['site:%s' % iis_site.Name] - else: - tags = instance_tags - - status = AgentCheck.CRITICAL if iis_site.ServiceUptime == 0 else AgentCheck.OK - self.service_check("iis.site_up", status, tags=['site:%s' % iis_site.Name]) - expected_sites.remove(iis_site.Name) - - for metric, mtype, wmi_val in self.METRICS: - if not hasattr(iis_site, wmi_val): - if wmi_val == 'TotalBytesTransferred' and hasattr(iis_site, - 'TotalBytesTransfered'): - # Windows 2008 sp2 reports it as TotalbytesTransfered - # instead of TotalBytesTransferred (single r) - wmi_val = 'TotalBytesTransfered' - elif wmi_val == 'TotalConnectionAttemptsAllInstances' \ - and hasattr(iis_site, 'TotalConnectionAttemptsallinstances'): - wmi_val = 'TotalConnectionAttemptsallinstances' - else: - self.warning("Unable to fetch metric %s. 
Missing %s in " - "Win32_PerfFormattedData_W3SVC_WebService" - % (metric, wmi_val)) - continue - - # Submit the metric value with the correct type - value = float(getattr(iis_site, wmi_val)) - metric_func = getattr(self, mtype) - metric_func(metric, value, tags=tags) - - for remaining_site in expected_sites: - self.service_check("iis.site_up", AgentCheck.CRITICAL, - tags=['site:%s' % remaining_site]) diff --git a/py/checks/jenkins.py b/py/checks/jenkins.py deleted file mode 100644 index 813b24d5a68e..000000000000 --- a/py/checks/jenkins.py +++ /dev/null @@ -1,182 +0,0 @@ -# stdlib -from collections import defaultdict -from glob import glob -import os -import time -from xml.etree.ElementTree import ElementTree - -# project -from checks import AgentCheck -from util import get_hostname - - -class Skip(Exception): - """ - Raised by :class:`Jenkins` when it comes across - a build or job that should be excluded from being checked. - """ - def __init__(self, reason, dir_name): - message = 'skipping build or job at %s because %s' % (dir_name, reason) - Exception.__init__(self, message) - - -class Jenkins(AgentCheck): - datetime_format = '%Y-%m-%d_%H-%M-%S' - - def __init__(self, name, init_config, agentConfig): - AgentCheck.__init__(self, name, init_config, agentConfig) - self.high_watermarks = {} - - def _timestamp_from_build_file(self, dir_name, tree): - timestamp = tree.find('timestamp') - if timestamp is None or not timestamp.text: - raise Skip('the timestamp cannot be found', dir_name) - else: - return int(timestamp.text) / 1000.0 - - def _timestamp_from_dirname(self, dir_name): - if not os.path.isdir(dir_name): - raise Skip('its not a build directory', dir_name) - - try: - # Parse the timestamp from the directory name - date_str = os.path.basename(dir_name) - time_tuple = time.strptime(date_str, self.datetime_format) - return time.mktime(time_tuple) - except ValueError: - return None - - def _get_build_metadata(self, dir_name, watermark): - if os.path.exists(os.path.join(dir_name, 'jenkins_build.tar.gz')): - raise Skip('the build has already been archived', dir_name) - timestamp = self._timestamp_from_dirname(dir_name) - # This is not the latest build - if timestamp is not None and timestamp <= watermark: - return None - # Read the build.xml metadata file that Jenkins generates - build_metadata = os.path.join(dir_name, 'build.xml') - - if not os.access(build_metadata, os.R_OK): - self.log.debug("Can't read build file at %s" % (build_metadata)) - raise Exception("Can't access build.xml at %s" % (build_metadata)) - else: - tree = ElementTree() - tree.parse(build_metadata) - if timestamp is None: - try: - timestamp = self._timestamp_from_build_file(dir_name, tree) - # This is not the latest build - if timestamp <= watermark: - return None - except ValueError: - return None - keys = ['result', 'number', 'duration'] - - kv_pairs = ((k, tree.find(k)) for k in keys) - d = dict([(k, v.text) for k, v in kv_pairs if v is not None]) - d['timestamp'] = timestamp - - try: - d['branch'] = tree.find('actions')\ - .find('hudson.plugins.git.util.BuildData')\ - .find('buildsByBranchName')\ - .find('entry')\ - .find('hudson.plugins.git.util.Build')\ - .find('revision')\ - .find('branches')\ - .find('hudson.plugins.git.Branch')\ - .find('name')\ - .text - except Exception: - pass - return d - - def _get_build_results(self, instance_key, job_dir): - job_name = os.path.basename(job_dir) - try: - dirs = glob(os.path.join(job_dir, 'builds', '*_*')) - # Before Jenkins v1.597 the build folders were named with a 
timestamp (eg: 2015-03-10_19-59-29) - # Starting from Jenkins v1.597 they are named after the build ID (1, 2, 3...) - # So we need try both format when trying to find the latest build and parsing build.xml - if len(dirs) == 0: - dirs = glob(os.path.join(job_dir, 'builds', '[0-9]*')) - if len(dirs) > 0: - # versions of Jenkins > 1.597 need to be sorted by build number (integer) - try: - dirs = sorted(dirs, key=lambda x: int(x.split('/')[-1]), reverse=True) - except ValueError: - dirs = sorted(dirs, reverse=True) - # We try to get the last valid build - for dir_name in dirs: - watermark = self.high_watermarks[instance_key][job_name] - try: - build_metadata = self._get_build_metadata(dir_name, watermark) - except Exception: - build_metadata = None - if build_metadata is not None: - build_result = build_metadata.get('result') - if build_result is None: - break - - output = { - 'job_name': job_name, - 'event_type': 'build result' - } - - output.update(build_metadata) - if 'number' not in output: - output['number'] = dir_name.split('/')[-1] - self.high_watermarks[instance_key][job_name] = output.get('timestamp') - self.log.debug("Processing %s results '%s'" % (job_name, output)) - yield output - - # If it not a new build, stop here - else: - break - except Exception, e: - self.log.error("Error while working on job %s, exception: %s" % (job_name, e)) - - def check(self, instance, create_event=True): - if self.high_watermarks.get(instance.get('name'), None) is None: - # On the first run of check(), prime the high_watermarks dict - # so that we only send events that occured after the agent - # started. - # (Setting high_watermarks in the next statement prevents - # any kind of infinite loop (assuming nothing ever sets - # high_watermarks to None again!)) - self.high_watermarks[instance.get('name')] = defaultdict(lambda: 0) - self.check(instance, create_event=False) - - jenkins_home = instance.get('jenkins_home') - - if not jenkins_home: - raise Exception("No jenkins_home directory set in the config file") - - jenkins_jobs_dir = os.path.join(jenkins_home, 'jobs', '*') - job_dirs = glob(jenkins_jobs_dir) - - if not job_dirs: - raise Exception('No jobs found in `%s`! 
' - 'Check `jenkins_home` in your config' % (jenkins_jobs_dir)) - - for job_dir in job_dirs: - for output in self._get_build_results(instance.get('name'), job_dir): - output['host'] = get_hostname(self.agentConfig) - if create_event: - self.log.debug("Creating event for job: %s" % output['job_name']) - self.event(output) - - tags = [ - 'job_name:%s' % output['job_name'], - 'result:%s' % output['result'], - 'build_number:%s' % output['number'] - ] - - if 'branch' in output: - tags.append('branch:%s' % output['branch']) - self.gauge("jenkins.job.duration", float(output['duration'])/1000.0, tags=tags) - - if output['result'] == 'SUCCESS': - self.increment('jenkins.job.success', tags=tags) - else: - self.increment('jenkins.job.failure', tags=tags) diff --git a/py/checks/kafka_consumer.py b/py/checks/kafka_consumer.py deleted file mode 100644 index 8497768d3bc9..000000000000 --- a/py/checks/kafka_consumer.py +++ /dev/null @@ -1,124 +0,0 @@ -# stdlib -from collections import defaultdict - -# 3p -from kafka.client import KafkaClient -from kafka.common import OffsetRequest -from kazoo.client import KazooClient -from kazoo.exceptions import NoNodeError - -# project -from checks import AgentCheck - -DEFAULT_KAFKA_TIMEOUT = 5 -DEFAULT_ZK_TIMEOUT = 5 - - -class KafkaCheck(AgentCheck): - - SOURCE_TYPE_NAME = 'kafka' - - def __init__(self, name, init_config, agentConfig, instances=None): - AgentCheck.__init__(self, name, init_config, agentConfig, instances=instances) - self.zk_timeout = int( - init_config.get('zk_timeout', DEFAULT_ZK_TIMEOUT)) - self.kafka_timeout = int( - init_config.get('kafka_timeout', DEFAULT_KAFKA_TIMEOUT)) - - def check(self, instance): - consumer_groups = self.read_config(instance, 'consumer_groups', - cast=self._validate_consumer_groups) - zk_connect_str = self.read_config(instance, 'zk_connect_str') - kafka_host_ports = self.read_config(instance, 'kafka_connect_str') - - # Construct the Zookeeper path pattern - zk_prefix = instance.get('zk_prefix', '') - zk_path_tmpl = zk_prefix + '/consumers/%s/offsets/%s/%s' - - # Connect to Zookeeper - zk_conn = KazooClient(zk_connect_str, timeout=self.zk_timeout) - zk_conn.start() - - try: - # Query Zookeeper for consumer offsets - consumer_offsets = {} - topics = defaultdict(set) - for consumer_group, topic_partitions in consumer_groups.iteritems(): - for topic, partitions in topic_partitions.iteritems(): - # Remember the topic partitions that we've see so that we can - # look up their broker offsets later - topics[topic].update(set(partitions)) - for partition in partitions: - zk_path = zk_path_tmpl % (consumer_group, topic, partition) - try: - consumer_offset = int(zk_conn.get(zk_path)[0]) - key = (consumer_group, topic, partition) - consumer_offsets[key] = consumer_offset - except NoNodeError: - self.log.warn('No zookeeper node at %s' % zk_path) - except Exception: - self.log.exception('Could not read consumer offset from %s' % zk_path) - finally: - try: - zk_conn.stop() - zk_conn.close() - except Exception: - self.log.exception('Error cleaning up Zookeeper connection') - - # Connect to Kafka - kafka_conn = KafkaClient(kafka_host_ports, timeout=self.kafka_timeout) - - try: - # Query Kafka for the broker offsets - broker_offsets = {} - for topic, partitions in topics.items(): - offset_responses = kafka_conn.send_offset_request([ - OffsetRequest(topic, p, -1, 1) for p in partitions]) - - for resp in offset_responses: - broker_offsets[(resp.topic, resp.partition)] = resp.offsets[0] - finally: - try: - kafka_conn.close() - except Exception: 
- self.log.exception('Error cleaning up Kafka connection') - - # Report the broker data - for (topic, partition), broker_offset in broker_offsets.items(): - broker_tags = ['topic:%s' % topic, 'partition:%s' % partition] - broker_offset = broker_offsets.get((topic, partition)) - self.gauge('kafka.broker_offset', broker_offset, tags=broker_tags) - - # Report the consumer - for (consumer_group, topic, partition), consumer_offset in consumer_offsets.items(): - - # Get the broker offset - broker_offset = broker_offsets.get((topic, partition)) - - # Report the consumer offset and lag - tags = ['topic:%s' % topic, 'partition:%s' % partition, - 'consumer_group:%s' % consumer_group] - self.gauge('kafka.consumer_offset', consumer_offset, tags=tags) - self.gauge('kafka.consumer_lag', broker_offset - consumer_offset, - tags=tags) - - # Private config validation/marshalling functions - - def _validate_consumer_groups(self, val): - try: - consumer_group, topic_partitions = val.items()[0] - assert isinstance(consumer_group, (str, unicode)) - topic, partitions = topic_partitions.items()[0] - assert isinstance(topic, (str, unicode)) - assert isinstance(partitions, (list, tuple)) - return val - except Exception, e: - self.log.exception(e) - raise Exception('''The `consumer_groups` value must be a mapping of mappings, like this: -consumer_groups: - myconsumer0: # consumer group name - mytopic0: [0, 1] # topic: list of partitions - myconsumer1: - mytopic0: [0, 1, 2] - mytopic1: [10, 12] -''') diff --git a/py/checks/kubernetes.py b/py/checks/kubernetes.py deleted file mode 100644 index 6a57c813dd33..000000000000 --- a/py/checks/kubernetes.py +++ /dev/null @@ -1,249 +0,0 @@ -"""kubernetes check -Collects metrics from cAdvisor instance -""" -# stdlib -import numbers -import socket -import struct -from fnmatch import fnmatch -import re - -# 3rd party -import requests - -# project -from checks import AgentCheck -from config import _is_affirmative -from utils.kubeutil import set_kube_settings, get_kube_settings, get_kube_labels -from utils.http import retrieve_json - -NAMESPACE = "kubernetes" -DEFAULT_MAX_DEPTH = 10 - -DEFAULT_USE_HISTOGRAM = False -DEFAULT_PUBLISH_ALIASES = False -DEFAULT_ENABLED_RATES = [ - 'diskio.io_service_bytes.stats.total', - 'network.??_bytes', - 'cpu.*.total'] - -NET_ERRORS = ['rx_errors', 'tx_errors', 'rx_dropped', 'tx_dropped'] - -DEFAULT_ENABLED_GAUGES = [ - 'memory.usage', - 'filesystem.usage'] - -GAUGE = AgentCheck.gauge -RATE = AgentCheck.rate -HISTORATE = AgentCheck.generate_historate_func(["container_name"]) -HISTO = AgentCheck.generate_histogram_func(["container_name"]) -FUNC_MAP = { - GAUGE: {True: HISTO, False: GAUGE}, - RATE: {True: HISTORATE, False: RATE} -} - -class Kubernetes(AgentCheck): - """ Collect metrics and events from kubelet """ - - pod_names_by_container = {} - - def __init__(self, name, init_config, agentConfig, instances=None): - if instances is not None and len(instances) > 1: - raise Exception('Kubernetes check only supports one configured instance.') - AgentCheck.__init__(self, name, init_config, agentConfig, instances) - self.kube_settings = set_kube_settings(instances[0]) - - def _get_default_router(self): - try: - with open('/proc/net/route') as f: - for line in f.readlines(): - fields = line.strip().split() - if fields[1] == '00000000': - return socket.inet_ntoa(struct.pack('= self.max_depth: - self.log.warning('Reached max depth on metric=%s' % metric) - return - - if isinstance(dat, numbers.Number): - if self.enabled_rates and any([fnmatch(metric, pat) 
for pat in self.enabled_rates]): - self.publish_rate(self, metric, float(dat), tags) - elif self.enabled_gauges and any([fnmatch(metric, pat) for pat in self.enabled_gauges]): - self.publish_gauge(self, metric, float(dat), tags) - - elif isinstance(dat, dict): - for k,v in dat.iteritems(): - self._publish_raw_metrics(metric + '.%s' % k.lower(), v, tags, depth + 1) - - elif isinstance(dat, list): - self._publish_raw_metrics(metric, dat[-1], tags, depth + 1) - - @staticmethod - def _shorten_name(name): - # shorten docker image id - return re.sub('([0-9a-fA-F]{64,})', lambda x: x.group(1)[0:12], name) - - def _update_container_metrics(self, instance, subcontainer, kube_labels): - tags = instance.get('tags', []) # add support for custom tags - - if len(subcontainer.get('aliases', [])) >= 1: - # The first alias seems to always match the docker container name - container_name = subcontainer['aliases'][0] - else: - # We default to the container id - container_name = subcontainer['name'] - - tags.append('container_name:%s' % container_name) - - pod_name_set = False - try: - for label_name,label in subcontainer['spec']['labels'].iteritems(): - label_name = label_name.replace('io.kubernetes.pod.name', 'pod_name') - if label_name == "pod_name": - pod_name_set = True - pod_labels = kube_labels.get(label) - if pod_labels: - tags.extend(list(pod_labels)) - - if "-" in label: - replication_controller = "-".join( - label.split("-")[:-1]) - if "/" in replication_controller: - namespace, replication_controller = replication_controller.split("/", 1) - tags.append("kube_namespace:%s" % namespace) - - tags.append("kube_replication_controller:%s" % replication_controller) - tags.append('%s:%s' % (label_name, label)) - except KeyError: - pass - - if not pod_name_set: - tags.append("pod_name:no_pod") - - if self.publish_aliases and subcontainer.get("aliases"): - for alias in subcontainer['aliases'][1:]: - # we don't add the first alias as it will be the container_name - tags.append('container_alias:%s' % (self._shorten_name(alias))) - - stats = subcontainer['stats'][-1] # take the latest - self._publish_raw_metrics(NAMESPACE, stats, tags) - - if subcontainer.get("spec", {}).get("has_filesystem"): - fs = stats['filesystem'][-1] - fs_utilization = float(fs['usage'])/float(fs['capacity']) - self.publish_gauge(self, NAMESPACE + '.filesystem.usage_pct', fs_utilization, tags) - - if subcontainer.get("spec", {}).get("has_network"): - net = stats['network'] - self.publish_rate(self, NAMESPACE + '.network_errors', - sum(float(net[x]) for x in NET_ERRORS), - tags) - - def _retrieve_metrics(self, url): - return retrieve_json(url) - - def _retrieve_kube_labels(self): - return get_kube_labels() - - - def _update_metrics(self, instance, kube_settings): - metrics = self._retrieve_metrics(kube_settings["metrics_url"]) - kube_labels = self._retrieve_kube_labels() - if not metrics: - raise Exception('No metrics retrieved cmd=%s' % self.metrics_cmd) - - for subcontainer in metrics: - try: - self._update_container_metrics(instance, subcontainer, kube_labels) - except Exception, e: - self.log.error("Unable to collect metrics for container: {0} ({1}".format( - subcontainer.get('name'), e)) diff --git a/py/checks/kyototycoon.py b/py/checks/kyototycoon.py deleted file mode 100644 index c312fc8d1bc5..000000000000 --- a/py/checks/kyototycoon.py +++ /dev/null @@ -1,115 +0,0 @@ -# stdlib -from collections import defaultdict -import re - -# 3rd party -import requests - -# project -from checks import AgentCheck - -db_stats = 
re.compile(r'^db_(\d)+$') -whitespace = re.compile(r'\s') - - -class KyotoTycoonCheck(AgentCheck): - """Report statistics about the Kyoto Tycoon DBM-style - database server (http://fallabs.com/kyototycoon/) - """ - SOURCE_TYPE_NAME = 'kyoto tycoon' - SERVICE_CHECK_NAME = 'kyototycoon.can_connect' - - GAUGES = { - 'repl_delay': 'replication.delay', - 'serv_thread_count': 'threads', - } - - RATES = { - 'serv_conn_count': 'connections', - 'cnt_get': 'ops.get.hits', - 'cnt_get_misses': 'ops.get.misses', - 'cnt_set': 'ops.set.hits', - 'cnt_set_misses': 'ops.set.misses', - 'cnt_remove': 'ops.del.hits', - 'cnt_remove_misses': 'ops.del.misses', - } - - DB_GAUGES = { - 'count': 'records', - 'size': 'size', - } - TOTALS = { - 'cnt_get': 'ops.get.total', - 'cnt_get_misses': 'ops.get.total', - 'cnt_set': 'ops.set.total', - 'cnt_set_misses': 'ops.set.total', - 'cnt_remove': 'ops.del.total', - 'cnt_remove_misses': 'ops.del.total', - } - - def check(self, instance): - url = instance.get('report_url') - if not url: - raise Exception('Invalid Kyoto Tycoon report url %r' % url) - - tags = instance.get('tags', {}) - name = instance.get('name') - - # generate the formatted list of tags - tags = ['%s:%s' % (k, v) for k, v in tags.items()] - if name is not None: - tags.append('instance:%s' % name) - - service_check_tags = [] - if name is not None: - service_check_tags.append('instance:%s' % name) - - - try: - r = requests.get(url) - r.raise_for_status() - except requests.exceptions.HTTPError as e: - self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, - tags=service_check_tags, message=str(e.message)) - raise - except Exception as e: - self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, - tags=service_check_tags, message=str(e)) - raise - else: - self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.OK, - tags=service_check_tags) - - body = r.content - - totals = defaultdict(int) - for line in body.splitlines(): - if '\t' not in line: - continue - - key, value = line.strip().split('\t', 1) - if key in self.GAUGES: - name = self.GAUGES[key] - self.gauge('kyototycoon.%s' % name, float(value), tags=tags) - - elif key in self.RATES: - name = self.RATES[key] - self.rate('kyototycoon.%s_per_s' % name, float(value), tags=tags) - - elif db_stats.match(key): - # Also produce a per-db metrics tagged with the db - # number in addition to the default tags - m = db_stats.match(key) - dbnum = int(m.group(1)) - mytags = tags + ['db:%d' % dbnum] - for part in whitespace.split(value): - k, v = part.split('=', 1) - if k in self.DB_GAUGES: - name = self.DB_GAUGES[k] - self.gauge('kyototycoon.%s' % name, float(v), tags=mytags) - - if key in self.TOTALS: - totals[self.TOTALS[key]] += float(value) - - for key, value in totals.items(): - self.rate('kyototycoon.%s_per_s' % key, value, tags=tags) diff --git a/py/checks/lighttpd.py b/py/checks/lighttpd.py deleted file mode 100644 index a517ed5dfaee..000000000000 --- a/py/checks/lighttpd.py +++ /dev/null @@ -1,159 +0,0 @@ -# stdlib -import re -import urlparse - -# 3rd party -import requests - -# project -from checks import AgentCheck -from util import headers - -VERSION_REGEX = re.compile(r".*/(\d)") - - -class Lighttpd(AgentCheck): - """Tracks basic connection/requests/workers metrics - - See http://redmine.lighttpd.net/projects/1/wiki/Docs_ModStatus for Lighttpd details - See http://redmine.lighttpd.net/projects/lighttpd2/wiki/Mod_status for Lighttpd2 details - """ - - SERVICE_CHECK_NAME = 'lighttpd.can_connect' - - URL_SUFFIX_PER_VERSION = { - 1: '?auto', - 
2: '?format=plain', - 'Unknown': '?auto' - } - - GAUGES = { - 'IdleServers': 'lighttpd.performance.idle_server', - 'BusyServers': 'lighttpd.performance.busy_servers', - 'Uptime': 'lighttpd.performance.uptime', - 'Total kBytes': 'lighttpd.net.bytes', - 'Total Accesses': 'lighttpd.net.hits', - 'memory_usage': 'lighttpd.performance.memory_usage', - 'requests_avg': 'lighttpd.net.requests_avg', - 'traffic_out_avg': 'lighttpd.net.bytes_out_avg', - 'traffic_in_avg': 'lighttpd.net.bytes_in_avg', - 'connections_avg': 'lighttpd.net.connections_avg', - 'connection_state_start': 'lighttpd.connections.state_start', - 'connection_state_read_header': 'lighttpd.connections.state_read_header', - 'connection_state_handle_request': 'lighttpd.connections.state_handle_request', - 'connection_state_write_response': 'lighttpd.connections.state_write_response', - 'connection_state_keep_alive': 'lighttpd.connections.state_keep_alive', - 'requests_avg_5sec': 'lighttpd.net.requests_avg_5sec', - 'traffic_out_avg_5sec': 'lighttpd.net.bytes_out_avg_5sec', - 'traffic_in_avg_5sec': 'lighttpd.net.bytes_in_avg_5sec', - 'connections_avg_5sec': 'lighttpd.net.connections_avg_5sec', - } - - COUNTERS = { - 'requests_abs': 'lighttpd.net.requests_total', - 'traffic_out_abs': 'lighttpd.net.bytes_out', - 'traffic_in_abs': 'lighttpd.net.bytes_in', - 'connections_abs': 'lighttpd.net.connections_total', - 'status_1xx': 'lighttpd.response.status_1xx', - 'status_2xx': 'lighttpd.response.status_2xx', - 'status_3xx': 'lighttpd.response.status_3xx', - 'status_4xx': 'lighttpd.response.status_4xx', - 'status_5xx': 'lighttpd.response.status_5xx', - } - - RATES = { - 'Total kBytes': 'lighttpd.net.bytes_per_s', - 'Total Accesses': 'lighttpd.net.request_per_s' - } - - def __init__(self, name, init_config, agentConfig, instances=None): - AgentCheck.__init__(self, name, init_config, agentConfig, instances) - self.assumed_url = {} - - def check(self, instance): - if 'lighttpd_status_url' not in instance: - raise Exception("Missing 'lighttpd_status_url' variable in Lighttpd config") - - url = self.assumed_url.get(instance['lighttpd_status_url'], instance['lighttpd_status_url']) - - tags = instance.get('tags', []) - self.log.debug("Connecting to %s" % url) - - auth = None - if 'user' in instance and 'password' in instance: - auth = (instance['user'], instance['password']) - - # Submit a service check for status page availability. 
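# A minimal sketch (illustrative only; the URL is hypothetical) of how the
# host and port tags for that service check are derived just below:
#
#   import urlparse
#   parsed = urlparse.urlparse('http://localhost:9449/server-status?auto')
#   parsed.hostname          # -> 'localhost'
#   parsed.port or 80        # -> 9449
#
# which yields service_check_tags like ['host:localhost', 'port:9449'].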
- parsed_url = urlparse.urlparse(url) - lighttpd_url = parsed_url.hostname - lighttpd_port = parsed_url.port or 80 - service_check_tags = ['host:%s' % lighttpd_url, 'port:%s' % lighttpd_port] - try: - r = requests.get(url, auth=auth, headers=headers(self.agentConfig)) - r.raise_for_status() - except Exception: - self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, - tags=service_check_tags) - raise - else: - self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.OK, - tags=service_check_tags) - - headers_resp = r.headers - server_version = self._get_server_version(headers_resp) - response = r.content - - metric_count = 0 - # Loop through and extract the numerical values - for line in response.split('\n'): - values = line.split(': ') - if len(values) == 2: # match - metric, value = values - try: - value = float(value) - except ValueError: - continue - - # Special case: kBytes => bytes - if metric == 'Total kBytes': - value = value * 1024 - - # Send metric as a gauge, if applicable - if metric in self.GAUGES: - metric_count += 1 - metric_name = self.GAUGES[metric] - self.gauge(metric_name, value, tags=tags) - - # Send metric as a rate, if applicable - if metric in self.RATES: - metric_count += 1 - metric_name = self.RATES[metric] - self.rate(metric_name, value, tags=tags) - - # Send metric as a counter, if applicable - if metric in self.COUNTERS: - metric_count += 1 - metric_name = self.COUNTERS[metric] - self.increment(metric_name, value, tags=tags) - - if metric_count == 0: - url_suffix = self.URL_SUFFIX_PER_VERSION[server_version] - if self.assumed_url.get(instance['lighttpd_status_url']) is None and url[-len(url_suffix):] != url_suffix: - self.assumed_url[instance['lighttpd_status_url']] = '%s%s' % (url, url_suffix) - self.warning("Assuming url was not correct. Trying to add %s suffix to the url" % url_suffix) - self.check(instance) - else: - raise Exception("No metrics were fetched for this instance. Make sure " - "that %s is the proper url." 
% instance['lighttpd_status_url']) - - def _get_server_version(self, headers): - server_version = headers.get("server", "") - - match = VERSION_REGEX.match(server_version) - if match is None: - self.log.debug("Lighttpd server version is Unknown") - return "Unknown" - - version = int(match.group(1)) - self.log.debug("Lighttpd server version is %s" % version) - return version diff --git a/py/checks/marathon.py b/py/checks/marathon.py deleted file mode 100644 index f4900d752ef5..000000000000 --- a/py/checks/marathon.py +++ /dev/null @@ -1,81 +0,0 @@ -# stdlib -from urlparse import urljoin - -# 3rd party -import requests - -# project -from checks import AgentCheck - - -class Marathon(AgentCheck): - - DEFAULT_TIMEOUT = 5 - SERVICE_CHECK_NAME = 'marathon.can_connect' - - APP_METRICS = [ - 'backoffFactor', - 'backoffSeconds', - 'cpus', - 'disk', - 'instances', - 'mem', - 'taskRateLimit', - 'tasksRunning', - 'tasksStaged' - ] - - def check(self, instance): - if 'url' not in instance: - raise Exception('Marathon instance missing "url" value.') - - # Load values from the instance config - url = instance['url'] - user = instance.get('user') - password = instance.get('password') - if user is not None and password is not None: - auth = (user,password) - else: - auth = None - instance_tags = instance.get('tags', []) - default_timeout = self.init_config.get('default_timeout', self.DEFAULT_TIMEOUT) - timeout = float(instance.get('timeout', default_timeout)) - - response = self.get_json(urljoin(url, "/v2/apps"), timeout, auth) - if response is not None: - self.gauge('marathon.apps', len(response['apps']), tags=instance_tags) - for app in response['apps']: - tags = ['app_id:' + app['id'], 'version:' + app['version']] + instance_tags - for attr in self.APP_METRICS: - if attr in app: - self.gauge('marathon.' + attr, app[attr], tags=tags) - - query_url = urljoin(url, "/v2/apps/{0}/versions".format(app['id'])) - versions_reply = self.get_json(query_url, timeout, auth) - - if versions_reply is not None: - self.gauge('marathon.versions', len(versions_reply['versions']), tags=tags) - - def get_json(self, url, timeout, auth): - try: - r = requests.get(url, timeout=timeout, auth=auth) - r.raise_for_status() - except requests.exceptions.Timeout: - # If there's a timeout - self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, - message='%s timed out after %s seconds.' 
% (url, timeout), - tags = ["url:{0}".format(url)]) - raise Exception("Timeout when hitting %s" % url) - - except requests.exceptions.HTTPError: - self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, - message='%s returned a status of %s' % (url, r.status_code), - tags = ["url:{0}".format(url)]) - raise Exception("Got %s when hitting %s" % (r.status_code, url)) - - else: - self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.OK, - tags = ["url:{0}".format(url)] - ) - - return r.json() diff --git a/py/checks/mcache.py b/py/checks/mcache.py deleted file mode 100644 index 91dc4f284802..000000000000 --- a/py/checks/mcache.py +++ /dev/null @@ -1,201 +0,0 @@ -# 3rd party -import memcache - -# project -from checks import AgentCheck - -# Ref: http://code.sixapart.com/svn/memcached/trunk/server/doc/protocol.txt -# Name Type Meaning -# ---------------------------------- -# pid 32u Process id of this server process -# uptime 32u Number of seconds this server has been running -# time 32u current UNIX time according to the server -# version string Version string of this server -# pointer_size 32 Default size of pointers on the host OS -# (generally 32 or 64) -# rusage_user 32u:32u Accumulated user time for this process -# (seconds:microseconds) -# rusage_system 32u:32u Accumulated system time for this process -# (seconds:microseconds) -# curr_items 32u Current number of items stored by the server -# total_items 32u Total number of items stored by this server -# ever since it started -# bytes 64u Current number of bytes used by this server -# to store items -# curr_connections 32u Number of open connections -# total_connections 32u Total number of connections opened since -# the server started running -# connection_structures 32u Number of connection structures allocated -# by the server -# cmd_get 64u Cumulative number of retrieval requests -# cmd_set 64u Cumulative number of storage requests -# get_hits 64u Number of keys that have been requested and -# found present -# get_misses 64u Number of items that have been requested -# and not found -# delete_misses 64u Number of deletions reqs for missing keys -# delete_hits 64u Number of deletion reqs resulting in -# an item being removed. -# evictions 64u Number of valid items removed from cache -# to free memory for new items -# bytes_read 64u Total number of bytes read by this server -# from network -# bytes_written 64u Total number of bytes sent by this server to -# network -# limit_maxbytes 32u Number of bytes this server is allowed to -# use for storage. -# threads 32u Number of worker threads requested. 
-# (see doc/threads.txt) -# listen_disabled_num 32u How many times the server has reached maxconns -# (see https://code.google.com/p/memcached/wiki/Timeouts) -# >>> mc.get_stats() -# [('127.0.0.1:11211 (1)', {'pid': '2301', 'total_items': '2', -# 'uptime': '80', 'listen_disabled_num': '0', 'version': '1.2.8', -# 'limit_maxbytes': '67108864', 'rusage_user': '0.002532', -# 'bytes_read': '51', 'accepting_conns': '1', 'rusage_system': -# '0.007445', 'cmd_get': '0', 'curr_connections': '4', 'threads': '2', -# 'total_connections': '5', 'cmd_set': '2', 'curr_items': '0', -# 'get_misses': '0', 'cmd_flush': '0', 'evictions': '0', 'bytes': '0', -# 'connection_structures': '5', 'bytes_written': '25', 'time': -# '1306364220', 'pointer_size': '64', 'get_hits': '0'})] - -# For Membase it gets worse -# http://www.couchbase.org/wiki/display/membase/Membase+Statistics -# https://github.com/membase/ep-engine/blob/master/docs/stats.org - - -class Memcache(AgentCheck): - - SOURCE_TYPE_NAME = 'memcached' - - DEFAULT_PORT = 11211 - - GAUGES = [ - "total_items", - "curr_items", - "limit_maxbytes", - "uptime", - "bytes", - "curr_connections", - "connection_structures", - "threads", - "pointer_size" - ] - - RATES = [ - "rusage_user", - "rusage_system", - "cmd_get", - "cmd_set", - "cmd_flush", - "get_hits", - "get_misses", - "delete_misses", - "delete_hits", - "evictions", - "bytes_read", - "bytes_written", - "cas_misses", - "cas_hits", - "cas_badval", - "total_connections", - "listen_disabled_num" - ] - - SERVICE_CHECK = 'memcache.can_connect' - - def get_library_versions(self): - return {"memcache": memcache.__version__} - - def _get_metrics(self, server, port, tags): - mc = None # client - service_check_tags = ["host:%s" % server, "port:%s" % port] - try: - self.log.debug("Connecting to %s:%s tags:%s", server, port, tags) - mc = memcache.Client(["%s:%s" % (server, port)]) - raw_stats = mc.get_stats() - - assert len(raw_stats) == 1 and len(raw_stats[0]) == 2,\ - "Malformed response: %s" % raw_stats - - - # Access the dict - stats = raw_stats[0][1] - for metric in stats: - # Check if metric is a gauge or rate - if metric in self.GAUGES: - our_metric = self.normalize(metric.lower(), 'memcache') - self.gauge(our_metric, float(stats[metric]), tags=tags) - - # Tweak the name if it's a rate so that we don't use the exact - # same metric name as the memcache documentation - if metric in self.RATES: - our_metric = self.normalize( - "{0}_rate".format(metric.lower()), 'memcache') - self.rate(our_metric, float(stats[metric]), tags=tags) - - # calculate some metrics based on other metrics. - # stats should be present, but wrap in try/except - # and log an exception just in case. 
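# A minimal sketch (illustrative only; the numbers are hypothetical) of the
# derived metrics computed in the try/except blocks below, using a stats dict
# as returned by memcache.Client.get_stats() (values arrive as strings):
#
#   stats = {'get_hits': '90', 'cmd_get': '100', 'bytes': '1024',
#            'limit_maxbytes': '67108864', 'curr_items': '4'}
#   hit_pct  = 100.0 * float(stats['get_hits']) / float(stats['cmd_get'])        # 90.0
#   fill_pct = 100.0 * float(stats['bytes']) / float(stats['limit_maxbytes'])
#   avg_size = float(stats['bytes']) / float(stats['curr_items'])                # 256.0
#
# Each division is guarded with ZeroDivisionError handling because a freshly
# started server can legitimately report 0 for any of the denominators.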
- try: - self.gauge( - "memcache.get_hit_percent", - 100.0 * float(stats["get_hits"]) / float(stats["cmd_get"]), - tags=tags, - ) - except ZeroDivisionError: - pass - - try: - self.gauge( - "memcache.fill_percent", - 100.0 * float(stats["bytes"]) / float(stats["limit_maxbytes"]), - tags=tags, - ) - except ZeroDivisionError: - pass - - try: - self.gauge( - "memcache.avg_item_size", - float(stats["bytes"]) / float(stats["curr_items"]), - tags=tags, - ) - except ZeroDivisionError: - pass - - uptime = stats.get("uptime", 0) - self.service_check( - self.SERVICE_CHECK, AgentCheck.OK, - tags=service_check_tags, - message="Server has been up for %s seconds" % uptime) - except AssertionError: - self.service_check( - self.SERVICE_CHECK, AgentCheck.CRITICAL, - tags=service_check_tags, - message="Unable to fetch stats from server") - raise Exception( - "Unable to retrieve stats from memcache instance: {0}:{1}." - "Please check your configuration".format(server, port)) - - if mc is not None: - mc.disconnect_all() - self.log.debug("Disconnected from memcached") - del mc - - def check(self, instance): - socket = instance.get('socket') - server = instance.get('url') - if not server and not socket: - raise Exception('Either "url" or "socket" must be configured') - - if socket: - server = 'unix' - port = socket - else: - port = int(instance.get('port', self.DEFAULT_PORT)) - custom_tags = instance.get('tags') or [] - - tags = ["url:{0}:{1}".format(server, port)] + custom_tags - - self._get_metrics(server, port, tags) diff --git a/py/checks/mesos.py b/py/checks/mesos.py deleted file mode 100644 index 14784e2ba749..000000000000 --- a/py/checks/mesos.py +++ /dev/null @@ -1,127 +0,0 @@ -# stdlib -from hashlib import md5 -import time - -# 3rd party -import requests - -# project -from checks import AgentCheck - - -class Mesos(AgentCheck): - SERVICE_CHECK_NAME = "mesos.can_connect" - - def check(self, instance): - """ - DEPRECATED: - This generic Mesosphere check is deprecated not actively developed anymore. It will be - removed in a future version of the Datadog Agent. - Please head over to the Mesosphere master and slave specific checks. - """ - self.warning("This check is deprecated in favor of Mesos master and slave specific checks." - " It will be removed in a future version of the Datadog Agent.") - - if 'url' not in instance: - raise Exception('Mesos instance missing "url" value.') - - # Load values from the instance config - url = instance['url'] - instance_tags = instance.get('tags', []) - default_timeout = self.init_config.get('default_timeout', 5) - timeout = float(instance.get('timeout', default_timeout)) - - response = self.get_master_roles(url, timeout) - if response is not None: - for role in response['roles']: - tags = ['role:' + role['name']] + instance_tags - self.gauge('mesos.role.frameworks', len(role['frameworks']), tags=tags) - self.gauge('mesos.role.weight', role['weight'], tags=tags) - resources = role['resources'] - for attr in ['cpus','mem']: - if attr in resources: - self.gauge('mesos.role.' + attr, resources[attr], tags=tags) - - response = self.get_master_stats(url, timeout) - if response is not None: - tags = instance_tags - for key in iter(response): - self.gauge('mesos.stats.' + key, response[key], tags=tags) - - response = self.get_master_state(url, timeout) - if response is not None: - tags = instance_tags - for attr in ['deactivated_slaves','failed_tasks','finished_tasks','killed_tasks','lost_tasks','staged_tasks','started_tasks']: - self.gauge('mesos.state.' 
+ attr, response[attr], tags=tags) - - for framework in response['frameworks']: - tags = ['framework:' + framework['id']] + instance_tags - resources = framework['resources'] - for attr in ['cpus','mem']: - if attr in resources: - self.gauge('mesos.state.framework.' + attr, resources[attr], tags=tags) - - for slave in response['slaves']: - tags = ['mesos','slave:' + slave['id']] + instance_tags - resources = slave['resources'] - for attr in ['cpus','mem','disk']: - if attr in resources: - self.gauge('mesos.state.slave.' + attr, resources[attr], tags=tags) - - def get_master_roles(self, url, timeout): - return self.get_json(url + "/master/roles.json", timeout) - - def get_master_stats(self, url, timeout): - return self.get_json(url + "/master/stats.json", timeout) - - def get_master_state(self, url, timeout): - return self.get_json(url + "/master/state.json", timeout) - - def get_json(self, url, timeout): - # Use a hash of the URL as an aggregation key - aggregation_key = md5(url).hexdigest() - tags = ["url:%s" % url] - msg = None - status = None - try: - r = requests.get(url, timeout=timeout) - if r.status_code != 200: - self.status_code_event(url, r, aggregation_key) - status = AgentCheck.CRITICAL - msg = "Got %s when hitting %s" % (r.status_code, url) - else: - status = AgentCheck.OK - msg = "Mesos master instance detected at %s " % url - except requests.exceptions.Timeout as e: - # If there's a timeout - self.timeout_event(url, timeout, aggregation_key) - msg = "%s seconds timeout when hitting %s" % (timeout, url) - status = AgentCheck.CRITICAL - except Exception as e: - msg = str(e) - status = AgentCheck.CRITICAL - finally: - self.service_check(self.SERVICE_CHECK_NAME, status, tags=tags, message=msg) - if status is AgentCheck.CRITICAL: - self.warning(msg) - return None - - return r.json() - - def timeout_event(self, url, timeout, aggregation_key): - self.event({ - 'timestamp': int(time.time()), - 'event_type': 'http_check', - 'msg_title': 'URL timeout', - 'msg_text': '%s timed out after %s seconds.' % (url, timeout), - 'aggregation_key': aggregation_key - }) - - def status_code_event(self, url, r, aggregation_key): - self.event({ - 'timestamp': int(time.time()), - 'event_type': 'http_check', - 'msg_title': 'Invalid reponse code for %s' % url, - 'msg_text': '%s returned a status of %s' % (url, r.status_code), - 'aggregation_key': aggregation_key - }) diff --git a/py/checks/mesos_master.py b/py/checks/mesos_master.py deleted file mode 100644 index ca01d8ccdad2..000000000000 --- a/py/checks/mesos_master.py +++ /dev/null @@ -1,233 +0,0 @@ -"""Mesos Master check - -Collects metrics from mesos master node, only the leader is sending metrics. 
-""" -# stdlib -from hashlib import md5 - -# 3rd party -import requests - -# project -from checks import AgentCheck, CheckException - - -class MesosMaster(AgentCheck): - GAUGE = AgentCheck.gauge - MONOTONIC_COUNT = AgentCheck.monotonic_count - SERVICE_CHECK_NAME = "mesos_master.can_connect" - service_check_needed = True - - - FRAMEWORK_METRICS = { - 'cpus' : ('mesos.framework.cpu', GAUGE), - 'mem' : ('mesos.framework.mem', GAUGE), - 'disk' : ('mesos.framework.disk', GAUGE), - } - - ROLE_RESOURCES_METRICS = { - 'cpus' : ('mesos.role.cpu', GAUGE), - 'mem' : ('mesos.role.mem', GAUGE), - 'disk' : ('mesos.role.disk', GAUGE), - } - - # These metrics are aggregated only on the elected master - CLUSTER_TASKS_METRICS = { - 'master/tasks_error' : ('mesos.cluster.tasks_error', GAUGE), - 'master/tasks_failed' : ('mesos.cluster.tasks_failed', MONOTONIC_COUNT), - 'master/tasks_finished' : ('mesos.cluster.tasks_finished', MONOTONIC_COUNT), - 'master/tasks_killed' : ('mesos.cluster.tasks_killed', MONOTONIC_COUNT), - 'master/tasks_lost' : ('mesos.cluster.tasks_lost', MONOTONIC_COUNT), - 'master/tasks_running' : ('mesos.cluster.tasks_running', GAUGE), - 'master/tasks_staging' : ('mesos.cluster.tasks_staging', GAUGE), - 'master/tasks_starting' : ('mesos.cluster.tasks_starting', GAUGE), - } - - # These metrics are aggregated only on the elected master - CLUSTER_SLAVES_METRICS = { - 'master/slave_registrations' : ('mesos.cluster.slave_registrations', GAUGE), - 'master/slave_removals' : ('mesos.cluster.slave_removals', GAUGE), - 'master/slave_reregistrations' : ('mesos.cluster.slave_reregistrations', GAUGE), - 'master/slave_shutdowns_canceled' : ('mesos.cluster.slave_shutdowns_canceled', GAUGE), - 'master/slave_shutdowns_scheduled' : ('mesos.cluster.slave_shutdowns_scheduled', GAUGE), - 'master/slaves_active' : ('mesos.cluster.slaves_active', GAUGE), - 'master/slaves_connected' : ('mesos.cluster.slaves_connected', GAUGE), - 'master/slaves_disconnected' : ('mesos.cluster.slaves_disconnected', GAUGE), - 'master/slaves_inactive' : ('mesos.cluster.slaves_inactive', GAUGE), - 'master/recovery_slave_removals' : ('mesos.cluster.recovery_slave_removals', GAUGE), - } - - # These metrics are aggregated only on the elected master - CLUSTER_RESOURCES_METRICS = { - 'master/cpus_percent' : ('mesos.cluster.cpus_percent', GAUGE), - 'master/cpus_total' : ('mesos.cluster.cpus_total', GAUGE), - 'master/cpus_used' : ('mesos.cluster.cpus_used', GAUGE), - 'master/disk_percent' : ('mesos.cluster.disk_percent', GAUGE), - 'master/disk_total' : ('mesos.cluster.disk_total', GAUGE), - 'master/disk_used' : ('mesos.cluster.disk_used', GAUGE), - 'master/mem_percent' : ('mesos.cluster.mem_percent', GAUGE), - 'master/mem_total' : ('mesos.cluster.mem_total', GAUGE), - 'master/mem_used' : ('mesos.cluster.mem_used', GAUGE), - } - - # These metrics are aggregated only on the elected master - CLUSTER_REGISTRAR_METRICS = { - 'registrar/queued_operations' : ('mesos.registrar.queued_operations', GAUGE), - 'registrar/registry_size_bytes' : ('mesos.registrar.registry_size_bytes', GAUGE), - 'registrar/state_fetch_ms' : ('mesos.registrar.state_fetch_ms', GAUGE), - 'registrar/state_store_ms' : ('mesos.registrar.state_store_ms', GAUGE), - 'registrar/state_store_ms/count' : ('mesos.registrar.state_store_ms.count', GAUGE), - 'registrar/state_store_ms/max' : ('mesos.registrar.state_store_ms.max', GAUGE), - 'registrar/state_store_ms/min' : ('mesos.registrar.state_store_ms.min', GAUGE), - 'registrar/state_store_ms/p50' : ('mesos.registrar.state_store_ms.p50', 
GAUGE), - 'registrar/state_store_ms/p90' : ('mesos.registrar.state_store_ms.p90', GAUGE), - 'registrar/state_store_ms/p95' : ('mesos.registrar.state_store_ms.p95', GAUGE), - 'registrar/state_store_ms/p99' : ('mesos.registrar.state_store_ms.p99', GAUGE), - 'registrar/state_store_ms/p999' : ('mesos.registrar.state_store_ms.p999', GAUGE), - 'registrar/state_store_ms/p9999' : ('mesos.registrar.state_store_ms.p9999', GAUGE), - } - - # These metrics are aggregated only on the elected master - CLUSTER_FRAMEWORK_METRICS = { - 'master/frameworks_active' : ('mesos.cluster.frameworks_active', GAUGE), - 'master/frameworks_connected' : ('mesos.cluster.frameworks_connected', GAUGE), - 'master/frameworks_disconnected' : ('mesos.cluster.frameworks_disconnected', GAUGE), - 'master/frameworks_inactive' : ('mesos.cluster.frameworks_inactive', GAUGE), - } - - # These metrics are aggregated on all nodes in the cluster - SYSTEM_METRICS = { - 'system/cpus_total' : ('mesos.stats.system.cpus_total', GAUGE), - 'system/load_15min' : ('mesos.stats.system.load_15min', GAUGE), - 'system/load_1min' : ('mesos.stats.system.load_1min', GAUGE), - 'system/load_5min' : ('mesos.stats.system.load_5min', GAUGE), - 'system/mem_free_bytes' : ('mesos.stats.system.mem_free_bytes', GAUGE), - 'system/mem_total_bytes' : ('mesos.stats.system.mem_total_bytes', GAUGE), - 'master/elected' : ('mesos.stats.elected', GAUGE), - 'master/uptime_secs' : ('mesos.stats.uptime_secs', GAUGE), - } - - # These metrics are aggregated only on the elected master - STATS_METRICS = { - 'master/dropped_messages' : ('mesos.cluster.dropped_messages', GAUGE), - 'master/outstanding_offers' : ('mesos.cluster.outstanding_offers', GAUGE), - 'master/event_queue_dispatches' : ('mesos.cluster.event_queue_dispatches', GAUGE), - 'master/event_queue_http_requests' : ('mesos.cluster.event_queue_http_requests', GAUGE), - 'master/event_queue_messages' : ('mesos.cluster.event_queue_messages', GAUGE), - 'master/invalid_framework_to_executor_messages' : ('mesos.cluster.invalid_framework_to_executor_messages', GAUGE), - 'master/invalid_status_update_acknowledgements' : ('mesos.cluster.invalid_status_update_acknowledgements', GAUGE), - 'master/invalid_status_updates' : ('mesos.cluster.invalid_status_updates', GAUGE), - 'master/valid_framework_to_executor_messages' : ('mesos.cluster.valid_framework_to_executor_messages', GAUGE), - 'master/valid_status_update_acknowledgements' : ('mesos.cluster.valid_status_update_acknowledgements', GAUGE), - 'master/valid_status_updates' : ('mesos.cluster.valid_status_updates', GAUGE), - } - - def _get_json(self, url, timeout): - # Use a hash of the URL as an aggregation key - aggregation_key = md5(url).hexdigest() - tags = ["url:%s" % url] - msg = None - status = None - try: - r = requests.get(url, timeout=timeout) - if r.status_code != 200: - status = AgentCheck.CRITICAL - msg = "Got %s when hitting %s" % (r.status_code, url) - else: - status = AgentCheck.OK - msg = "Mesos master instance detected at %s " % url - except requests.exceptions.Timeout as e: - # If there's a timeout - msg = "%s seconds timeout when hitting %s" % (timeout, url) - status = AgentCheck.CRITICAL - except Exception as e: - msg = str(e) - status = AgentCheck.CRITICAL - finally: - if self.service_check_needed: - self.service_check(self.SERVICE_CHECK_NAME, status, tags=tags, - message=msg) - self.service_check_needed = False - if status is AgentCheck.CRITICAL: - self.service_check(self.SERVICE_CHECK_NAME, status, tags=tags, - message=msg) - raise CheckException("Cannot 
connect to mesos, please check your configuration.") - - return r.json() - - def _get_master_state(self, url, timeout): - return self._get_json(url + '/state.json', timeout) - - def _get_master_stats(self, url, timeout): - if self.version >= [0, 22, 0]: - endpoint = '/metrics/snapshot' - else: - endpoint = '/stats.json' - return self._get_json(url + endpoint, timeout) - - def _get_master_roles(self, url, timeout): - return self._get_json(url + '/roles.json', timeout) - - def _check_leadership(self, url, timeout): - state_metrics = self._get_master_state(url, timeout) - self.leader = False - - if state_metrics is not None: - self.version = map(int, state_metrics['version'].split('.')) - if state_metrics['leader'] == state_metrics['pid']: - self.leader = True - - return state_metrics - - def check(self, instance): - if 'url' not in instance: - raise Exception('Mesos instance missing "url" value.') - - url = instance['url'] - instance_tags = instance.get('tags', []) - default_timeout = self.init_config.get('default_timeout', 5) - timeout = float(instance.get('timeout', default_timeout)) - - state_metrics = self._check_leadership(url, timeout) - if state_metrics: - tags = [ - 'mesos_pid:{0}'.format(state_metrics['pid']), - 'mesos_node:master', - ] - if 'cluster' in state_metrics: - tags.append('mesos_cluster:{0}'.format(state_metrics['cluster'])) - - tags += instance_tags - - if self.leader: - self.GAUGE('mesos.cluster.total_frameworks', len(state_metrics['frameworks']), tags=tags) - - for framework in state_metrics['frameworks']: - framework_tags = ['framework_name:' + framework['name']] + tags - self.GAUGE('mesos.framework.total_tasks', len(framework['tasks']), tags=framework_tags) - resources = framework['used_resources'] - for key_name, (metric_name, metric_func) in self.FRAMEWORK_METRICS.iteritems(): - metric_func(self, metric_name, resources[key_name], tags=framework_tags) - - role_metrics = self._get_master_roles(url, timeout) - if role_metrics is not None: - for role in role_metrics['roles']: - role_tags = ['mesos_role:' + role['name']] + tags - self.GAUGE('mesos.role.frameworks.count', len(role['frameworks']), tags=role_tags) - self.GAUGE('mesos.role.weight', role['weight'], tags=role_tags) - for key_name, (metric_name, metric_func) in self.ROLE_RESOURCES_METRICS.iteritems(): - metric_func(self, metric_name, role['resources'][key_name], tags=role_tags) - - stats_metrics = self._get_master_stats(url, timeout) - if stats_metrics is not None: - metrics = [self.SYSTEM_METRICS] - if self.leader: - metrics += [self.CLUSTER_TASKS_METRICS, self.CLUSTER_SLAVES_METRICS, - self.CLUSTER_RESOURCES_METRICS, self.CLUSTER_REGISTRAR_METRICS, - self.CLUSTER_FRAMEWORK_METRICS, self.STATS_METRICS] - for m in metrics: - for key_name, (metric_name, metric_func) in m.iteritems(): - metric_func(self, metric_name, stats_metrics[key_name], tags=tags) - - - self.service_check_needed = True diff --git a/py/checks/mesos_slave.py b/py/checks/mesos_slave.py deleted file mode 100644 index 9eb8de5a19a8..000000000000 --- a/py/checks/mesos_slave.py +++ /dev/null @@ -1,187 +0,0 @@ -"""Mesos Slave check - -Collects metrics from mesos slave node. 
-""" -# stdlib -from hashlib import md5 - -# 3rd party -import requests - -# project -from checks import AgentCheck, CheckException - - -class MesosSlave(AgentCheck): - GAUGE = AgentCheck.gauge - MONOTONIC_COUNT = AgentCheck.monotonic_count - SERVICE_CHECK_NAME = "mesos_slave.can_connect" - service_check_needed = True - - TASK_STATUS = { - 'TASK_STARTING' : AgentCheck.OK, - 'TASK_RUNNING' : AgentCheck.OK, - 'TASK_FINISHED' : AgentCheck.OK, - 'TASK_FAILED' : AgentCheck.CRITICAL, - 'TASK_KILLED' : AgentCheck.WARNING, - 'TASK_LOST' : AgentCheck.CRITICAL, - 'TASK_STAGING' : AgentCheck.OK, - 'TASK_ERROR' : AgentCheck.CRITICAL, - } - - TASK_METRICS = { - 'cpus' : ('mesos.state.task.cpu', GAUGE), - 'mem' : ('mesos.state.task.mem', GAUGE), - 'disk' : ('mesos.state.task.disk', GAUGE), - } - - SLAVE_TASKS_METRICS = { - 'slave/tasks_failed' : ('mesos.slave.tasks_failed', MONOTONIC_COUNT), - 'slave/tasks_finished' : ('mesos.slave.tasks_finished', MONOTONIC_COUNT), - 'slave/tasks_killed' : ('mesos.slave.tasks_killed', MONOTONIC_COUNT), - 'slave/tasks_lost' : ('mesos.slave.tasks_lost', MONOTONIC_COUNT), - 'slave/tasks_running' : ('mesos.slave.tasks_running', GAUGE), - 'slave/tasks_staging' : ('mesos.slave.tasks_staging', GAUGE), - 'slave/tasks_starting' : ('mesos.slave.tasks_starting', GAUGE), - } - - SYSTEM_METRICS = { - 'system/cpus_total' : ('mesos.stats.system.cpus_total', GAUGE), - 'system/load_15min' : ('mesos.stats.system.load_15min', GAUGE), - 'system/load_1min' : ('mesos.stats.system.load_1min', GAUGE), - 'system/load_5min' : ('mesos.stats.system.load_5min', GAUGE), - 'system/mem_free_bytes' : ('mesos.stats.system.mem_free_bytes', GAUGE), - 'system/mem_total_bytes' : ('mesos.stats.system.mem_total_bytes', GAUGE), - 'slave/registered' : ('mesos.stats.registered', GAUGE), - 'slave/uptime_secs' : ('mesos.stats.uptime_secs', GAUGE), - } - - SLAVE_RESOURCE_METRICS = { - 'slave/cpus_percent' : ('mesos.slave.cpus_percent', GAUGE), - 'slave/cpus_total' : ('mesos.slave.cpus_total', GAUGE), - 'slave/cpus_used' : ('mesos.slave.cpus_used', GAUGE), - 'slave/disk_percent' : ('mesos.slave.disk_percent', GAUGE), - 'slave/disk_total' : ('mesos.slave.disk_total', GAUGE), - 'slave/disk_used' : ('mesos.slave.disk_used', GAUGE), - 'slave/mem_percent' : ('mesos.slave.mem_percent', GAUGE), - 'slave/mem_total' : ('mesos.slave.mem_total', GAUGE), - 'slave/mem_used' : ('mesos.slave.mem_used', GAUGE), - } - - SLAVE_EXECUTORS_METRICS = { - 'slave/executors_registering' : ('mesos.slave.executors_registering', GAUGE), - 'slave/executors_running' : ('mesos.slave.executors_running', GAUGE), - 'slave/executors_terminated' : ('mesos.slave.executors_terminated', GAUGE), - 'slave/executors_terminating' : ('mesos.slave.executors_terminating', GAUGE), - } - - STATS_METRICS = { - 'slave/frameworks_active' : ('mesos.slave.frameworks_active', GAUGE), - 'slave/invalid_framework_messages' : ('mesos.slave.invalid_framework_messages', GAUGE), - 'slave/invalid_status_updates' : ('mesos.slave.invalid_status_updates', GAUGE), - 'slave/recovery_errors' : ('mesos.slave.recovery_errors', GAUGE), - 'slave/valid_framework_messages' : ('mesos.slave.valid_framework_messages', GAUGE), - 'slave/valid_status_updates' : ('mesos.slave.valid_status_updates', GAUGE), - } - - def __init__(self, name, init_config, agentConfig, instances=None): - AgentCheck.__init__(self, name, init_config, agentConfig, instances) - self.cluster_name = None - - def _get_json(self, url, timeout): - # Use a hash of the URL as an aggregation key - aggregation_key = 
md5(url).hexdigest() - tags = ["url:%s" % url] - msg = None - status = None - try: - r = requests.get(url, timeout=timeout) - if r.status_code != 200: - status = AgentCheck.CRITICAL - msg = "Got %s when hitting %s" % (r.status_code, url) - else: - status = AgentCheck.OK - msg = "Mesos master instance detected at %s " % url - except requests.exceptions.Timeout as e: - # If there's a timeout - msg = "%s seconds timeout when hitting %s" % (timeout, url) - status = AgentCheck.CRITICAL - except Exception as e: - msg = str(e) - status = AgentCheck.CRITICAL - finally: - if self.service_check_needed: - self.service_check(self.SERVICE_CHECK_NAME, status, tags=tags, message=msg) - self.service_check_needed = False - if status is AgentCheck.CRITICAL: - raise CheckException("Cannot connect to mesos, please check your configuration.") - - return r.json() - - def _get_state(self, url, timeout): - return self._get_json(url + '/state.json', timeout) - - def _get_stats(self, url, timeout): - if self.version >= [0, 22, 0]: - endpoint = '/metrics/snapshot' - else: - endpoint = '/stats.json' - return self._get_json(url + endpoint, timeout) - - def _get_constant_attributes(self, url, timeout): - state_metrics = None - if self.cluster_name is None: - state_metrics = self._get_state(url, timeout) - if state_metrics is not None: - self.version = map(int, state_metrics['version'].split('.')) - master_state = self._get_state('http://' + state_metrics['master_hostname'] + ':5050', timeout) - if master_state is not None: - self.cluster_name = master_state.get('cluster') - - return state_metrics - - def check(self, instance): - if 'url' not in instance: - raise Exception('Mesos instance missing "url" value.') - - url = instance['url'] - instance_tags = instance.get('tags', []) - tasks = instance.get('tasks', []) - default_timeout = self.init_config.get('default_timeout', 5) - timeout = float(instance.get('timeout', default_timeout)) - - state_metrics = self._get_constant_attributes(url, timeout) - tags = None - - if state_metrics is None: - state_metrics = self._get_state(url, timeout) - if state_metrics: - tags = [ - 'mesos_pid:{0}'.format(state_metrics['pid']), - 'mesos_node:slave', - ] - if self.cluster_name: - tags.append('mesos_cluster:{0}'.format(self.cluster_name)) - - tags += instance_tags - - for task in tasks: - for framework in state_metrics['frameworks']: - for executor in framework['executors']: - for t in executor['tasks']: - if task.lower() in t['name'].lower() and t['slave_id'] == state_metrics['id']: - task_tags = ['task_name:' + t['name']] + tags - self.service_check(t['name'] + '.ok', self.TASK_STATUS[t['state']], tags=task_tags) - for key_name, (metric_name, metric_func) in self.TASK_METRICS.iteritems(): - metric_func(self, metric_name, t['resources'][key_name], tags=task_tags) - - stats_metrics = self._get_stats(url, timeout) - if stats_metrics: - tags = tags if tags else instance_tags - metrics = [self.SLAVE_TASKS_METRICS, self.SYSTEM_METRICS, self.SLAVE_RESOURCE_METRICS, - self.SLAVE_EXECUTORS_METRICS, self.STATS_METRICS] - for m in metrics: - for key_name, (metric_name, metric_func) in m.iteritems(): - metric_func(self, metric_name, stats_metrics[key_name], tags=tags) - - self.service_check_needed = True diff --git a/py/checks/mongo.py b/py/checks/mongo.py deleted file mode 100644 index c222db68e324..000000000000 --- a/py/checks/mongo.py +++ /dev/null @@ -1,450 +0,0 @@ -# stdlib -import time - -# 3p -import pymongo - -# project -from checks import AgentCheck -from config import _is_affirmative 
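# A minimal sketch (illustrative only; _dig is a hypothetical helper, not part
# of this check) of how the dotted metric names in the tables below, e.g.
# "connections.current", are resolved against the nested serverStatus document
# before being submitted with the recorded type (GAUGE or RATE):
#
#   def _dig(doc, dotted):
#       value = doc
#       for part in dotted.split('.'):
#           value = value[part]
#       return value
#
#   status = {'connections': {'current': 42, 'available': 800}}
#   _dig(status, 'connections.current')   # -> 42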
-from util import get_hostname - -DEFAULT_TIMEOUT = 30 -GAUGE = AgentCheck.gauge -RATE = AgentCheck.rate - - -class MongoDb(AgentCheck): - SERVICE_CHECK_NAME = 'mongodb.can_connect' - SOURCE_TYPE_NAME = 'mongodb' - - COMMON_METRICS = { - "asserts.msg": RATE, - "asserts.regular": RATE, - "asserts.rollovers": RATE, - "asserts.user": RATE, - "asserts.warning": RATE, - "connections.available": GAUGE, - "connections.current": GAUGE, - "connections.totalCreated": GAUGE, - "cursors.timedOut": GAUGE, - "cursors.totalOpen": GAUGE, - "extra_info.heap_usage_bytes": RATE, - "extra_info.page_faults": RATE, - "globalLock.activeClients.readers": GAUGE, - "globalLock.activeClients.total": GAUGE, - "globalLock.activeClients.writers": GAUGE, - "globalLock.currentQueue.readers": GAUGE, - "globalLock.currentQueue.total": GAUGE, - "globalLock.currentQueue.writers": GAUGE, - "globalLock.totalTime": GAUGE, - "mem.bits": GAUGE, - "mem.mapped": GAUGE, - "mem.mappedWithJournal": GAUGE, - "mem.resident": GAUGE, - "mem.virtual": GAUGE, - "metrics.document.deleted": RATE, - "metrics.document.inserted": RATE, - "metrics.document.returned": RATE, - "metrics.document.updated": RATE, - "metrics.getLastError.wtime.num": RATE, - "metrics.getLastError.wtime.totalMillis": RATE, - "metrics.getLastError.wtimeouts": RATE, - "metrics.operation.fastmod": RATE, - "metrics.operation.idhack": RATE, - "metrics.operation.scanAndOrder": RATE, - "metrics.queryExecutor.scanned": RATE, - "metrics.record.moves": RATE, - "metrics.repl.apply.batches.num": RATE, - "metrics.repl.apply.batches.totalMillis": RATE, - "metrics.repl.apply.ops": RATE, - "metrics.repl.buffer.count": GAUGE, - "metrics.repl.buffer.maxSizeBytes": GAUGE, - "metrics.repl.buffer.sizeBytes": GAUGE, - "metrics.repl.network.bytes": RATE, - "metrics.repl.network.getmores.num": RATE, - "metrics.repl.network.getmores.totalMillis": RATE, - "metrics.repl.network.ops": RATE, - "metrics.repl.network.readersCreated": RATE, - "metrics.repl.oplog.insert.num": RATE, - "metrics.repl.oplog.insert.totalMillis": RATE, - "metrics.repl.oplog.insertBytes": RATE, - "metrics.repl.preload.indexes.num": RATE, - "metrics.repl.preload.indexes.totalMillis": RATE, - "metrics.ttl.deletedDocuments": RATE, - "metrics.ttl.passes": RATE, - "opcounters.command": RATE, - "opcounters.delete": RATE, - "opcounters.getmore": RATE, - "opcounters.insert": RATE, - "opcounters.query": RATE, - "opcounters.update": RATE, - "opcountersRepl.command": RATE, - "opcountersRepl.delete": RATE, - "opcountersRepl.getmore": RATE, - "opcountersRepl.insert": RATE, - "opcountersRepl.query": RATE, - "opcountersRepl.update": RATE, - "replSet.health": GAUGE, - "replSet.replicationLag": GAUGE, - "replSet.state": GAUGE, - "stats.avgObjSize": GAUGE, - "stats.collections": GAUGE, - "stats.dataSize": GAUGE, - "stats.fileSize": GAUGE, - "stats.indexes": GAUGE, - "stats.indexSize": GAUGE, - "stats.nsSizeMB": GAUGE, - "stats.numExtents": GAUGE, - "stats.objects": GAUGE, - "stats.storageSize": GAUGE, - "uptime": GAUGE, - } - - V2_ONLY_METRICS = { - "globalLock.lockTime": GAUGE, - "globalLock.ratio": GAUGE, # < 2.2 - "indexCounters.accesses": RATE, - "indexCounters.btree.accesses": RATE, # < 2.4 - "indexCounters.btree.hits": RATE, # < 2.4 - "indexCounters.btree.misses": RATE, # < 2.4 - "indexCounters.btree.missRatio": GAUGE, # < 2.4 - "indexCounters.hits": RATE, - "indexCounters.misses": RATE, - "indexCounters.missRatio": GAUGE, - "indexCounters.resets": RATE, - } - - TCMALLOC_METRICS = { - "tcmalloc.generic.current_allocated_bytes": GAUGE, - 
"tcmalloc.generic.heap_size": GAUGE, - "tcmalloc.tcmalloc.aggressive_memory_decommit": GAUGE, - "tcmalloc.tcmalloc.central_cache_free_bytes": GAUGE, - "tcmalloc.tcmalloc.current_total_thread_cache_bytes": GAUGE, - "tcmalloc.tcmalloc.max_total_thread_cache_bytes": GAUGE, - "tcmalloc.tcmalloc.pageheap_free_bytes": GAUGE, - "tcmalloc.tcmalloc.pageheap_unmapped_bytes": GAUGE, - "tcmalloc.tcmalloc.thread_cache_free_bytes": GAUGE, - "tcmalloc.tcmalloc.transfer_cache_free_bytes": GAUGE, - } - - def __init__(self, name, init_config, agentConfig, instances=None): - AgentCheck.__init__(self, name, init_config, agentConfig, instances) - self._last_state_by_server = {} - self.metrics_to_collect_by_instance = {} - - def get_library_versions(self): - return {"pymongo": pymongo.version} - - def check_last_state(self, state, clean_server_name, agentConfig): - if self._last_state_by_server.get(clean_server_name, -1) != state: - self._last_state_by_server[clean_server_name] = state - return self.create_event(state, clean_server_name, agentConfig) - - def create_event(self, state, clean_server_name, agentConfig): - """Create an event with a message describing the replication - state of a mongo node""" - - def get_state_description(state): - if state == 0: - return 'Starting Up' - elif state == 1: - return 'Primary' - elif state == 2: - return 'Secondary' - elif state == 3: - return 'Recovering' - elif state == 4: - return 'Fatal' - elif state == 5: - return 'Starting up (forking threads)' - elif state == 6: - return 'Unknown' - elif state == 7: - return 'Arbiter' - elif state == 8: - return 'Down' - elif state == 9: - return 'Rollback' - - status = get_state_description(state) - hostname = get_hostname(agentConfig) - msg_title = "%s is %s" % (clean_server_name, status) - msg = "MongoDB %s just reported as %s" % (clean_server_name, status) - - self.event({ - 'timestamp': int(time.time()), - 'event_type': 'Mongo', - 'api_key': agentConfig.get('api_key', ''), - 'msg_title': msg_title, - 'msg_text': msg, - 'host': hostname - }) - - @classmethod - def _build_metric_list_to_collect(cls, collect_tcmalloc_metrics=False): - """ - Build the metric list to collect based on the instance preferences. - """ - metrics_to_collect = {} - - # Defaut metrics - metrics_to_collect.update(cls.COMMON_METRICS) - metrics_to_collect.update(cls.V2_ONLY_METRICS) - - # Optional metrics - if collect_tcmalloc_metrics: - metrics_to_collect.update(cls.TCMALLOC_METRICS) - - return metrics_to_collect - - def _get_metrics_to_collect(self, instance_key, **instance_preferences): - """ - Return and cache the list of metrics to collect. - """ - if instance_key not in self.metrics_to_collect_by_instance: - self.metrics_to_collect_by_instance[instance_key] = \ - self._build_metric_list_to_collect(**instance_preferences) - return self.metrics_to_collect_by_instance[instance_key] - - def _normalize(self, metric_name, submit_method): - """ - Normalize the metric name considering its type. 
- """ - if submit_method == RATE: - return self.normalize(metric_name.lower(), 'mongodb') + "ps" - - return self.normalize(metric_name.lower(), 'mongodb') - - def check(self, instance): - """ - Returns a dictionary that looks a lot like what's sent back by - db.serverStatus() - """ - if 'server' not in instance: - raise Exception("Missing 'server' in mongo config") - - server = instance['server'] - - ssl_params = { - 'ssl': instance.get('ssl', None), - 'ssl_keyfile': instance.get('ssl_keyfile', None), - 'ssl_certfile': instance.get('ssl_certfile', None), - 'ssl_cert_reqs': instance.get('ssl_cert_reqs', None), - 'ssl_ca_certs': instance.get('ssl_ca_certs', None) - } - - for key, param in ssl_params.items(): - if param is None: - del ssl_params[key] - - # Configuration a URL, mongodb://user:pass@server/db - parsed = pymongo.uri_parser.parse_uri(server) - username = parsed.get('username') - password = parsed.get('password') - db_name = parsed.get('database') - clean_server_name = server.replace(password, "*" * 5) if password is not None else server - - tags = instance.get('tags', []) - tags.append('server:%s' % clean_server_name) - - # Get the list of metrics to collect - collect_tcmalloc_metrics = _is_affirmative( - instance.get('collect_tcmalloc_metrics', False) - ) - metrics_to_collect = self._get_metrics_to_collect( - server, - collect_tcmalloc_metrics=collect_tcmalloc_metrics, - ) - - # de-dupe tags to avoid a memory leak - tags = list(set(tags)) - - if not db_name: - self.log.info('No MongoDB database found in URI. Defaulting to admin.') - db_name = 'admin' - - service_check_tags = [ - "db:%s" % db_name - ] - - nodelist = parsed.get('nodelist') - if nodelist: - host = nodelist[0][0] - port = nodelist[0][1] - service_check_tags = service_check_tags + [ - "host:%s" % host, - "port:%s" % port - ] - - do_auth = True - if username is None or password is None: - self.log.debug("Mongo: cannot extract username and password from config %s" % server) - do_auth = False - - timeout = float(instance.get('timeout', DEFAULT_TIMEOUT)) * 1000 - try: - cli = pymongo.mongo_client.MongoClient( - server, - socketTimeoutMS=timeout, - read_preference=pymongo.ReadPreference.PRIMARY_PREFERRED, - **ssl_params) - # some commands can only go against the admin DB - admindb = cli['admin'] - db = cli[db_name] - except Exception: - self.service_check( - self.SERVICE_CHECK_NAME, - AgentCheck.CRITICAL, - tags=service_check_tags) - raise - - if do_auth and not db.authenticate(username, password): - message = "Mongo: cannot connect with config %s" % server - self.service_check( - self.SERVICE_CHECK_NAME, - AgentCheck.CRITICAL, - tags=service_check_tags, - message=message) - raise Exception(message) - - self.service_check( - self.SERVICE_CHECK_NAME, - AgentCheck.OK, - tags=service_check_tags) - - status = db["$cmd"].find_one({"serverStatus": 1, "tcmalloc": collect_tcmalloc_metrics}) - if status['ok'] == 0: - raise Exception(status['errmsg'].__str__()) - - status['stats'] = db.command('dbstats') - dbstats = {} - dbstats[db_name] = {'stats': status['stats']} - - # Handle replica data, if any - # See - # http://www.mongodb.org/display/DOCS/Replica+Set+Commands#ReplicaSetCommands-replSetGetStatus # noqa - try: - data = {} - dbnames = [] - - replSet = admindb.command('replSetGetStatus') - if replSet: - primary = None - current = None - - # need a new connection to deal with replica sets - setname = replSet.get('set') - cli = pymongo.mongo_client.MongoClient( - server, - socketTimeoutMS=timeout, - replicaset=setname, - 
read_preference=pymongo.ReadPreference.NEAREST, - **ssl_params) - db = cli[db_name] - - if do_auth and not db.authenticate(username, password): - message = ("Mongo: cannot connect with config %s" % server) - self.service_check( - self.SERVICE_CHECK_NAME, - AgentCheck.CRITICAL, - tags=service_check_tags, - message=message) - raise Exception(message) - - # find nodes: master and current node (ourself) - for member in replSet.get('members'): - if member.get('self'): - current = member - if int(member.get('state')) == 1: - primary = member - - # If we have both we can compute a lag time - if current is not None and primary is not None: - lag = primary['optimeDate'] - current['optimeDate'] - # Python 2.7 has this built in, python < 2.7 don't... - if hasattr(lag, 'total_seconds'): - data['replicationLag'] = lag.total_seconds() - else: - data['replicationLag'] = ( - lag.microseconds + - (lag.seconds + lag.days * 24 * 3600) * 10**6 - ) / 10.0**6 - - if current is not None: - data['health'] = current['health'] - - data['state'] = replSet['myState'] - self.check_last_state( - data['state'], - clean_server_name, - self.agentConfig) - status['replSet'] = data - - except Exception as e: - if "OperationFailure" in repr(e) and "replSetGetStatus" in str(e): - pass - else: - raise e - - # If these keys exist, remove them for now as they cannot be serialized - try: - status['backgroundFlushing'].pop('last_finished') - except KeyError: - pass - try: - status.pop('localTime') - except KeyError: - pass - - dbnames = cli.database_names() - for db_n in dbnames: - db_aux = cli[db_n] - dbstats[db_n] = {'stats': db_aux.command('dbstats')} - - # Go through the metrics and save the values - for metric_name, submit_method in metrics_to_collect.iteritems(): - # each metric is of the form: x.y.z with z optional - # and can be found at status[x][y][z] - value = status - - if metric_name.startswith('stats'): - continue - else: - try: - for c in metric_name.split("."): - value = value[c] - except KeyError: - continue - - # value is now status[x][y][z] - if not isinstance(value, (int, long, float)): - raise TypeError( - u"{0} value is a {1}, it should be an int, a float or a long instead." - .format(metric_name, type(value))) - - # Submit the metric - metric_name = self._normalize(metric_name, submit_method) - submit_method(self, metric_name, value, tags=tags) - - for st, value in dbstats.iteritems(): - for metric_name, submit_method in metrics_to_collect.iteritems(): - if not metric_name.startswith('stats.'): - continue - - try: - val = value['stats'][metric_name.split('.')[1]] - except KeyError: - continue - - # value is now status[x][y][z] - if not isinstance(val, (int, long, float)): - raise TypeError( - u"{0} value is a {1}, it should be an int, a float or a long instead." 
- .format(metric_name, type(val)) - ) - - # Submit the metric - metric_name = self._normalize(metric_name, submit_method) - metrics_tags = tags + ['cluster:db:%s' % st] - submit_method(self, metric_name, val, tags=metrics_tags) diff --git a/py/checks/mysql.py b/py/checks/mysql.py deleted file mode 100644 index 769b2a2aec58..000000000000 --- a/py/checks/mysql.py +++ /dev/null @@ -1,396 +0,0 @@ -# stdlib -import os -import sys -import re -import traceback - -# 3p -import pymysql - -# project -from checks import AgentCheck -from utils.platform import Platform -from utils.subprocess_output import get_subprocess_output - -GAUGE = "gauge" -RATE = "rate" - -STATUS_VARS = { - 'Connections': ('mysql.net.connections', RATE), - 'Max_used_connections': ('mysql.net.max_connections', GAUGE), - 'Open_files': ('mysql.performance.open_files', GAUGE), - 'Table_locks_waited': ('mysql.performance.table_locks_waited', GAUGE), - 'Threads_connected': ('mysql.performance.threads_connected', GAUGE), - 'Threads_running': ('mysql.performance.threads_running', GAUGE), - 'Innodb_data_reads': ('mysql.innodb.data_reads', RATE), - 'Innodb_data_writes': ('mysql.innodb.data_writes', RATE), - 'Innodb_os_log_fsyncs': ('mysql.innodb.os_log_fsyncs', RATE), - 'Slow_queries': ('mysql.performance.slow_queries', RATE), - 'Questions': ('mysql.performance.questions', RATE), - 'Queries': ('mysql.performance.queries', RATE), - 'Com_select': ('mysql.performance.com_select', RATE), - 'Com_insert': ('mysql.performance.com_insert', RATE), - 'Com_update': ('mysql.performance.com_update', RATE), - 'Com_delete': ('mysql.performance.com_delete', RATE), - 'Com_insert_select': ('mysql.performance.com_insert_select', RATE), - 'Com_update_multi': ('mysql.performance.com_update_multi', RATE), - 'Com_delete_multi': ('mysql.performance.com_delete_multi', RATE), - 'Com_replace_select': ('mysql.performance.com_replace_select', RATE), - 'Qcache_hits': ('mysql.performance.qcache_hits', RATE), - 'Innodb_mutex_spin_waits': ('mysql.innodb.mutex_spin_waits', RATE), - 'Innodb_mutex_spin_rounds': ('mysql.innodb.mutex_spin_rounds', RATE), - 'Innodb_mutex_os_waits': ('mysql.innodb.mutex_os_waits', RATE), - 'Created_tmp_tables': ('mysql.performance.created_tmp_tables', RATE), - 'Created_tmp_disk_tables': ('mysql.performance.created_tmp_disk_tables', RATE), - 'Created_tmp_files': ('mysql.performance.created_tmp_files', RATE), - 'Innodb_row_lock_waits': ('mysql.innodb.row_lock_waits', RATE), - 'Innodb_row_lock_time': ('mysql.innodb.row_lock_time', RATE), - 'Innodb_current_row_locks': ('mysql.innodb.current_row_locks', GAUGE), - 'Open_tables': ('mysql.performance.open_tables', GAUGE), -} - - -class MySql(AgentCheck): - SERVICE_CHECK_NAME = 'mysql.can_connect' - MAX_CUSTOM_QUERIES = 20 - DEFAULT_TIMEOUT = 5 - - def __init__(self, name, init_config, agentConfig, instances=None): - AgentCheck.__init__(self, name, init_config, agentConfig, instances) - self.mysql_version = {} - self.greater_502 = {} - - def get_library_versions(self): - return {"pymysql": pymysql.__version__} - - def check(self, instance): - host, port, user, password, mysql_sock, defaults_file, tags, options, queries = \ - self._get_config(instance) - - default_timeout = self.init_config.get('default_timeout', self.DEFAULT_TIMEOUT) - - if (not host or not user) and not defaults_file: - raise Exception("Mysql host and user are needed.") - - db = self._connect(host, port, mysql_sock, user, password, defaults_file) - - # Metadata collection - self._collect_metadata(db, host) - - # Metric collection - 
self._collect_metrics(host, db, tags, options, queries) - if Platform.is_linux(): - self._collect_system_metrics(host, db, tags) - - # Close connection - db.close() - - def _get_config(self, instance): - host = instance.get('server', '') - user = instance.get('user', '') - port = int(instance.get('port', 0)) - password = instance.get('pass', '') - mysql_sock = instance.get('sock', '') - defaults_file = instance.get('defaults_file', '') - tags = instance.get('tags', None) - options = instance.get('options', {}) - queries = instance.get('queries', []) - - return host, port, user, password, mysql_sock, defaults_file, tags, options, queries - - def _connect(self, host, port, mysql_sock, user, password, defaults_file): - service_check_tags = [ - 'host:%s' % host, - 'port:%s' % port - ] - - try: - if defaults_file != '': - db = pymysql.connect(read_default_file=defaults_file) - elif mysql_sock != '': - db = pymysql.connect( - unix_socket=mysql_sock, - user=user, - passwd=password - ) - service_check_tags = [ - 'host:%s' % mysql_sock, - 'port:unix_socket' - ] - elif port: - db = pymysql.connect( - host=host, - port=port, - user=user, - passwd=password - ) - else: - db = pymysql.connect( - host=host, - user=user, - passwd=password - ) - self.log.debug("Connected to MySQL") - self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.OK, - tags=service_check_tags) - except Exception: - self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, - tags=service_check_tags) - raise - - return db - - def _collect_metrics(self, host, db, tags, options, queries): - cursor = db.cursor() - cursor.execute("SHOW /*!50002 GLOBAL */ STATUS;") - status_results = dict(cursor.fetchall()) - self._rate_or_gauge_statuses(STATUS_VARS, status_results, tags) - cursor.execute("SHOW VARIABLES LIKE 'Key%';") - variables_results = dict(cursor.fetchall()) - cursor.close() - del cursor - - # Compute key cache utilization metric - key_blocks_unused = self._collect_scalar('Key_blocks_unused', status_results) - key_cache_block_size = self._collect_scalar('key_cache_block_size', variables_results) - key_buffer_size = self._collect_scalar('key_buffer_size', variables_results) - key_cache_utilization = 1 - ((key_blocks_unused * key_cache_block_size) / key_buffer_size) - self.gauge("mysql.performance.key_cache_utilization", key_cache_utilization, tags=tags) - - # Compute InnoDB buffer metrics - # Be sure InnoDB is enabled - if 'Innodb_page_size' in status_results: - page_size = self._collect_scalar('Innodb_page_size', status_results) - innodb_buffer_pool_pages_total = self._collect_scalar('Innodb_buffer_pool_pages_total', - status_results) - innodb_buffer_pool_pages_free = self._collect_scalar('Innodb_buffer_pool_pages_free', - status_results) - innodb_buffer_pool_pages_total = innodb_buffer_pool_pages_total * page_size - innodb_buffer_pool_pages_free = innodb_buffer_pool_pages_free * page_size - innodb_buffer_pool_pages_used = \ - innodb_buffer_pool_pages_total - innodb_buffer_pool_pages_free - - self.gauge("mysql.innodb.buffer_pool_free", innodb_buffer_pool_pages_free, tags=tags) - self.gauge("mysql.innodb.buffer_pool_used", innodb_buffer_pool_pages_used, tags=tags) - self.gauge("mysql.innodb.buffer_pool_total", innodb_buffer_pool_pages_total, tags=tags) - - if innodb_buffer_pool_pages_total != 0: - innodb_buffer_pool_pages_utilization = \ - innodb_buffer_pool_pages_used / innodb_buffer_pool_pages_total - self.gauge("mysql.innodb.buffer_pool_utilization", - innodb_buffer_pool_pages_utilization, tags=tags) - - if 'galera_cluster' 
in options and options['galera_cluster']: - value = self._collect_scalar('wsrep_cluster_size', status_results) - self.gauge('mysql.galera.wsrep_cluster_size', value, tags=tags) - - if 'replication' in options and options['replication']: - # get slave running form global status page - slave_running = self._collect_string('Slave_running', status_results) - if slave_running is not None: - if slave_running.lower().strip() == 'on': - slave_running = 1 - else: - slave_running = 0 - self.gauge("mysql.replication.slave_running", slave_running, tags=tags) - self._collect_dict( - GAUGE, - {"Seconds_behind_master": "mysql.replication.seconds_behind_master"}, - "SHOW SLAVE STATUS", db, tags=tags - ) - - # Collect custom query metrics - # Max of 20 queries allowed - if isinstance(queries, list): - for index, check in enumerate(queries[:self.MAX_CUSTOM_QUERIES]): - self._collect_dict(check['type'], {check['field']: check['metric']}, check['query'], db, tags=tags) - - if len(queries) > self.MAX_CUSTOM_QUERIES: - self.warning("Maximum number (%s) of custom queries reached. Skipping the rest." - % self.MAX_CUSTOM_QUERIES) - - - def _collect_metadata(self, db, host): - self._get_version(db, host) - - def _rate_or_gauge_statuses(self, statuses, dbResults, tags): - for status, metric in statuses.iteritems(): - metric_name, metric_type = metric - value = self._collect_scalar(status, dbResults) - if value is not None: - if metric_type == RATE: - self.rate(metric_name, value, tags=tags) - elif metric_type == GAUGE: - self.gauge(metric_name, value, tags=tags) - - def _version_greater_502(self, db, host): - # show global status was introduced in 5.0.2 - # some patch version numbers contain letters (e.g. 5.0.51a) - # so let's be careful when we compute the version number - if host in self.greater_502: - return self.greater_502[host] - - greater_502 = False - try: - mysql_version = self._get_version(db, host) - self.log.debug("MySQL version %s" % mysql_version) - - major = int(mysql_version[0]) - minor = int(mysql_version[1]) - patchlevel = int(re.match(r"([0-9]+)", mysql_version[2]).group(1)) - - if (major, minor, patchlevel) > (5, 0, 2): - greater_502 = True - - except Exception, exception: - self.warning("Cannot compute mysql version, assuming older than 5.0.2: %s" - % str(exception)) - - self.greater_502[host] = greater_502 - - return greater_502 - - def _get_version(self, db, host): - if host in self.mysql_version: - version = self.mysql_version[host] - self.service_metadata('version', ".".join(version)) - return version - - # Get MySQL version - cursor = db.cursor() - cursor.execute('SELECT VERSION()') - result = cursor.fetchone() - cursor.close() - del cursor - # Version might include a description e.g. 4.1.26-log. 
- # See http://dev.mysql.com/doc/refman/4.1/en/information-functions.html#function_version - version = result[0].split('-') - version = version[0].split('.') - self.mysql_version[host] = version - self.service_metadata('version', ".".join(version)) - return version - - def _collect_scalar(self, key, dict): - return self._collect_type(key, dict, float) - - def _collect_string(self, key, dict): - return self._collect_type(key, dict, unicode) - - def _collect_type(self, key, dict, the_type): - self.log.debug("Collecting data with %s" % key) - if key not in dict: - self.log.debug("%s returned None" % key) - return None - self.log.debug("Collecting done, value %s" % dict[key]) - return the_type(dict[key]) - - def _collect_dict(self, metric_type, field_metric_map, query, db, tags): - """ - Query status and get a dictionary back. - Extract each field out of the dictionary - and stuff it in the corresponding metric. - - query: show status... - field_metric_map: {"Seconds_behind_master": "mysqlSecondsBehindMaster"} - """ - try: - cursor = db.cursor() - cursor.execute(query) - result = cursor.fetchone() - if result is not None: - for field in field_metric_map.keys(): - # Get the agent metric name from the column name - metric = field_metric_map[field] - # Find the column name in the cursor description to identify the column index - # http://www.python.org/dev/peps/pep-0249/ - # cursor.description is a tuple of (column_name, ..., ...) - try: - col_idx = [d[0].lower() for d in cursor.description].index(field.lower()) - self.log.debug("Collecting metric: %s" % metric) - if result[col_idx] is not None: - self.log.debug("Collecting done, value %s" % result[col_idx]) - if metric_type == GAUGE: - self.gauge(metric, float(result[col_idx]), tags=tags) - elif metric_type == RATE: - self.rate(metric, float(result[col_idx]), tags=tags) - else: - self.gauge(metric, float(result[col_idx]), tags=tags) - else: - self.log.debug("Received value is None for index %d" % col_idx) - except ValueError: - self.log.exception("Cannot find %s in the columns %s" - % (field, cursor.description)) - cursor.close() - del cursor - except Exception: - self.warning("Error while running %s\n%s" % (query, traceback.format_exc())) - self.log.exception("Error while running %s" % query) - - def _collect_system_metrics(self, host, db, tags): - pid = None - # The server needs to run locally, accessed by TCP or socket - if host in ["localhost", "127.0.0.1"] or db.port == long(0): - pid = self._get_server_pid(db) - - if pid: - self.log.debug("pid: %s" % pid) - # At last, get mysql cpu data out of procfs - try: - # See http://www.kernel.org/doc/man-pages/online/pages/man5/proc.5.html - # for meaning: we get 13 & 14: utime and stime, in clock ticks and convert - # them with the right sysconf value (SC_CLK_TCK) - proc_file = open("/proc/%d/stat" % pid) - data = proc_file.readline() - proc_file.close() - fields = data.split(' ') - ucpu = fields[13] - kcpu = fields[14] - clk_tck = os.sysconf(os.sysconf_names["SC_CLK_TCK"]) - - # Convert time to s (number of second of CPU used by mysql) - # It's a counter, it will be divided by the period, multiply by 100 - # to get the percentage of CPU used by mysql over the period - self.rate("mysql.performance.user_time", - int((float(ucpu) / float(clk_tck)) * 100), tags=tags) - self.rate("mysql.performance.kernel_time", - int((float(kcpu) / float(clk_tck)) * 100), tags=tags) - except Exception: - self.warning("Error while reading mysql (pid: %s) procfs data\n%s" - % (pid, traceback.format_exc())) - - def 
_get_server_pid(self, db): - pid = None - - # Try to get pid from pid file, it can fail for permission reason - pid_file = None - try: - cursor = db.cursor() - cursor.execute("SHOW VARIABLES LIKE 'pid_file'") - pid_file = cursor.fetchone()[1] - cursor.close() - del cursor - except Exception: - self.warning("Error while fetching pid_file variable of MySQL.") - - if pid_file is not None: - self.log.debug("pid file: %s" % str(pid_file)) - try: - f = open(pid_file) - pid = int(f.readline()) - f.close() - except IOError: - self.log.debug("Cannot read mysql pid file %s" % pid_file) - - # If pid has not been found, read it from ps - if pid is None: - try: - if sys.platform.startswith("linux"): - ps, _, _ = get_subprocess_output(['ps', '-C', 'mysqld', '-o', 'pid'], self.log) - pslines = ps.strip().splitlines() - # First line is header, second line is mysql pid - if len(pslines) == 2: - pid = int(pslines[1]) - except Exception: - self.log.exception("Error while fetching mysql pid from ps") - - return pid diff --git a/py/checks/nagios.py b/py/checks/nagios.py deleted file mode 100644 index f98cf31469c8..000000000000 --- a/py/checks/nagios.py +++ /dev/null @@ -1,395 +0,0 @@ -# stdlib -from collections import namedtuple -import re - -# project -from checks import AgentCheck -from utils.tailfile import TailFile - -# fields order for each event type, as named tuples -EVENT_FIELDS = { - 'CURRENT HOST STATE': namedtuple('E_CurrentHostState', 'host, event_state, event_soft_hard, return_code, payload'), - 'CURRENT SERVICE STATE': namedtuple('E_CurrentServiceState', 'host, check_name, event_state, event_soft_hard, return_code, payload'), - 'SERVICE ALERT': namedtuple('E_ServiceAlert', 'host, check_name, event_state, event_soft_hard, return_code, payload'), - 'PASSIVE SERVICE CHECK': namedtuple('E_PassiveServiceCheck', 'host, check_name, return_code, payload'), - 'HOST ALERT': namedtuple('E_HostAlert', 'host, event_state, event_soft_hard, return_code, payload'), - - # [1305744274] SERVICE NOTIFICATION: ops;ip-10-114-237-165;Metric ETL;ACKNOWLEDGEMENT (CRITICAL);notify-service-by-email;HTTP CRITICAL: HTTP/1.1 503 Service Unavailable - 394 bytes in 0.010 second response time;datadog;alq - 'SERVICE NOTIFICATION': namedtuple('E_ServiceNotification', 'contact, host, check_name, event_state, notification_type, payload'), - - # [1296509331] SERVICE FLAPPING ALERT: ip-10-114-97-27;cassandra JVM Heap;STARTED; Service appears to have started flapping (23.4% change >= 20.0% threshold) - # [1296662511] SERVICE FLAPPING ALERT: ip-10-114-97-27;cassandra JVM Heap;STOPPED; Service appears to have stopped flapping (3.8% change < 5.0% threshold) - 'SERVICE FLAPPING ALERT': namedtuple('E_FlappingAlert', 'host, check_name, flap_start_stop, payload'), - - # Reference for external commands: http://old.nagios.org/developerinfo/externalcommands/commandlist.php - # Command Format: - # ACKNOWLEDGE_SVC_PROBLEM;;;;;;; - # [1305832665] EXTERNAL COMMAND: ACKNOWLEDGE_SVC_PROBLEM;ip-10-202-161-236;Resources ETL;2;1;0;datadog;alq checking - 'ACKNOWLEDGE_SVC_PROBLEM': namedtuple('E_ServiceAck', 'host, check_name, sticky_ack, notify_ack, persistent_ack, ack_author, payload'), - - # Command Format: - # ACKNOWLEDGE_HOST_PROBLEM;;;;;; - 'ACKNOWLEDGE_HOST_PROBLEM': namedtuple('E_HostAck', 'host, sticky_ack, notify_ack, persistent_ack, ack_author, payload'), - - # Comment Format: - # PROCESS_SERVICE_CHECK_RESULT;;;; - # We ignore it because Nagios will log a "PASSIVE SERVICE CHECK" after - # receiving this, and we don't want duplicate events to be 
counted. - 'PROCESS_SERVICE_CHECK_RESULT': False, - - # Host Downtime - # [1297894825] HOST DOWNTIME ALERT: ip-10-114-89-59;STARTED; Host has entered a period of scheduled downtime - # [1297894825] SERVICE DOWNTIME ALERT: ip-10-114-237-165;intake;STARTED; Service has entered a period of scheduled downtime - - 'HOST DOWNTIME ALERT': namedtuple('E_HostDowntime', 'host, downtime_start_stop, payload'), - 'SERVICE DOWNTIME ALERT': namedtuple('E_ServiceDowntime', 'host, check_name, downtime_start_stop, payload'), -} - -# Regex for the Nagios event log -RE_LINE_REG = re.compile('^\[(\d+)\] EXTERNAL COMMAND: (\w+);(.*)$') -RE_LINE_EXT = re.compile('^\[(\d+)\] ([^:]+): (.*)$') - - -class Nagios(AgentCheck): - - NAGIOS_CONF_KEYS = [ - re.compile('^(?Plog_file)\s*=\s*(?P.+)$'), - re.compile('^(?Phost_perfdata_file_template)\s*=\s*(?P.+)$'), - re.compile('^(?Pservice_perfdata_file_template)\s*=\s*(?P.+)$'), - re.compile('^(?Phost_perfdata_file)\s*=\s*(?P.+)$'), - re.compile('^(?Pservice_perfdata_file)\s*=\s*(?P.+)$'), - ] - - def __init__(self, name, init_config, agentConfig, instances=None): - AgentCheck.__init__(self, name, init_config, agentConfig, instances) - self.nagios_tails = {} - check_freq = init_config.get("check_freq", 15) - if instances is not None: - for instance in instances: - tailers = [] - nagios_conf = {} - instance_key = None - - if 'nagios_conf' in instance: # conf.d check - conf_path = instance['nagios_conf'] - nagios_conf = self.parse_nagios_config(conf_path) - instance_key = conf_path - # Retrocompatibility Code - elif 'nagios_perf_cfg' in instance: - conf_path = instance['nagios_perf_cfg'] - nagios_conf = self.parse_nagios_config(conf_path) - instance["collect_host_performance_data"] = True - instance["collect_service_performance_data"] = True - instance_key = conf_path - if 'nagios_log' in instance: - nagios_conf["log_file"] = instance['nagios_log'] - if instance_key is None: - instance_key = instance['nagios_log'] - # End of retrocompatibility code - if not nagios_conf: - self.log.warning("Missing path to nagios_conf") - continue - - if 'log_file' in nagios_conf and \ - instance.get('collect_events', True): - self.log.debug("Starting to tail the event log") - tailers.append(NagiosEventLogTailer( - log_path=nagios_conf['log_file'], - file_template=None, - logger=self.log, - hostname=self.hostname, - event_func=self.event, - gauge_func=self.gauge, - freq=check_freq, - passive_checks=instance.get('passive_checks_events', False))) - if 'host_perfdata_file' in nagios_conf and \ - 'host_perfdata_file_template' in nagios_conf and \ - instance.get('collect_host_performance_data', False): - self.log.debug("Starting to tail the host_perfdata file") - tailers.append(NagiosHostPerfDataTailer( - log_path=nagios_conf['host_perfdata_file'], - file_template=nagios_conf['host_perfdata_file_template'], - logger=self.log, - hostname=self.hostname, - event_func=self.event, - gauge_func=self.gauge, - freq=check_freq)) - if 'service_perfdata_file' in nagios_conf and \ - 'service_perfdata_file_template' in nagios_conf and \ - instance.get('collect_service_performance_data', False): - self.log.debug("Starting to tail the service_perfdata file") - tailers.append(NagiosServicePerfDataTailer( - log_path=nagios_conf['service_perfdata_file'], - file_template=nagios_conf['service_perfdata_file_template'], - logger=self.log, - hostname=self.hostname, - event_func=self.event, - gauge_func=self.gauge, - freq=check_freq)) - - self.nagios_tails[instance_key] = tailers - - def parse_nagios_config(self, 
filename): - output = {} - - f = None - try: - f = open(filename) - for line in f: - line = line.strip() - if not line: - continue - for key in self.NAGIOS_CONF_KEYS: - m = key.match(line) - if m: - output[m.group('key')] = m.group('value') - break - return output - except Exception as e: - # Can't parse, assume it's just not working - # Don't return an incomplete config - self.log.exception(e) - raise Exception("Could not parse Nagios config file") - finally: - if f is not None: - f.close() - - def check(self, instance): - ''' - Parse until the end of each tailer associated with this instance. - We match instance and tailers based on the path to the Nagios configuration file - - Special case: Compatibility with the old conf when no conf file is specified - but the path to the event_log is given - ''' - instance_key = instance.get('nagios_conf', - instance.get('nagios_perf_cfg', - instance.get('nagios_log', - None))) - # Bad configuration: This instance does not contain any necessary configuration - if not instance_key or instance_key not in self.nagios_tails: - raise Exception('No Nagios configuration file specified') - for tailer in self.nagios_tails[instance_key]: - tailer.check() - - -class NagiosTailer(object): - - def __init__(self, log_path, file_template, logger, hostname, event_func, gauge_func, freq): - ''' - :param log_path: string, path to the file to parse - :param file_template: string, format of the perfdata file - :param logger: Logger object - :param hostname: string, name of the host this agent is running on - :param event_func: function to create event, should accept dict - :param gauge_func: function to report a gauge - :param freq: int, size of bucket to aggregate perfdata metrics - ''' - self.log_path = log_path - self.log = logger - self.gen = None - self.tail = None - self.hostname = hostname - self._event = event_func - self._gauge = gauge_func - self._line_parsed = 0 - self._freq = freq - - if file_template is not None: - self.compile_file_template(file_template) - - self.tail = TailFile(self.log, self.log_path, self._parse_line) - self.gen = self.tail.tail(line_by_line=False, move_end=True) - self.gen.next() - - def check(self): - self._line_parsed = 0 - # read until the end of file - try: - self.log.debug("Start nagios check for file %s" % (self.log_path)) - self.gen.next() - self.log.debug("Done nagios check for file %s (parsed %s line(s))" % - (self.log_path, self._line_parsed)) - except StopIteration, e: - self.log.exception(e) - self.log.warning("Can't tail %s file" % (self.log_path)) - - def compile_file_template(self, file_template): - try: - # Escape characters that will be interpreted as regex bits - # e.g. 
[ and ] in "[SERVICEPERFDATA]" - regex = re.sub(r'[[\]*]', r'.', file_template) - regex = re.sub(r'\$([^\$]*)\$', r'(?P<\1>[^\$]*)', regex) - self.line_pattern = re.compile(regex) - except Exception, e: - raise InvalidDataTemplate("%s (%s)" % (file_template, e)) - - -class NagiosEventLogTailer(NagiosTailer): - - def __init__(self, log_path, file_template, logger, hostname, event_func, - gauge_func, freq, passive_checks=False): - ''' - :param log_path: string, path to the file to parse - :param file_template: string, format of the perfdata file - :param logger: Logger object - :param hostname: string, name of the host this agent is running on - :param event_func: function to create event, should accept dict - :param gauge_func: function to report a gauge - :param freq: int, size of bucket to aggregate perfdata metrics - :param passive_checks: bool, enable or not passive checks events - ''' - self.passive_checks = passive_checks - super(NagiosEventLogTailer, self).__init__( - log_path, file_template, - logger, hostname, event_func, gauge_func, freq - ) - - def _parse_line(self, line): - """Actual nagios parsing - Return True if we found an event, False otherwise - """ - # first isolate the timestamp and the event type - try: - self._line_parsed = self._line_parsed + 1 - - m = RE_LINE_REG.match(line) - if m is None: - m = RE_LINE_EXT.match(line) - if m is None: - return False - self.log.debug("Matching line found %s" % line) - (tstamp, event_type, remainder) = m.groups() - tstamp = int(tstamp) - - # skip passive checks reports by default for spamminess - if event_type == 'PASSIVE SERVICE CHECK' and not self.passive_checks: - return False - # then retrieve the event format for each specific event type - fields = EVENT_FIELDS.get(event_type, None) - if fields is None: - self.log.warning("Ignoring unknown nagios event for line: %s" % (line[:-1])) - return False - elif fields is False: - # Ignore and skip - self.log.debug("Ignoring Nagios event for line: %s" % (line[:-1])) - return False - - # and parse the rest of the line - parts = map(lambda p: p.strip(), remainder.split(';')) - # Chop parts we don't recognize - parts = parts[:len(fields._fields)] - - event = self.create_event(tstamp, event_type, self.hostname, fields._make(parts)) - - self._event(event) - self.log.debug("Nagios event: %s" % (event)) - - return True - except Exception: - self.log.exception("Unable to create a nagios event from line: [%s]" % (line)) - return False - - def create_event(self, timestamp, event_type, hostname, fields): - """Factory method called by the parsers - """ - d = fields._asdict() - d.update({'timestamp': timestamp, - 'event_type': event_type}) - - # if host is localhost, turn that into the internal host name - host = d.get('host', None) - if host == "localhost": - d["host"] = hostname - return d - - -class NagiosPerfDataTailer(NagiosTailer): - perfdata_field = '' # Should be overriden by subclasses - metric_prefix = 'nagios' - pair_pattern = re.compile(r"".join([ - r"'?(?P