From bb000b7e2f092b4f5ce2ed5b06aa630450f2eb92 Mon Sep 17 00:00:00 2001 From: cloud-profiler-team Date: Tue, 2 Jul 2019 03:29:22 -0700 Subject: [PATCH] Update profiler Java agent to the latest version. PiperOrigin-RevId: 256133819 --- Dockerfile | 45 ++- Makefile | 15 +- src/cloud_env.cc | 65 +++- src/cloud_profiler_java_agent.lds | 10 +- src/entry.cc | 55 ++- src/globals.h | 1 + src/http.cc | 1 + src/{config_dataflow_jni.cc => jni.cc} | 21 +- src/pem_roots.cc | 23 -- src/profiler.cc | 24 +- src/profiler.h | 2 +- src/proto.cc | 21 +- src/proto.h | 3 +- src/throttler.h | 5 + src/throttler_api.cc | 82 +++- src/throttler_api.h | 19 +- src/throttler_timed.cc | 54 ++- src/throttler_timed.h | 10 +- src/worker.cc | 93 +++-- src/worker.h | 1 + third_party/javaprofiler/accessors.cc | 54 +++ third_party/javaprofiler/accessors.h | 104 +++++ .../javaprofiler/async_ref_counted_string.cc | 192 +++++++++ .../javaprofiler/async_ref_counted_string.h | 112 ++++++ third_party/javaprofiler/clock.cc | 10 + third_party/javaprofiler/clock.h | 15 +- third_party/javaprofiler/display.cc | 71 ++-- third_party/javaprofiler/display.h | 10 +- third_party/javaprofiler/globals.h | 52 +-- third_party/javaprofiler/heap_sampler.cc | 365 ++++++++++++++++++ third_party/javaprofiler/heap_sampler.h | 205 ++++++++++ third_party/javaprofiler/native.h | 2 +- .../javaprofiler/profile_proto_builder.cc | 106 +++-- .../javaprofiler/profile_proto_builder.h | 155 +++++--- third_party/javaprofiler/profile_test_lib.cc | 55 +++ third_party/javaprofiler/profile_test_lib.h | 42 +- third_party/javaprofiler/stacktraces.cc | 65 ++-- third_party/javaprofiler/stacktraces.h | 32 +- third_party/javaprofiler/tags.cc | 195 ++++++++++ third_party/javaprofiler/tags.h | 92 +++++ .../perftools/profiles/proto/builder.cc | 27 +- .../perftools/profiles/proto/builder.h | 43 ++- .../perftools/profiles/proto/profile.proto | 22 +- 43 files changed, 2146 insertions(+), 430 deletions(-) rename src/{config_dataflow_jni.cc => jni.cc} (68%) 
create mode 100644 third_party/javaprofiler/accessors.cc create mode 100644 third_party/javaprofiler/accessors.h create mode 100644 third_party/javaprofiler/async_ref_counted_string.cc create mode 100644 third_party/javaprofiler/async_ref_counted_string.h create mode 100644 third_party/javaprofiler/heap_sampler.cc create mode 100644 third_party/javaprofiler/heap_sampler.h create mode 100644 third_party/javaprofiler/tags.cc create mode 100644 third_party/javaprofiler/tags.h diff --git a/Dockerfile b/Dockerfile index 8164065c8..30f686dfd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -15,24 +15,34 @@ # # Base image # -FROM debian:jessie +FROM ubuntu:trusty # # Dependencies # +# Install JDK 11 as sampling heap profiler depends on the new JVMTI APIs. +RUN apt-get update && apt-get install -y software-properties-common +RUN add-apt-repository -y ppa:openjdk-r/ppa +RUN apt-get update + +# Installing openjdk-11-jdk-headless can be very slow due to repo download +# speed. + # Everything we can get through apt-get RUN apt-get update && apt-get -y -q install \ apt-utils \ autoconf \ automake \ + cmake \ curl \ g++ \ git \ + libcurl4-openssl-dev \ libssl-dev \ libtool \ make \ - openjdk-7-jdk \ + openjdk-11-jdk-headless \ python \ unzip \ zlib1g-dev @@ -47,13 +57,6 @@ RUN git clone --depth=1 -b curl-7_55_1 https://github.com/curl/curl.git /tmp/cur make -j && make install && \ cd ~ && rm -rf /tmp/curl -# cmake -RUN git clone -b v3.6.2 https://github.com/Kitware/CMake.git /tmp/cmake && \ - cd /tmp/cmake && \ - ./bootstrap && \ - make -j && make install && \ - cd ~ && rm -rf /tmp/cmake - # gflags RUN git clone --depth=1 -b v2.1.2 https://github.com/gflags/gflags.git /tmp/gflags && \ cd /tmp/gflags && \ @@ -90,12 +93,26 @@ RUN mkdir /tmp/openssl && cd /tmp/openssl && \ # process of gRPC puts the OpenSSL static object files into the gRPC archive # which causes link errors later when the agent is linked with the static # OpenSSL library itself. 
-RUN git clone --depth=1 --recursive -b v1.15.0 https://github.com/grpc/grpc.git /tmp/grpc && \ +RUN git clone --depth=1 --recursive -b v1.21.0 https://github.com/grpc/grpc.git /tmp/grpc && \ cd /tmp/grpc && \ + +# TODO: Remove patch when GKE Istio versions support HTTP 1.0 or +# gRPC http_cli supports HTTP 1.1 +# This sed command is needed until GKE provides Istio 1.1+ exclusively which +# supports HTTP 1.0. + sed -i 's/1\.0/1.1/g' src/core/lib/http/format_request.cc && \ + +# TODO: Remove patch when GKE Istio versions support unambiguous +# FQDNs in rule sets. +# https://github.com/istio/istio/pull/14405 is merged but wait till GKE +# Istio versions includes this PR + sed -i 's/metadata\.google\.internal\./metadata.google.internal/g' src/core/lib/security/credentials/google_default/google_default_credentials.cc && \ + sed -i 's/metadata\.google\.internal\./metadata.google.internal/g' src/core/lib/security/credentials/credentials.h && \ cd third_party/protobuf && \ - ./autogen.sh && ./configure --with-pic CXXFLAGS=-Os && make -j && make install && ldconfig && \ + ./autogen.sh && \ + ./configure --with-pic CXXFLAGS="$(pkg-config --cflags protobuf)" LIBS="$(pkg-config --libs protobuf)" LDFLAGS="-Wl,--no-as-needed" && \ + make -j && make install && ldconfig && \ cd ../.. 
&& \ - CPPFLAGS="-I /usr/local/ssl/include" make -j CONFIG=opt EMBED_OPENSSL=false V=1 HAS_SYSTEM_OPENSSL_NPN=0 && \ - CPPFLAGS="-I /usr/local/ssl/include" make CONFIG=opt EMBED_OPENSSL=false V=1 HAS_SYSTEM_OPENSSL_NPN=0 install && \ + CPPFLAGS="-I /usr/local/ssl/include" LDFLAGS="-Wl,--no-as-needed" make -j CONFIG=opt EMBED_OPENSSL=false V=1 HAS_SYSTEM_OPENSSL_NPN=0 && \ + CPPFLAGS="-I /usr/local/ssl/include" LDFLAGS="-Wl,--no-as-needed" make CONFIG=opt EMBED_OPENSSL=false V=1 HAS_SYSTEM_OPENSSL_NPN=0 install && \ rm -rf /tmp/grpc - diff --git a/Makefile b/Makefile index cea17785a..75a144a81 100644 --- a/Makefile +++ b/Makefile @@ -29,10 +29,11 @@ CFLAGS = \ -g0 \ -DSTANDALONE_BUILD \ -D_GNU_SOURCE \ + -DENABLE_HEAP_SAMPLING SRC_ROOT_PATH=. -JAVA_PATH ?= /usr/lib/jvm/java-7-openjdk-amd64 +JAVA_PATH ?= /usr/lib/jvm/java-11-openjdk-amd64 PROTOC ?= /usr/local/bin/protoc PROTOC_GEN_GRPC ?= /usr/local/bin/grpc_cpp_plugin @@ -70,23 +71,33 @@ PROFILER_API_SOURCES = \ $(GENFILES_PATH)/google/rpc/error_details.pb.cc \ JAVAPROFILER_LIB_SOURCES = \ + $(JAVAPROFILER_LIB_PATH)/accessors.cc \ + $(JAVAPROFILER_LIB_PATH)/async_ref_counted_string.cc \ $(JAVAPROFILER_LIB_PATH)/clock.cc \ $(JAVAPROFILER_LIB_PATH)/display.cc \ + $(JAVAPROFILER_LIB_PATH)/heap_sampler.cc \ $(JAVAPROFILER_LIB_PATH)/native.cc \ + $(JAVAPROFILER_LIB_PATH)/profile_proto_builder.cc \ $(JAVAPROFILER_LIB_PATH)/stacktrace_fixer.cc \ $(JAVAPROFILER_LIB_PATH)/stacktraces.cc \ + $(JAVAPROFILER_LIB_PATH)/tags.cc \ # Add any header not already as a .cc in JAVAPROFILER_LIB_SOURCES. 
JAVAPROFILER_LIB_HEADERS = \ + $(JAVAPROFILER_LIB_PATH)/accessors.h \ + $(JAVAPROFILER_LIB_PATH)/async_ref_counted_string.h \ + $(JAVAPROFILER_LIB_PATH)/heap_sampler.h \ $(JAVAPROFILER_LIB_PATH)/jvmti_error.h \ + $(JAVAPROFILER_LIB_PATH)/profile_proto_builder.h \ $(JAVAPROFILER_LIB_PATH)/stacktrace_decls.h \ $(JAVAPROFILER_LIB_PATH)/stacktraces.h \ + $(JAVAPROFILER_LIB_PATH)/tags.h \ SOURCES = \ $(JAVA_AGENT_PATH)/cloud_env.cc \ - $(JAVA_AGENT_PATH)/config_dataflow_jni.cc \ $(JAVA_AGENT_PATH)/entry.cc \ $(JAVA_AGENT_PATH)/http.cc \ + $(JAVA_AGENT_PATH)/jni.cc \ $(JAVA_AGENT_PATH)/pem_roots.cc \ $(JAVA_AGENT_PATH)/profiler.cc \ $(JAVA_AGENT_PATH)/proto.cc \ diff --git a/src/cloud_env.cc b/src/cloud_env.cc index 607380ced..2e68d1b30 100644 --- a/src/cloud_env.cc +++ b/src/cloud_env.cc @@ -18,15 +18,19 @@ #include #include +#include "src/clock.h" #include "src/http.h" #include "src/string.h" +DEFINE_int32(cprof_gce_metadata_server_retry_count, 3, + "Number of retries to Google Compute Engine metadata host"); +DEFINE_int32( + cprof_gce_metadata_server_retry_sleep_sec, 1, + "Seconds to sleep between retries to Google Compute Engine metadata host"); DEFINE_string(cprof_gce_metadata_server_address, "169.254.169.254:80", "Google Compute Engine metadata host to use"); - DEFINE_string(cprof_access_token_test_only, "", "override OAuth2 access token for testing"); - DEFINE_string(cprof_project_id, "", "cloud project ID"); DEFINE_string(cprof_zone_name, "", "zone name"); DEFINE_string(cprof_service, "", "deployment service name"); @@ -47,23 +51,40 @@ const char kNoData[] = ""; namespace { string GceMetadataRequest(HTTPRequest* req, const string& path) { + Clock* clock = DefaultClock(); req->AddHeader("Metadata-Flavor", "Google"); req->SetTimeout(2); // seconds string url = FLAGS_cprof_gce_metadata_server_address + path, resp; - if (!req->DoGet(url, &resp)) { - LOG(ERROR) << "Error making HTTP request to the GCE metadata server"; - return kNoData; - } - int resp_code = 
req->GetResponseCode(); - if (resp_code != kHTTPStatusOK) { - LOG(ERROR) << "Request to the GCE metadata server failed, status code: " - << resp_code; - return kNoData; + int retry_sleep_sec = FLAGS_cprof_gce_metadata_server_retry_sleep_sec; + int retry_count = FLAGS_cprof_gce_metadata_server_retry_count; + struct timespec retry_ts = NanosToTimeSpec(kNanosPerSecond * retry_sleep_sec); + + for (int i = 0; i <= retry_count; i++) { + if (!req->DoGet(url, &resp)) { + if (i < retry_count) { + LOG(ERROR) << "Error making HTTP request for " << url + << " to the GCE metadata server. " + << "Will retry in " << retry_ts.tv_sec << "s"; + clock->SleepFor(retry_ts); + } else { + LOG(ERROR) << "Error making HTTP request for " << url + << " to the GCE metadata server."; + } + continue; + } + int resp_code = req->GetResponseCode(); + if (resp_code != kHTTPStatusOK) { + LOG(ERROR) << "Request to the GCE metadata server for " << url + << " failed, status code: " << resp_code; + return kNoData; + } + return resp; } - - return resp; + LOG(ERROR) << "Unable to contact GCE metadata server for " << url << " after " + << retry_count << " retries."; + return kNoData; } const char* Getenv(const string& var) { @@ -82,15 +103,25 @@ CloudEnv::CloudEnv() { } else if (!FLAGS_cprof_target.empty()) { service_ = FLAGS_cprof_target; } else { - const char* val = Getenv("GAE_SERVICE"); - service_ = val == nullptr ? "" : val; + for (const string& env_var : {"GAE_SERVICE", "K_SERVICE"}) { + const char* val = Getenv(env_var); + if (val != nullptr) { + service_ = val; + break; + } + } } if (!FLAGS_cprof_service_version.empty()) { service_version_ = FLAGS_cprof_service_version; } else { - const char* val = Getenv("GAE_VERSION"); - service_version_ = val == nullptr ? 
"" : val; + for (const string& env_var : {"GAE_VERSION", "K_REVISION"}) { + const char* val = Getenv(env_var); + if (val != nullptr) { + service_version_ = val; + break; + } + } } if (!FLAGS_cprof_project_id.empty()) { diff --git a/src/cloud_profiler_java_agent.lds b/src/cloud_profiler_java_agent.lds index 25fdfdaff..cb432ed37 100644 --- a/src/cloud_profiler_java_agent.lds +++ b/src/cloud_profiler_java_agent.lds @@ -2,11 +2,11 @@ VERS_1.0 { global: Agent_OnLoad; Agent_OnUnload; - Java_com_google_cloud_dataflow_worker_profiler_Profiler_disable; - Java_com_google_cloud_dataflow_worker_profiler_Profiler_enable; - Java_com_google_cloud_dataflow_worker_profiler_Profiler_getAttribute; - Java_com_google_cloud_dataflow_worker_profiler_Profiler_registerAttribute; - Java_com_google_cloud_dataflow_worker_profiler_Profiler_setAttribute; + Java_org_apache_beam_runners_dataflow_worker_profiler_Profiler_disable; + Java_org_apache_beam_runners_dataflow_worker_profiler_Profiler_enable; + Java_org_apache_beam_runners_dataflow_worker_profiler_Profiler_getAttribute; + Java_org_apache_beam_runners_dataflow_worker_profiler_Profiler_registerAttribute; + Java_org_apache_beam_runners_dataflow_worker_profiler_Profiler_setAttribute; local: *; }; diff --git a/src/entry.cc b/src/entry.cc index 4d5eb8aa6..501a14da4 100644 --- a/src/entry.cc +++ b/src/entry.cc @@ -18,7 +18,9 @@ #include "src/string.h" #include "src/worker.h" +#include "third_party/javaprofiler/accessors.h" #include "third_party/javaprofiler/globals.h" +#include "third_party/javaprofiler/heap_sampler.h" #include "third_party/javaprofiler/stacktraces.h" DEFINE_bool(cprof_cpu_use_per_thread_timers, false, @@ -28,6 +30,10 @@ DEFINE_bool(cprof_force_debug_non_safepoints, true, "when true, force DebugNonSafepoints flag by subscribing to the" "code generation events. 
This improves the accuracy of profiles," "but may incur a bit of overhead."); +DEFINE_bool(cprof_enable_heap_sampling, false, + "when unset, heap allocation sampling is disabled"); +DEFINE_int32(cprof_heap_sampling_interval, 512 * 1024, + "sampling interval for heap allocation sampling, 512k by default"); namespace cloud { namespace profiler { @@ -111,6 +117,12 @@ void JNICALL OnVMInit(jvmtiEnv *jvmti, JNIEnv *jni_env, jthread thread) { jclass klass = class_list[i]; CreateJMethodIDsForClass(jvmti, klass); } + + if (FLAGS_cprof_enable_heap_sampling) { + google::javaprofiler::HeapMonitor::Enable( + jvmti, jni_env, FLAGS_cprof_heap_sampling_interval); + } + worker->Start(jni_env); } @@ -132,9 +144,13 @@ void JNICALL OnVMDeath(jvmtiEnv *jvmti_env, JNIEnv *jni_env) { worker->Stop(); delete worker; worker = NULL; + + if (google::javaprofiler::HeapMonitor::Enabled()) { + google::javaprofiler::HeapMonitor::Disable(); + } } -static bool PrepareJvmti(jvmtiEnv *jvmti) { +static bool PrepareJvmti(JavaVM *vm, jvmtiEnv *jvmti) { LOG(INFO) << "Prepare JVMTI"; // Set the list of permissions to do the various internal VM things @@ -175,20 +191,23 @@ static bool PrepareJvmti(jvmtiEnv *jvmti) { return false; } } + return true; } static bool RegisterJvmti(jvmtiEnv *jvmti) { // Create the list of callbacks to be called on given events. 
- jvmtiEventCallbacks *callbacks = new jvmtiEventCallbacks(); - memset(callbacks, 0, sizeof(jvmtiEventCallbacks)); + jvmtiEventCallbacks callbacks; + memset(&callbacks, 0, sizeof(jvmtiEventCallbacks)); - callbacks->ThreadStart = &OnThreadStart; - callbacks->ThreadEnd = &OnThreadEnd; - callbacks->VMInit = &OnVMInit; - callbacks->VMDeath = &OnVMDeath; - callbacks->ClassLoad = &OnClassLoad; - callbacks->ClassPrepare = &OnClassPrepare; + callbacks.ThreadStart = &OnThreadStart; + callbacks.ThreadEnd = &OnThreadEnd; + callbacks.VMInit = &OnVMInit; + callbacks.VMDeath = &OnVMDeath; + callbacks.ClassLoad = &OnClassLoad; + callbacks.ClassPrepare = &OnClassPrepare; + + google::javaprofiler::HeapMonitor::AddCallback(&callbacks); std::vector events = { JVMTI_EVENT_CLASS_LOAD, JVMTI_EVENT_CLASS_PREPARE, @@ -197,12 +216,12 @@ static bool RegisterJvmti(jvmtiEnv *jvmti) { }; if (FLAGS_cprof_force_debug_non_safepoints) { - callbacks->CompiledMethodLoad = &OnCompiledMethodLoad; + callbacks.CompiledMethodLoad = &OnCompiledMethodLoad; events.push_back(JVMTI_EVENT_COMPILED_METHOD_LOAD); } JVMTI_ERROR_1( - (jvmti->SetEventCallbacks(callbacks, sizeof(jvmtiEventCallbacks))), + (jvmti->SetEventCallbacks(&callbacks, sizeof(jvmtiEventCallbacks))), false); // Enable the callbacks to be triggered when the events occur. @@ -248,16 +267,21 @@ jint JNICALL Agent_OnLoad(JavaVM *vm, char *options, void *reserved) { ParseArguments(options); // Initializes logger -- do not log before this call LOG(INFO) << "Profiler agent loaded"; - google::javaprofiler::Accessors::Init(); google::javaprofiler::AttributeTable::Init(); - if ((err = (vm->GetEnv(reinterpret_cast(&jvmti), JVMTI_VERSION))) != - JNI_OK) { + // Try to get the latest JVMTI_VERSION the agent was built with. + err = vm->GetEnv(reinterpret_cast(&jvmti), JVMTI_VERSION); + if (err == JNI_EVERSION) { + // The above call can fail if the VM is actually from an older VM, therefore + // try to get an older JVMTI (compatible with JDK8). 
+ err = (vm->GetEnv(reinterpret_cast(&jvmti), JVMTI_VERSION_1_2)); + } + if (err != JNI_OK) { LOG(ERROR) << "JNI Error " << err; return 1; } - if (!PrepareJvmti(jvmti)) { + if (!PrepareJvmti(vm, jvmti)) { LOG(ERROR) << "Failed to initialize JVMTI. Continuing..."; return 0; } @@ -285,7 +309,6 @@ jint JNICALL Agent_OnLoad(JavaVM *vm, char *options, void *reserved) { void JNICALL Agent_OnUnload(JavaVM *vm) { IMPLICITLY_USE(vm); - google::javaprofiler::Accessors::Destroy(); } } // namespace profiler diff --git a/src/globals.h b/src/globals.h index 515f1b738..764a61bc3 100644 --- a/src/globals.h +++ b/src/globals.h @@ -26,6 +26,7 @@ #include "third_party/javaprofiler/jvmti_error.h" #include "third_party/javaprofiler/stacktraces.h" + #include #include diff --git a/src/http.cc b/src/http.cc index 444559b0f..42be74038 100644 --- a/src/http.cc +++ b/src/http.cc @@ -58,6 +58,7 @@ void HTTPRequest::SetTimeout(int timeout_sec) { bool HTTPRequest::DoGet(const string& url, string *data) { data->clear(); + curl_easy_setopt(curl_, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_1); curl_easy_setopt(curl_, CURLOPT_WRITEDATA, data); curl_easy_setopt(curl_, CURLOPT_WRITEFUNCTION, ResponseCallback); return DoRequest(url); diff --git a/src/config_dataflow_jni.cc b/src/jni.cc similarity index 68% rename from src/config_dataflow_jni.cc rename to src/jni.cc index c9fb1a923..a85bdece0 100644 --- a/src/config_dataflow_jni.cc +++ b/src/jni.cc @@ -15,24 +15,23 @@ #include #include "src/worker.h" +#include "third_party/javaprofiler/accessors.h" #include "third_party/javaprofiler/stacktraces.h" -extern "C" AGENTEXPORT -void JNICALL -Java_com_google_cloud_dataflow_worker_profiler_Profiler_enable( - JNIEnv *, jclass) { +extern "C" AGENTEXPORT void JNICALL +Java_org_apache_beam_runners_dataflow_worker_profiler_Profiler_enable(JNIEnv *, + jclass) { cloud::profiler::Worker::EnableProfiling(); } -extern "C" AGENTEXPORT -void JNICALL -Java_com_google_cloud_dataflow_worker_profiler_Profiler_disable( - JNIEnv 
*, jclass) { +extern "C" AGENTEXPORT void JNICALL +Java_org_apache_beam_runners_dataflow_worker_profiler_Profiler_disable(JNIEnv *, + jclass) { cloud::profiler::Worker::DisableProfiling(); } extern "C" AGENTEXPORT jint JNICALL -Java_com_google_cloud_dataflow_worker_profiler_Profiler_registerAttribute( +Java_org_apache_beam_runners_dataflow_worker_profiler_Profiler_registerAttribute( JNIEnv *env, jclass, jstring value) { const char *value_utf = env->GetStringUTFChars(value, nullptr); int ret = google::javaprofiler::AttributeTable::RegisterString(value_utf); @@ -41,7 +40,7 @@ Java_com_google_cloud_dataflow_worker_profiler_Profiler_registerAttribute( } extern "C" AGENTEXPORT jint JNICALL -Java_com_google_cloud_dataflow_worker_profiler_Profiler_setAttribute( +Java_org_apache_beam_runners_dataflow_worker_profiler_Profiler_setAttribute( JNIEnv *env, jclass, jint attr) { int64_t ret = google::javaprofiler::Accessors::GetAttribute(); google::javaprofiler::Accessors::SetAttribute(static_cast(attr)); @@ -49,7 +48,7 @@ Java_com_google_cloud_dataflow_worker_profiler_Profiler_setAttribute( } extern "C" AGENTEXPORT jint JNICALL -Java_com_google_cloud_dataflow_worker_profiler_Profiler_getAttribute( +Java_org_apache_beam_runners_dataflow_worker_profiler_Profiler_getAttribute( JNIEnv *env, jclass) { int64_t ret = google::javaprofiler::Accessors::GetAttribute(); return static_cast(ret); diff --git a/src/pem_roots.cc b/src/pem_roots.cc index 7a2cb8c11..2225a1c91 100644 --- a/src/pem_roots.cc +++ b/src/pem_roots.cc @@ -973,29 +973,6 @@ JJUEeKgDu+6B5dpffItKoZB0JaezPkvILFa9x8jvOOJckvB595yEunQtYQEgfn7R # Note: "GlobalSign Root CA - R7" not added on purpose. It is P-521. 
-# Operating CA: GlobalSign -# Issuer: C=BE, O=GlobalSign nv-sa, OU=Root CA, CN=GlobalSign Root CA - R8 -# Subject: C=BE, O=GlobalSign nv-sa, OU=Root CA, CN=GlobalSign Root CA - R8 -# Label: "GlobalSign Root CA - R8" -# Serial: 1462505469299036457243287072048861 -# MD5 Fingerprint: 26:15:db:de:38:b4:45:5e:19:3f:1b:57:af:53:2b:36 -# SHA1 Fingerprint: 62:01:ff:ce:4f:09:cd:c7:e0:2f:e1:10:f4:fd:67:f0:37:1a:2f:2a -# SHA256 Fingerprint: ae:48:51:ff:42:03:9b:ad:e0:58:27:91:51:d8:26:83:04:1d:25:98:e2:40:68:3c:c5:6d:76:fb:8c:f5:3d:42 ------BEGIN CERTIFICATE----- -MIICMzCCAbmgAwIBAgIOSBtqCfT5YHE6/oHMht0wCgYIKoZIzj0EAwMwXDELMAkG -A1UEBhMCQkUxGTAXBgNVBAoTEEdsb2JhbFNpZ24gbnYtc2ExEDAOBgNVBAsTB1Jv -b3QgQ0ExIDAeBgNVBAMTF0dsb2JhbFNpZ24gUm9vdCBDQSAtIFI4MB4XDTE2MDYx -NTAwMDAwMFoXDTM2MDYxNTAwMDAwMFowXDELMAkGA1UEBhMCQkUxGTAXBgNVBAoT -EEdsb2JhbFNpZ24gbnYtc2ExEDAOBgNVBAsTB1Jvb3QgQ0ExIDAeBgNVBAMTF0ds -b2JhbFNpZ24gUm9vdCBDQSAtIFI4MHYwEAYHKoZIzj0CAQYFK4EEACIDYgAEuO58 -MIfYlB9Ua22Ynfx1+1uIq0K6jX05ft1EPTk84QWhSmRgrDemc7D5yUVLCwbQOuDx -bV/6XltaUrV240bb1R6MdHpCyUE1T8bU4ihgqzSKzrFAI0alrhkkUnyQVUTOo0Iw -QDAOBgNVHQ8BAf8EBAMCAQYwDwYDVR0TAQH/BAUwAwEB/zAdBgNVHQ4EFgQULzoS -JoDoisJQeG0GxDR+4kk5V3YwCgYIKoZIzj0EAwMDaAAwZQIxAMehPbKSkPrKXeAn -hII7Icz0jfiUVvIgXxHArLxfFaULyBZDp/jFf40goH9e/BYcJwIwHoz1Vr8425zm -pteEKebfDVMu6CsBt30JPLEyahqauArq6K0I8nQ51SsiNtzvRmbY ------END CERTIFICATE----- - # Operating CA: GoDaddy # Issuer: CN=Go Daddy Root Certificate Authority - G2 O=GoDaddy.com, Inc. # Subject: CN=Go Daddy Root Certificate Authority - G2 O=GoDaddy.com, Inc. 
diff --git a/src/profiler.cc b/src/profiler.cc index df91023e5..90dde14a6 100644 --- a/src/profiler.cc +++ b/src/profiler.cc @@ -25,6 +25,7 @@ #include "src/clock.h" #include "src/globals.h" #include "src/proto.h" +#include "third_party/javaprofiler/accessors.h" DEFINE_int32(cprof_wall_num_threads_cutoff, 4096, "Do not take wall profiles if more than this # of threads exist."); @@ -37,6 +38,8 @@ DEFINE_bool(cprof_record_native_stack, false, namespace cloud { namespace profiler { +using google::javaprofiler::AlmostThere; + google::javaprofiler::AsyncSafeTraceMultiset *Profiler::fixed_traces_ = nullptr; std::atomic Profiler::unknown_stack_count_; @@ -151,8 +154,7 @@ bool SignalHandler::SetSigprofInterval(int64_t period_usec) { timer.it_interval.tv_usec = period_usec; timer.it_value = timer.it_interval; if (setitimer(ITIMER_PROF, &timer, 0) == -1) { - fprintf(stderr, "Scheduling profiler interval failed with error %d\n", - errno); + LOG(ERROR) << "Scheduling profiler interval failed with error " << errno; return false; } return true; @@ -198,24 +200,12 @@ void Profiler::Reset() { } string Profiler::SerializeProfile( - const google::javaprofiler::NativeProcessInfo &native_info) { + JNIEnv *jni, const google::javaprofiler::NativeProcessInfo &native_info) { return SerializeAndClearJavaCpuTraces( - jvmti_, native_info, ProfileType(), duration_nanos_, period_nanos_, + jni, jvmti_, native_info, ProfileType(), duration_nanos_, period_nanos_, &aggregated_traces_, unknown_stack_count_); } -bool AlmostThere(const struct timespec &finish, const struct timespec &lap) { - // Determine if there is time for another lap before reaching the - // finish line. Have a margin of multiple laps to ensure we do not - // overrun the finish line. 
- int64_t margin_laps = 2; - - struct timespec now = DefaultClock()->Now(); - struct timespec laps = {lap.tv_sec * margin_laps, lap.tv_nsec * margin_laps}; - - return TimeLessThan(finish, TimeAdd(now, laps)); -} - bool CPUProfiler::Collect() { Reset(); @@ -231,7 +221,7 @@ bool CPUProfiler::Collect() { // Sleep until finish_line, but wakeup periodically to flush the // internal tables. - while (!AlmostThere(finish_line, flush_interval)) { + while (!AlmostThere(clock, finish_line, flush_interval)) { clock->SleepFor(flush_interval); Flush(); } diff --git a/src/profiler.h b/src/profiler.h index 78fdd1901..c76640094 100644 --- a/src/profiler.h +++ b/src/profiler.h @@ -57,7 +57,7 @@ class Profiler { // Serialize the collected traces into a compressed serialized profile.proto string SerializeProfile( - const google::javaprofiler::NativeProcessInfo &native_info); + JNIEnv *jni, const google::javaprofiler::NativeProcessInfo &native_info); // Signal handler, which records the current stack trace into the profile. 
static void Handle(int signum, siginfo_t *info, void *context); diff --git a/src/proto.cc b/src/proto.cc index 2b7d3d25b..3cd1bc1ff 100644 --- a/src/proto.cc +++ b/src/proto.cc @@ -41,7 +41,7 @@ class ProfileProtoBuilder { } // Populate the profile with a set of traces - void Populate(const char *profile_type, + void Populate(JNIEnv *jni, const char *profile_type, const google::javaprofiler::TraceMultiset &traces, int64_t duration_ns, int64_t period_ns); void AddArtificialSample(const string &name, int64_t count, int64_t weight); @@ -60,7 +60,8 @@ class ProfileProtoBuilder { private: void AddSample(const std::vector &locations, int64_t count, int64_t weight, int64_t attr); - uint64_t LocationID(const google::javaprofiler::JVMPI_CallFrame &frame); + uint64_t LocationID(JNIEnv *jni, + const google::javaprofiler::JVMPI_CallFrame &frame); uint64_t LocationID(uint64_t address); uint64_t LocationID(const string &name); uint64_t LocationID(const string &class_name, const string &method_name, @@ -130,7 +131,7 @@ int64_t ProfileProtoBuilder::TotalCount() const { return total_count_; } int64_t ProfileProtoBuilder::TotalWeight() const { return total_weight_; } uint64_t ProfileProtoBuilder::LocationID( - const google::javaprofiler::JVMPI_CallFrame &frame) { + JNIEnv *jni, const google::javaprofiler::JVMPI_CallFrame &frame) { if (frame.lineno == google::javaprofiler::kNativeFrameLineNum) { return LocationID(reinterpret_cast(frame.method_id)); } @@ -142,7 +143,7 @@ uint64_t ProfileProtoBuilder::LocationID( string method_name, class_name, file_name, signature; int line_number = 0; - google::javaprofiler::GetStackFrameElements(jvmti_, frame, &file_name, + google::javaprofiler::GetStackFrameElements(jni, jvmti_, frame, &file_name, &class_name, &method_name, &signature, &line_number); google::javaprofiler::FixMethodParameters(&signature); @@ -211,8 +212,9 @@ uint64_t ProfileProtoBuilder::LocationID(const string &class_name, } void ProfileProtoBuilder::Populate( - const char 
*profile_type, const google::javaprofiler::TraceMultiset &traces, - int64_t duration_ns, int64_t period_ns) { + JNIEnv *jni, const char *profile_type, + const google::javaprofiler::TraceMultiset &traces, int64_t duration_ns, + int64_t period_ns) { perftools::profiles::Profile *profile = builder_.mutable_profile(); profile->mutable_period_type()->set_type(builder_.StringId(profile_type)); @@ -233,7 +235,7 @@ void ProfileProtoBuilder::Populate( if (count != 0) { std::vector locations; for (const auto &frame : trace.first.frames) { - locations.push_back(LocationID(frame)); + locations.push_back(LocationID(jni, frame)); } AddSample(locations, count, count * period_ns, trace.first.attr); } @@ -271,11 +273,12 @@ void ProfileProtoBuilder::AddSample(const std::vector &locations, } string SerializeAndClearJavaCpuTraces( - jvmtiEnv *jvmti, const google::javaprofiler::NativeProcessInfo &native_info, + JNIEnv *env, jvmtiEnv *jvmti, + const google::javaprofiler::NativeProcessInfo &native_info, const char *profile_type, int64_t duration_ns, int64_t period_ns, google::javaprofiler::TraceMultiset *traces, int64_t unknown_count) { ProfileProtoBuilder b(jvmti, native_info); - b.Populate(profile_type, *traces, duration_ns, period_ns); + b.Populate(env, profile_type, *traces, duration_ns, period_ns); b.AddArtificialSample("[Unknown]", unknown_count, unknown_count * period_ns); LOG(INFO) << "Collected a profile: total count=" << b.TotalCount() << ", weight=" << b.TotalWeight(); diff --git a/src/proto.h b/src/proto.h index 1a0a0f589..f760de48b 100644 --- a/src/proto.h +++ b/src/proto.h @@ -27,7 +27,8 @@ namespace profiler { // from a collection of java stack traces, symbolized using the jvmti. // Data in traces will be cleared. 
string SerializeAndClearJavaCpuTraces( - jvmtiEnv *jvmti, const google::javaprofiler::NativeProcessInfo &native_info, + JNIEnv *jni, jvmtiEnv *jvmti, + const google::javaprofiler::NativeProcessInfo &native_info, const char *profile_type, int64_t duration_nanos, int64_t period_nanos, google::javaprofiler::TraceMultiset *traces, int64_t unknown_count); diff --git a/src/throttler.h b/src/throttler.h index 40efe2d1c..816991cdd 100644 --- a/src/throttler.h +++ b/src/throttler.h @@ -27,6 +27,7 @@ namespace profiler { // Supported profile types. constexpr char kTypeCPU[] = "cpu"; constexpr char kTypeWall[] = "wall"; +constexpr char kTypeHeap[] = "heap"; // Iterator-like abstraction used to guide a profiling loop comprising of // waiting for when the next profile may be collected and saving its data once @@ -60,6 +61,10 @@ class Throttler { // Upload the compressed profile proto bytes. Returns false on error. virtual bool Upload(string profile) = 0; + + // Closes the throttler by trying to cancel WaitNext() / Upload() in flight. + // Those calls may return cancellation error. This method is thread-safe. + virtual void Close() = 0; }; } // namespace profiler diff --git a/src/throttler_api.cc b/src/throttler_api.cc index fdd351ccb..538f40e3b 100644 --- a/src/throttler_api.cc +++ b/src/throttler_api.cc @@ -33,6 +33,8 @@ #include "grpcpp/security/credentials.h" #include "grpcpp/support/channel_arguments.h" +#include "third_party/javaprofiler/heap_sampler.h" + // API curated profiling configuration. 
DEFINE_string(cprof_api_address, "cloudprofiler.googleapis.com", "API server address"); @@ -165,7 +167,13 @@ bool InitializeDeployment(CloudEnv* env, const string& labels, string service = env->Service(); if (service.empty()) { - LOG(ERROR) << "Deployment service is not configured"; + LOG(ERROR) << "Deployment service name is not configured"; + return false; + } + if (!IsValidServiceName(service)) { + LOG(ERROR) + << "Deployment service name '" << service + << "' does not match pattern '^[a-z]([-a-z0-9_.]{0,253}[a-z0-9])?$'"; return false; } d->set_target(service); @@ -190,6 +198,11 @@ bool InitializeDeployment(CloudEnv* env, const string& labels, for (const auto& kv : label_kvs) { (*d->mutable_labels())[kv.first] = kv.second; } + + LOG(INFO) << "Initialized deployment: project_id=" << project_id + << ", service=" << service + << ", service_version=" << service_version + << ", zone_name=" << zone_name; return true; } @@ -208,6 +221,29 @@ bool AddProfileLabels(api::Profile* p, const string& labels) { } // namespace +// Returns true if the service name matches the regex +// "^[a-z]([-a-z0-9_.]{0,253}[a-z0-9])?$", and false otherwise. +bool IsValidServiceName(string s) { + if (s.length() < 1 || s.length() > 255) { + return false; + } + if (s[0] < 'a' || s[0] > 'z') { + return false; + } + if ((s.back() < 'a' || s.back() > 'z') && + (s.back() < '0' || s.back() > '9')) { + return false; + } + for (size_t i = 1; i < s.length() - 1; ++i) { + char c = s[i]; + if ((c < 'a' || c > 'z') && (c < '0' || c > '9') && c != '.' 
&& c != '-' && + c != '_') { + return false; + } + } + return true; +} + APIThrottler::APIThrottler() : APIThrottler(DefaultCloudEnv(), DefaultClock(), nullptr) {} @@ -220,7 +256,8 @@ APIThrottler::APIThrottler( clock_(clock), stub_(std::move(stub)), types_({api::CPU, api::WALL}), - creation_backoff_envelope_ns_(kBackoffNanos) { + creation_backoff_envelope_ns_(kBackoffNanos), + closed_(false) { grpc_init(); gpr_set_log_function(GRPCLog); @@ -233,6 +270,11 @@ APIThrottler::APIThrottler( << " to create and upload profiles"; stub_ = NewProfilerServiceStub(FLAGS_cprof_api_address); } + + if (google::javaprofiler::HeapMonitor::Enabled()) { + LOG(INFO) << "Heap allocation sampling supported for this JDK"; + types_.push_back(api::HEAP); + } } void APIThrottler::SetProfileTypes(const std::vector& types) { @@ -254,13 +296,14 @@ bool APIThrottler::WaitNext() { LOG(ERROR) << "Failed to initialize deployment, stop profiling"; return false; } + req.set_parent("projects/" + req.deployment().project_id()); while (true) { LOG(INFO) << "Creating a new profile via profiler service"; - grpc::ClientContext ctx; profile_.Clear(); - grpc::Status st = stub_->CreateProfile(&ctx, req, &profile_); + ResetClientContext(); + grpc::Status st = stub_->CreateProfile(ctx_.get(), req, &profile_); if (st.ok()) { LOG(INFO) << "Profile created: " << ProfileType() << " " << profile_.name(); @@ -268,7 +311,10 @@ bool APIThrottler::WaitNext() { creation_backoff_envelope_ns_ = kBackoffNanos; break; } - OnCreationError(ctx, st); + if (closed_) { + return false; + } + OnCreationError(st); } return true; @@ -281,6 +327,8 @@ string APIThrottler::ProfileType() { return kTypeCPU; case api::WALL: return kTypeWall; + case api::HEAP: + return kTypeHeap; default: const string& pt_name = api::ProfileType_Name(pt); LOG(ERROR) << "Unsupported profile type " << pt_name; @@ -297,8 +345,6 @@ bool APIThrottler::Upload(string profile) { LOG(INFO) << "Uploading " << profile.size() << " bytes of '" << ProfileType() << "' 
profile data"; - grpc::ClientContext ctx; - if (!AddProfileLabels(&profile_, FLAGS_cprof_profile_labels)) { LOG(ERROR) << "Failed to add profile labels, won't upload the profile"; return false; @@ -308,7 +354,8 @@ bool APIThrottler::Upload(string profile) { *req.mutable_profile() = profile_; req.mutable_profile()->set_profile_bytes(std::move(profile)); - grpc::Status st = stub_->UpdateProfile(&ctx, req, &profile_); + ResetClientContext(); + grpc::Status st = stub_->UpdateProfile(ctx_.get(), req, &profile_); if (!st.ok()) { // TODO: Recognize and retry transient errors. @@ -319,11 +366,10 @@ bool APIThrottler::Upload(string profile) { return true; } -void APIThrottler::OnCreationError(const grpc::ClientContext& ctx, - const grpc::Status& st) { +void APIThrottler::OnCreationError(const grpc::Status& st) { if (st.error_code() == grpc::StatusCode::ABORTED) { int64_t backoff_ns; - if (AbortedBackoffDuration(ctx, &backoff_ns)) { + if (AbortedBackoffDuration(*ctx_, &backoff_ns)) { if (backoff_ns > 0) { LOG(INFO) << "Got ABORTED, will retry after backing off for " << backoff_ns / kNanosPerMilli << "ms"; @@ -343,5 +389,19 @@ void APIThrottler::OnCreationError(const grpc::ClientContext& ctx, kMaxBackoffNanos); } +void APIThrottler::ResetClientContext() { + std::lock_guard lock(ctx_mutex_); + ctx_.reset(new grpc::ClientContext()); // NOLINT + if (closed_) { + ctx_->TryCancel(); + } +} + +void APIThrottler::Close() { + std::lock_guard lock(ctx_mutex_); + closed_ = true; + ctx_->TryCancel(); +} + } // namespace profiler } // namespace cloud diff --git a/src/throttler_api.h b/src/throttler_api.h index 2631734f7..868feb618 100644 --- a/src/throttler_api.h +++ b/src/throttler_api.h @@ -17,7 +17,9 @@ #ifndef CLOUD_PROFILER_AGENT_JAVA_THROTTLER_API_H_ #define CLOUD_PROFILER_AGENT_JAVA_THROTTLER_API_H_ +#include #include +#include // NOLINT #include #include @@ -25,7 +27,6 @@ #include "src/cloud_env.h" #include "src/throttler.h" #include 
"google/devtools/cloudprofiler/v2/profiler.grpc.pb.h" - #include "grpcpp/client_context.h" #include "grpcpp/support/status.h" @@ -52,12 +53,16 @@ class APIThrottler : public Throttler { string ProfileType() override; int64_t DurationNanos() override; bool Upload(string profile) override; + void Close() override; private: // Takes a backoff on profile creation error. The backoff duration // may be specified by the server. Otherwise it will be a randomized // exponentially increasing value, bounded by kMaxBackoffNanos. - void OnCreationError(const grpc::ClientContext& ctx, const grpc::Status& st); + void OnCreationError(const grpc::Status& st); + + // Resets the client gRPC context for the next call. + void ResetClientContext(); private: CloudEnv* env_; @@ -72,8 +77,18 @@ class APIThrottler : public Throttler { int64_t creation_backoff_envelope_ns_; std::default_random_engine gen_; std::uniform_int_distribution dist_; + + // The throttler is closing, cancel ongoing and future requests. + std::atomic closed_; + std::mutex ctx_mutex_; + std::unique_ptr ctx_; }; +// Returns true if the service name matches the regex +// "^[a-z]([-a-z0-9_.]{0,253}[a-z0-9])?$", and false otherwise. +// Public for testing. +bool IsValidServiceName(string service); + } // namespace profiler } // namespace cloud diff --git a/src/throttler_timed.cc b/src/throttler_timed.cc index f4ca1c59e..0dd646a07 100644 --- a/src/throttler_timed.cc +++ b/src/throttler_timed.cc @@ -30,14 +30,19 @@ namespace profiler { namespace { -const int64_t kRandomRange = 65536; +const int64_t kRandomRange = 100000; // Gets the sampling configuration from the flags. -int64_t GetConfiguration(int64_t *duration_cpu_ns, int64_t *duration_wall_ns) { +int64_t GetConfiguration(int64_t* duration_cpu_ns, int64_t* duration_wall_ns, + bool* enable_heap) { int64_t duration_ns = FLAGS_cprof_duration_sec * kNanosPerSecond; *duration_cpu_ns = 0; *duration_wall_ns = 0; + + // Currently heap is always disabled if not forced explictly. 
+ *enable_heap = false; + if (FLAGS_cprof_force == "") { *duration_cpu_ns = duration_ns; *duration_wall_ns = duration_ns; @@ -45,6 +50,8 @@ int64_t GetConfiguration(int64_t *duration_cpu_ns, int64_t *duration_wall_ns) { *duration_cpu_ns = duration_ns; } else if (FLAGS_cprof_force == kTypeWall) { *duration_wall_ns = duration_ns; + } else if (FLAGS_cprof_force == kTypeHeap) { + *enable_heap = true; } else { LOG(ERROR) << "Unrecognized option cprof_force=" << FLAGS_cprof_force << ", profiling disabled"; @@ -83,13 +90,19 @@ TimedThrottler::TimedThrottler(const string& path) : TimedThrottler(UploaderFromFlags(path), DefaultClock(), false) {} TimedThrottler::TimedThrottler(std::unique_ptr uploader, - Clock* clock, bool fixed_seed) - : clock_(clock), profile_count_(), uploader_(std::move(uploader)) { - interval_ns_ = GetConfiguration(&duration_cpu_ns_, &duration_wall_ns_); + Clock* clock, bool no_randomize) + : clock_(clock), + closed_(false), + profile_count_(), + uploader_(std::move(uploader)) { + interval_ns_ = + GetConfiguration(&duration_cpu_ns_, &duration_wall_ns_, &enable_heap_); + LOG(INFO) << "sampling duration: cpu=" << duration_cpu_ns_ / kNanosPerSecond - << "s, wall=" << duration_wall_ns_ / kNanosPerSecond << "s"; + << "s, wall=" << duration_wall_ns_ / kNanosPerSecond; LOG(INFO) << "sampling interval: " << interval_ns_ / kNanosPerSecond << "s"; LOG(INFO) << "sampling delay: " << FLAGS_cprof_delay_sec << "s"; + LOG(INFO) << "heap sampling enabled: " << enable_heap_; struct timespec now = clock_->Now(); @@ -100,19 +113,19 @@ TimedThrottler::TimedThrottler(std::unique_ptr uploader, next_interval_ = TimeAdd(next_interval_, delay_ts); } - // Create a random number generator, seeded on the microseconds from the - // current timer if not asked for fixed seed (which should only be used for - // determinism in tests). - gen_ = std::default_random_engine(fixed_seed ? 
10 : now.tv_nsec / 1000); - dist_ = std::uniform_int_distribution(0, kRandomRange); + // Create a random number generator, seeded on the current time. + gen_ = std::default_random_engine(now.tv_nsec / 1000); + dist_ = std::uniform_int_distribution( + no_randomize ? kRandomRange : 0, kRandomRange); // This will get popped on the first WaitNext() call. cur_.push_back({"", 0}); } bool TimedThrottler::WaitNext() { - if (!uploader_ || (duration_cpu_ns_ == 0 && duration_wall_ns_ == 0)) { - // Refuse profiling if both CPU and wall are disabled or no uploader. + if (!uploader_ || + (duration_cpu_ns_ == 0 && duration_wall_ns_ == 0 && !enable_heap_)) { + // Refuse profiling if CPU, wall, and heap are disabled or no uploader. LOG(WARNING) << "Profiling disabled"; return false; } @@ -139,6 +152,16 @@ bool TimedThrottler::WaitNext() { struct timespec profiling_start = TimeAdd(next_interval_, NanosToTimeSpec(wait_ns)); + + // Wait till the next profiling time polling for the cancellation. + const struct timespec poll_interval = {0, 500 * 1000 * 1000}; // 0.5s + while (!google::javaprofiler::AlmostThere(clock_, profiling_start, + poll_interval)) { + clock_->SleepFor(poll_interval); + if (closed_) { + return false; + } + } clock_->SleepUntil(profiling_start); next_interval_ = TimeAdd(next_interval_, NanosToTimeSpec(interval_ns_)); @@ -148,6 +171,9 @@ bool TimedThrottler::WaitNext() { if (duration_wall_ns_ > 0) { cur_.push_back({kTypeWall, duration_wall_ns_}); } + if (enable_heap_) { + cur_.push_back({kTypeHeap, 0}); + } // Randomize the profile type order. 
std::shuffle(cur_.begin(), cur_.end(), gen_); } @@ -170,5 +196,7 @@ bool TimedThrottler::Upload(string profile) { return uploader_->Upload(cur_.back().first, profile); } +void TimedThrottler::Close() { closed_ = true; } + } // namespace profiler } // namespace cloud diff --git a/src/throttler_timed.h b/src/throttler_timed.h index 7f6499ce1..5979b9e15 100644 --- a/src/throttler_timed.h +++ b/src/throttler_timed.h @@ -17,6 +17,7 @@ #ifndef CLOUD_PROFILER_AGENT_JAVA_THROTTLER_TIMED_H_ #define CLOUD_PROFILER_AGENT_JAVA_THROTTLER_TIMED_H_ +#include #include #include @@ -37,17 +38,22 @@ class TimedThrottler : public Throttler { // Testing-only constructor. TimedThrottler(std::unique_ptr uploader, Clock* clock, - bool fixed_seed); + bool no_randomize); bool WaitNext() override; string ProfileType() override; int64_t DurationNanos() override; bool Upload(string profile) override; + void Close() override; private: Clock* clock_; - int64_t duration_cpu_ns_, duration_wall_ns_; + int64_t duration_cpu_ns_; + int64_t duration_wall_ns_; + bool enable_heap_; int64_t interval_ns_; + // The throttler is closing, cancel ongoing and future requests. + std::atomic closed_; std::default_random_engine gen_; std::uniform_int_distribution dist_; diff --git a/src/worker.cc b/src/worker.cc index 01139b34b..ddd11d674 100644 --- a/src/worker.cc +++ b/src/worker.cc @@ -18,6 +18,7 @@ #include "src/profiler.h" #include "src/throttler_api.h" #include "src/throttler_timed.h" +#include "third_party/javaprofiler/heap_sampler.h" DEFINE_bool(cprof_enabled, true, "when unset, unconditionally disable the profiling"); @@ -43,6 +44,13 @@ void Worker::Start(JNIEnv *jni) { return; } + // Initialize the throttler here rather in the constructor, since the + // constructor is invoked too early, before the heap profiler is initialized. + throttler_ = FLAGS_cprof_profile_filename.empty() + ? 
std::unique_ptr(new APIThrottler()) + : std::unique_ptr( + new TimedThrottler(FLAGS_cprof_profile_filename)); + // Pass 'this' as the arg to access members from the worker thread. jvmtiError err = jvmti_->RunAgentThread(thread, ProfileThread, this, JVMTI_THREAD_MIN_PRIORITY); @@ -55,14 +63,16 @@ void Worker::Start(JNIEnv *jni) { } void Worker::Stop() { - // Signal the worker thread to exit and wait until it does. stopping_.store(true, std::memory_order_release); + // Close the throttler which will initiate cancellation of WaitNext / Upload. + throttler_->Close(); + // Wait till the worker thread is done. std::lock_guard lock(mutex_); } namespace { -string Collect(Profiler *p, +string Collect(Profiler *p, JNIEnv *env, google::javaprofiler::NativeProcessInfo *native_info) { const char *profile_type = p->ProfileType(); if (!p->Collect()) { @@ -70,31 +80,39 @@ string Collect(Profiler *p, return ""; } native_info->Refresh(); - return p->SerializeProfile(*native_info); + return p->SerializeProfile(env, *native_info); } +class JNILocalFrame { + public: + explicit JNILocalFrame(JNIEnv *jni_env) : jni_env_(jni_env) { + // Put 100, it doesn't really matter: new spec implementations don't care. + jni_env_->PushLocalFrame(100); + } + + ~JNILocalFrame() { jni_env_->PopLocalFrame(nullptr); } + + // Not copyable or movable. + JNILocalFrame(const JNILocalFrame &) = delete; + JNILocalFrame &operator=(const JNILocalFrame &) = delete; + + private: + JNIEnv *jni_env_; +}; + } // namespace -void Worker::EnableProfiling() { - enabled_ = true; -} +void Worker::EnableProfiling() { enabled_ = true; } -void Worker::DisableProfiling() { - enabled_ = false; -} +void Worker::DisableProfiling() { enabled_ = false; } void Worker::ProfileThread(jvmtiEnv *jvmti_env, JNIEnv *jni_env, void *arg) { Worker *w = static_cast(arg); - google::javaprofiler::NativeProcessInfo n("/proc/self/maps"); + std::lock_guard lock(w->mutex_); - std::unique_ptr t = - FLAGS_cprof_profile_filename.empty() - ? 
std::unique_ptr(new APIThrottler()) - : std::unique_ptr( - new TimedThrottler(FLAGS_cprof_profile_filename)); + google::javaprofiler::NativeProcessInfo n("/proc/self/maps"); - while (t->WaitNext()) { - std::lock_guard lock(w->mutex_); + while (w->throttler_->WaitNext()) { if (w->stopping_) { // The worker is exiting. break; @@ -103,18 +121,47 @@ void Worker::ProfileThread(jvmtiEnv *jvmti_env, JNIEnv *jni_env, void *arg) { // Skip the collection and upload steps when profiling is disabled. continue; } + + // There are a number of JVMTI functions the agent uses that return + // local references. Normally, local references are freed when a JNI + // call returns to Java. E.g. in the internal /profilez profiler + // those would get freed when the C++ code returns back to the request + // handler. But in case of the cloud agent the agent thread never exits + // and so the local references keep accumulating. Adding an explicit + // local frame around each profiling iteration fixes this. + // Note: normally the various JNIHandles are properly lifetime managed + // now (via b/133409114) and there should be no leaks; but leaving this in + // so that, if ever JNI handle leaks do happen again, this will release the + // handles automatically. + JNILocalFrame local_frame(jni_env); string profile; - string pt = t->ProfileType(); + string pt = w->throttler_->ProfileType(); if (pt == kTypeCPU) { - CPUProfiler p(w->jvmti_, w->threads_, t->DurationNanos(), + CPUProfiler p(w->jvmti_, w->threads_, w->throttler_->DurationNanos(), FLAGS_cprof_cpu_sampling_period_msec * kNanosPerMilli); - profile = Collect(&p, &n); + profile = Collect(&p, jni_env, &n); } else if (pt == kTypeWall) { // Note that the requested sampling period for the wall profiling may be // increased if the number of live threads is too large. 
- WallProfiler p(w->jvmti_, w->threads_, t->DurationNanos(), + WallProfiler p(w->jvmti_, w->threads_, w->throttler_->DurationNanos(), FLAGS_cprof_wall_sampling_period_msec * kNanosPerMilli); - profile = Collect(&p, &n); + profile = Collect(&p, jni_env, &n); + } else if (pt == kTypeHeap) { + if (!google::javaprofiler::HeapMonitor::Enabled()) { + LOG(WARNING) << "Asked for a heap sampler but it is disabled"; + continue; + } + + // Note: we do not force GC here, instead we rely on what was seen as + // still live at the last GC; this means that technically: + // - Some objects might be dead now. + // - Some other objects might be sampled but not show up yet. + // On the flip side, this allows the profile collection to not provoke a + // GC. + perftools::profiles::Builder::Marshal( + *google::javaprofiler::HeapMonitor::GetHeapProfiles( + jni_env, false /* force_gc */), + &profile); } else { LOG(ERROR) << "Unknown profile type '" << pt << "', skipping the upload"; continue; @@ -123,7 +170,7 @@ void Worker::ProfileThread(jvmtiEnv *jvmti_env, JNIEnv *jni_env, void *arg) { LOG(ERROR) << "No profile bytes collected, skipping the upload"; continue; } - if (!t->Upload(profile)) { + if (!w->throttler_->Upload(profile)) { LOG(ERROR) << "Error on profile upload, discarding the profile"; } } diff --git a/src/worker.h b/src/worker.h index e721b6bdc..82566ead9 100644 --- a/src/worker.h +++ b/src/worker.h @@ -43,6 +43,7 @@ class Worker { jvmtiEnv *jvmti_; ThreadTable *threads_; + std::unique_ptr throttler_; std::mutex mutex_; // Held by the worker thread while it's running. 
// Thread-local slots backing the static accessors declared in accessors.h.
// Zero-initialized per thread; tags_ stays nullptr until InitTags() runs.
__thread JNIEnv *Accessors::env_;
__thread int64 Accessors::attr_;
__thread Tags *Accessors::tags_;

// Allocates this thread's Tags storage. Must be called at most once per
// thread before any tag is set; asserts if the storage already exists.
void Accessors::InitTags() {
  assert(tags_ == nullptr);
  tags_ = new Tags();
}

// Frees this thread's Tags storage. After this call GetTags() falls back to
// the shared empty instance and GetMutableTags() returns nullptr.
void Accessors::DestroyTags() {
  const Tags *tags_tmp = tags_;
  tags_ = nullptr;
  // The thread-local tags_ is set to null before deallocation so a signal
  // handler interrupting this sequence never observes the destructed object.
  // The compiler barrier below makes sure that "delete tags_tmp" will not be
  // scheduled before "tags_ = nullptr".
  __asm__ __volatile__("" : : : "memory");
  delete tags_tmp;
}

// Returns a heap-allocated copy of this thread's tags, intended for hand-off
// to another thread, or nullptr when this thread has no tags storage.
Tags *Accessors::AllocateAndCopyTags() {
  return tags_ == nullptr ? nullptr : new Tags(*tags_);
}

// Moves *tags into this thread's storage and deletes the container.
// NOTE(review): when this thread has no tags storage (tags_ == nullptr) the
// payload is silently dropped; only the container is freed.
void Accessors::ApplyAndDeleteTags(Tags *tags) {
  if (tags_ != nullptr) {
    *tags_ = std::move(*tags);
  }
  delete tags;
}
If InitTags() is not called before, GetTags() returns a constant + // reference to an empty Tags instance and GetMutableTags() returns nullptr. + // For all Java threads, InitTags() is called inside the callback function of + // onThreadStart. However, non-Java threads do not trigger the callback, + // which leaves InitTags() not called. + static const Tags &GetTags() { + return tags_ == nullptr ? Tags::Empty() : *tags_; + } + static Tags *GetMutableTags() { return tags_; } + + // AllocateAndCopyTags() and ApplyAndDeleteTags() are used to propagate tags + // between threads. The usage should be like: + // + // Tags *tags_copy = AllocateAndCopyTags(); // Make a copy of tags storage of + // // current thread. + // ... // Pass tags_copy into another thread. + // ApplyAndDeleteTags(tags_copy); // Overwrite the tags storage of current + // //thread with the information stored in tags_copy. + // // If ApplyAndDeleteTags() is never called, users must explicitly reclaim + // // the memory reserved for tags_copy by "delete tags_copy". + + // Copies the thread-local tags. Returns the pointer of newly allocated Tags + // on success; otherwise, return nullptr. + static Tags *AllocateAndCopyTags(); + // Overrides the thread-local tags with the tags passed in and releases the + // memory pointed by tags. + static void ApplyAndDeleteTags(Tags *tags); + + template + static inline FunctionType GetJvmFunction(const char *function_name) { + // get handle to library + static void *handle = dlopen("libjvm.so", RTLD_LAZY); + if (handle == NULL) { + return NULL; + } + + // get address of function, return null if not found + return bit_cast(dlsym(handle, function_name)); + } + + private: + // This is dangerous. TLS accesses are by default not async safe, as + // they can call malloc for lazy initialization. The initial-exec + // TLS mode avoids this potential allocation, with the limitation + // that there is a fixed amount of space to hold all TLS variables + // referenced in the module. 
This should be OK for the cloud + // profiler agent, which is relatively small. We do provide a way + // to override the TLS model for compilation environments where the + // TLS access is async-safe. +#ifdef JAVAPROFILER_GLOBAL_DYNAMIC_TLS + static __thread JNIEnv *env_ __attribute__((tls_model("global-dynamic"))); + static __thread int64 attr_ __attribute__((tls_model("global-dynamic"))); + static __thread Tags *tags_ __attribute__((tls_model("global-dynamic"))); +#else + static __thread JNIEnv *env_ __attribute__((tls_model("initial-exec"))); + static __thread int64 attr_ __attribute__((tls_model("initial-exec"))); + static __thread Tags *tags_ __attribute__((tls_model("initial-exec"))); +#endif +}; + +} // namespace javaprofiler +} // namespace google +#endif // THIRD_PARTY_JAVAPROFILER_ACCESSORS_H_ diff --git a/third_party/javaprofiler/async_ref_counted_string.cc b/third_party/javaprofiler/async_ref_counted_string.cc new file mode 100644 index 000000000..9704f6a78 --- /dev/null +++ b/third_party/javaprofiler/async_ref_counted_string.cc @@ -0,0 +1,192 @@ +/* + * Copyright 2018 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "third_party/javaprofiler/async_ref_counted_string.h" + +#include // NOLINT +#include + +namespace google { +namespace javaprofiler { +namespace { + +using StringRefCountTable = std::unordered_map>; +using StringRefCount = StringRefCountTable::value_type; + +// Maps string to its reference count. 
+std::unordered_map> *string_table = nullptr; +// All accesses to string_table should be protected by string_table_mutex. +std::mutex string_table_mutex; + +// The following methods are used to transfer ownership of a string. +// To own a string, the first step is to resolve (or create if necessary) the +// corresponding StringRefCount and claim the ownership by incrementing the +// reference count with either of AcquireByXXX methods. Externally, the +// current AsyncRefCountedString still refers to the old StringRefCount. Next, +// switch ptr_ to the new StringRefCount atomically and release the resource of +// old StringRefCount by calling Release() (or calling AsyncSafeRelease(), but +// it may fail). If the reference count of old StringRefCount reaches 0, the +// string stored in the old StringRefCount is removed. The ownership +// transference is safe upon interrupts. In the signal handler, the current +// instance of AsyncRefCountedString refers to either the old StringRefCount or +// the new StringRefCount without ending up with corrupted data. + +// Resolves the StringRefCount of a given string, increments the reference count +// and returns the pointer of StringRefCount. It returns nullptr if the internal +// string table is not initialized. +StringRefCount *AcquireByString(const string &str) { + std::lock_guard lock(string_table_mutex); + if (string_table == nullptr) { + return nullptr; + } + // The following statement is equivalent to + // "const auto &it = string_table->emplace(str, 0).first". + // However, gcc-4.8 does not compile "emplace(str,0)" as it complains that the + // atomic copy constructor is deleted. 
Since gcc-4.8 is used in the docker + // container to generate the cloud Java agent, use the pair's piecewise + // constructor to make it compilable under gcc-4.8 + const auto &it = + string_table + ->emplace(std::piecewise_construct, std::forward_as_tuple(str), + std::forward_as_tuple(0)) + .first; + it->second++; + return &*it; +} + +// Increments the reference count of the given StringRefCount pointer if it is +// not null, and returns the StringRefCount pointer. It is used to copy the +// StringRefCount from another AsyncRefCountedString. +StringRefCount *AcquireByCopy(StringRefCount *str_ref_cnt) { + if (str_ref_cnt != nullptr) { + // As "other" already holds a valid StringRefCount, we can directly + // increment the reference count by 1. If "other" is owned by another + // thread, the user has to make sure that "other" is not under editing. + str_ref_cnt->second++; + } + return str_ref_cnt; +} + +// Renounces the ownership of a string represented by the given StringRefCount +// pointer. It is async-signal-safe. If the string is not referred by any other +// AsyncRefCountedString and needs removing from the string table, it fails by +// returning false. +bool AsyncSafeRelease(StringRefCount *str_ref_cnt) { + if (str_ref_cnt == nullptr) { + return true; + } + + int32_t ref_count = str_ref_cnt->second.load(); + while (ref_count > 1) { + if (str_ref_cnt->second.compare_exchange_weak(ref_count, ref_count - 1)) { + return true; + } + } + return false; +} + +// Renounces the ownership of a string represented by the given StringRefCount +// pointer. It firstly calls AsyncSafeRelease() to release the string. If +// AsyncSafeRelease(), it acquires the lock of the string table to make it +// succeed eventually. +void Release(StringRefCount *str_ref_cnt) { + if (!AsyncSafeRelease(str_ref_cnt)) { + std::lock_guard lock(string_table_mutex); + if (str_ref_cnt->second.fetch_sub(1) == 1) { + // The counter becomes zero and delete the entry. 
+ string_table->erase(str_ref_cnt->first); + } + } +} + +} // namespace + +AsyncRefCountedString::AsyncRefCountedString(const string &str) + : AsyncRefCountedString() { + Release(ptr_.exchange(AcquireByString(str))); +} + +AsyncRefCountedString::AsyncRefCountedString(const AsyncRefCountedString &other) + : AsyncRefCountedString() { + Release(ptr_.exchange(AcquireByCopy(other.ptr_.load()))); +} + +AsyncRefCountedString::~AsyncRefCountedString() { + Release(ptr_.exchange(nullptr)); +} + +AsyncRefCountedString &AsyncRefCountedString::operator=(const string &str) { + Release(ptr_.exchange(AcquireByString(str))); + return *this; +} + +AsyncRefCountedString &AsyncRefCountedString::operator=( + const AsyncRefCountedString &other) { + Release(ptr_.exchange(AcquireByCopy(other.ptr_.load()))); + return *this; +} + +AsyncRefCountedString &AsyncRefCountedString::operator=( + AsyncRefCountedString &&other) { + Reset(); + ptr_.store(other.ptr_); + other.ptr_ = nullptr; + return *this; +} + +AsyncRefCountedString &AsyncRefCountedString::AsyncSafeCopy( + const AsyncRefCountedString &other) { + assert(ptr_.load() == nullptr); + ptr_.exchange(AcquireByCopy(other.ptr_.load())); + return *this; +} + +void AsyncRefCountedString::Reset() { Release(ptr_.exchange(nullptr)); } + +void AsyncRefCountedString::AsyncSafeReset() { + assert(AsyncSafeRelease(ptr_.exchange(nullptr))); +} + +const string *AsyncRefCountedString::Get() const { + StringRefCount *str_ref_cnt = ptr_.load(); + if (str_ref_cnt == nullptr) { + return nullptr; + } else { + return &(str_ref_cnt->first); + } +} + +bool AsyncRefCountedString::Init() { + std::lock_guard lock(string_table_mutex); + if (string_table == nullptr) { + string_table = new std::unordered_map>(); + return true; + } + return false; +} + +bool AsyncRefCountedString::Destroy() { + std::lock_guard lock(string_table_mutex); + if (string_table == nullptr || !string_table->empty()) { + return false; + } + delete string_table; + string_table = nullptr; + return 
true; +} + +} // namespace javaprofiler +} // namespace google diff --git a/third_party/javaprofiler/async_ref_counted_string.h b/third_party/javaprofiler/async_ref_counted_string.h new file mode 100644 index 000000000..276ccb88a --- /dev/null +++ b/third_party/javaprofiler/async_ref_counted_string.h @@ -0,0 +1,112 @@ +/* + * Copyright 2018 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef THIRD_PARTY_JAVAPROFILER_ASYNC_REF_COUNTED_STRING_H_ +#define THIRD_PARTY_JAVAPROFILER_ASYNC_REF_COUNTED_STRING_H_ + +#include + +#include "third_party/javaprofiler/globals.h" + +namespace google { +namespace javaprofiler { + +// A string wrapper which uses string interning to store only one copy of a +// string. +// Init() must be called before any usage and the internal storage allocated by +// Init() will intentionally leak. +// The methods that are safe to call in a signal handler, are explicitly marked +// with "async-signal-safe". +// An underlying string table helps maintain only one copy per string to reduce +// memory overhead and speed up equality checking. Different +// AsyncRefCountedString instances owned by different threads do not need extra +// synchronization even if they refer to the same string. However, users have to +// provide their own synchronization to access the same AsyncRefCountedString +// instance from different threads. 
String interning cannot be achieved only by +// std::shared_ptr since an additional associative array is needed to +// look up a string's intern. Combining a hashtable, std::shared_ptr can +// only de-allocate itself if not needed and will not remove it from the +// hashtable. Furthermore, a sampling interrupt can happen when +// std::shared_ptr is being copied and destructed, and the interrupt +// routine may see corrupted data. In other words, std::shared_ptr is +// not async-signal-safe and thus not used. +class AsyncRefCountedString { + public: + AsyncRefCountedString() : ptr_(nullptr) {} + explicit AsyncRefCountedString(const string &str); + AsyncRefCountedString(const AsyncRefCountedString &other); + + ~AsyncRefCountedString(); + + AsyncRefCountedString &operator=(const string &str); + AsyncRefCountedString &operator=(const AsyncRefCountedString &other); + AsyncRefCountedString &operator=(AsyncRefCountedString &&other); + // Async-signal-safe version of operator=(const AsyncRefCountedString &other). + // It requires that the instance refers to nullptr (does not contain any + // string). Otherwise, it asserts an error. + AsyncRefCountedString &AsyncSafeCopy(const AsyncRefCountedString &other); + // Tests whether the referred string equals the one in another + // AsyncRefCountedString. As a specific string only has one internal copy, the + // stored string address is directly used for comparison without checking the + // string content. Async-signal-safe. + bool operator==(const AsyncRefCountedString &other) const { + return ptr_.load() == other.ptr_.load(); + } + // Async-signal-safe. + bool operator!=(const AsyncRefCountedString &other) const { + return !(*this == other); + } + + // Renounces the ownership of currently referred string (if it currently owns + // one) and refers to nullptr. + void Reset(); + // Async-signal-safe version of Reset(). 
It will succeed under the condition + // that there is at least one AsyncRefCountedString still refers to the string + // that is released by this call. Otherwise, it asserts an error. + void AsyncSafeReset(); + + // Returns the string pointer it refers. + // Returns nullptr when it does not refer to any string. + // The returned string pointer is valid as long as the AsyncRefCountedString + // instance stay unchanged and it is async-signal-safe. + const string *Get() const; + + // Returns the hash value. As a specific string only has one internal copy, + // the stored string address is directly used as the hash value. + uint64 Hash() const { return reinterpret_cast(Get()); } + + // Initializes the internal string storage. Must be called before using + // AsyncRefCountedString to store any string. Should only be called once, + // subsequent calls have no effect and return false. Destroy() should be + // called to free the storage. + static bool Init(); + + // Frees the internal string storage. Must be called after all outstanding + // AsyncRefCountedString objects are gone. No string can be stored after + // Destroy() is called. Returns false if the storage is not currently + // allocated or if there are known outstanding references to strings. + static bool Destroy(); + + private: + // The value of ptr_ requires to be changed atomically so that + // its value will not get corrupted upon interrupts. 
+ std::atomic> *> ptr_; +}; + +} // namespace javaprofiler +} // namespace google + +#endif // THIRD_PARTY_JAVAPROFILER_ASYNC_REF_COUNTED_STRING_H_ diff --git a/third_party/javaprofiler/clock.cc b/third_party/javaprofiler/clock.cc index b6025971a..d6fe5718a 100644 --- a/third_party/javaprofiler/clock.cc +++ b/third_party/javaprofiler/clock.cc @@ -21,6 +21,16 @@ namespace { Clock DefaultClockInstance; } +bool AlmostThere(Clock* clock, const struct timespec& finish, + const struct timespec& lap) { + const int64_t kMarginLaps = 2; + + struct timespec now = clock->Now(); + struct timespec laps = {lap.tv_sec * kMarginLaps, lap.tv_nsec * kMarginLaps}; + + return TimeLessThan(finish, TimeAdd(now, laps)); +} + Clock* DefaultClock() { return &DefaultClockInstance; } diff --git a/third_party/javaprofiler/clock.h b/third_party/javaprofiler/clock.h index 1ff3f7203..2a7362ed8 100644 --- a/third_party/javaprofiler/clock.h +++ b/third_party/javaprofiler/clock.h @@ -20,11 +20,13 @@ #include #include +#include "third_party/javaprofiler/globals.h" + namespace google { namespace javaprofiler { -static const int64_t kNanosPerSecond = 1000 * 1000 * 1000; -static const int64_t kNanosPerMilli = 1000 * 1000; +constexpr int64 kNanosPerSecond = 1000 * 1000 * 1000; +constexpr int64 kNanosPerMilli = 1000 * 1000; inline struct timespec TimeAdd(const struct timespec t1, const struct timespec t2) { @@ -41,13 +43,13 @@ inline bool TimeLessThan(const struct timespec &t1, const struct timespec &t2) { (t1.tv_sec == t2.tv_sec && t1.tv_nsec < t2.tv_nsec); } -inline struct timespec NanosToTimeSpec(int64_t nanos) { +inline struct timespec NanosToTimeSpec(int64 nanos) { time_t seconds = nanos / kNanosPerSecond; int32_t nano_seconds = nanos % kNanosPerSecond; return timespec{seconds, nano_seconds}; } -inline int64_t TimeSpecToNanos(const struct timespec &ts) { +inline int64 TimeSpecToNanos(const struct timespec &ts) { return kNanosPerSecond * ts.tv_sec + ts.tv_nsec; } @@ -77,6 +79,11 @@ class Clock { } }; 
+// Determines if there is time for another lap before reaching the finish line. +// Uses a margin of multiple laps to ensure to not overrun the finish line. +bool AlmostThere(Clock *clock, const struct timespec &finish, + const struct timespec &lap); + Clock *DefaultClock(); } // namespace javaprofiler diff --git a/third_party/javaprofiler/display.cc b/third_party/javaprofiler/display.cc index 3e336a433..f4d940ef5 100644 --- a/third_party/javaprofiler/display.cc +++ b/third_party/javaprofiler/display.cc @@ -15,8 +15,6 @@ #include "third_party/javaprofiler/display.h" #include -#include -#include #include #include "third_party/javaprofiler/stacktrace_fixer.h" @@ -67,8 +65,6 @@ void GetMethodName(jvmtiEnv *jvmti, jmethodID method_id, string *method_name, return; } - signature_ptr.AbandonBecauseOfError(); - name_ptr.AbandonBecauseOfError(); static int once = 0; if (!once) { once = 1; @@ -92,13 +88,12 @@ void GetMethodName(jvmtiEnv *jvmti, jmethodID method_id, string *method_name, *signature = kSignatureUnknown; } -static void GetClassAndFileName(jvmtiEnv *jvmti, jmethodID method_id, +void GetClassAndFileName(jvmtiEnv *jvmti, jmethodID method_id, jclass declaring_class, string *file_name, string *class_name) { JvmtiScopedPtr source_name_ptr(jvmti); if (JVMTI_ERROR_NONE != jvmti->GetSourceFileName(declaring_class, source_name_ptr.GetRef())) { - source_name_ptr.AbandonBecauseOfError(); *file_name = kFileUnknown; } else { *file_name = source_name_ptr.Get(); @@ -108,7 +103,6 @@ static void GetClassAndFileName(jvmtiEnv *jvmti, jmethodID method_id, if (JVMTI_ERROR_NONE != jvmti->GetClassSignature(declaring_class, signature_ptr.GetRef(), nullptr)) { - signature_ptr.AbandonBecauseOfError(); *class_name = kClassUnknown; } else { bool cleaned = CleanJavaSignature(signature_ptr.Get()); @@ -123,6 +117,32 @@ static void GetClassAndFileName(jvmtiEnv *jvmti, jmethodID method_id, } } +void FillFieldsWithUnknown(string *file_name, string *class_name, + string *method_name, string 
*signature, + int *line_number) { + *file_name = kFileUnknown; + *class_name = kClassUnknown; + *method_name = kMethodUnknown; + *signature = kSignatureUnknown; + if (line_number) { + *line_number = 0; + } +} + +void FillMethodSignatureAndLine(jvmtiEnv *jvmti, + const JVMPI_CallFrame &frame, + string *method_name, string *signature, + int *line_number) { + GetMethodName(jvmti, frame.method_id, method_name, signature); + + // frame.lineno is actually a bci if it is a Java method; Asgct is piggy + // backing on the structure field. For natives, this would be -1 and + // GetLineNumber handles it. + if (line_number) { + *line_number = GetLineNumber(jvmti, frame.method_id, frame.lineno); + } +} + } // end namespace jint GetLineNumber(jvmtiEnv *jvmti, jmethodID method, jlocation location) { @@ -148,8 +168,6 @@ jint GetLineNumber(jvmtiEnv *jvmti, jmethodID method, jlocation location) { no_debug_info = true; } } - - table_ptr_ctr.AbandonBecauseOfError(); return -1; } @@ -177,36 +195,10 @@ jint GetLineNumber(jvmtiEnv *jvmti, jmethodID method, jlocation location) { return -1; } -static void FillFieldsWithUnknown(string *file_name, string *class_name, - string *method_name, string *signature, - int *line_number) { - *file_name = kFileUnknown; - *class_name = kClassUnknown; - *method_name = kMethodUnknown; - *signature = kSignatureUnknown; - if (line_number) { - *line_number = 0; - } -} - -static void FillMethodSignatureAndLine(jvmtiEnv *jvmti, - const JVMPI_CallFrame &frame, - string *method_name, string *signature, - int *line_number) { - GetMethodName(jvmti, frame.method_id, method_name, signature); - - // frame.lineno is actually a bci if it is a Java method; Asgct is piggy - // backing on the structure field. For natives, this would be -1 and - // GetLineNumber handles it. 
- if (line_number) { - *line_number = GetLineNumber(jvmti, frame.method_id, frame.lineno); - } -} - -bool GetStackFrameElements(jvmtiEnv *jvmti, const JVMPI_CallFrame &frame, - string *file_name, string *class_name, - string *method_name, string *signature, - int *line_number) { +bool GetStackFrameElements(JNIEnv *jni, jvmtiEnv *jvmti, + const JVMPI_CallFrame &frame, string *file_name, + string *class_name, string *method_name, + string *signature, int *line_number) { if (!jvmti) { FillFieldsWithUnknown(file_name, class_name, method_name, signature, line_number); @@ -223,6 +215,7 @@ bool GetStackFrameElements(jvmtiEnv *jvmti, const JVMPI_CallFrame &frame, return true; } + ScopedLocalRef declaring_class_managed(jni, declaring_class); return GetStackFrameElements(jvmti, frame, declaring_class, file_name, class_name, method_name, signature, line_number); } diff --git a/third_party/javaprofiler/display.h b/third_party/javaprofiler/display.h index ba127539a..ee897759c 100644 --- a/third_party/javaprofiler/display.h +++ b/third_party/javaprofiler/display.h @@ -15,6 +15,7 @@ */ #include + #include #include "third_party/javaprofiler/globals.h" @@ -36,11 +37,10 @@ jint GetLineNumber(jvmtiEnv *jvmti, jmethodID method, jlocation location); // the information provided by the frame and using the JVMTI environment. // When unknown, it fills the parameters with: UnknownFile, UnknownClass, // UnknownMethod, "", and -1. -bool GetStackFrameElements(jvmtiEnv *jvmti, - const JVMPI_CallFrame &frame, - string *file_name, string *class_name, - string *method_name, string *signature, - int *line_number); +bool GetStackFrameElements(JNIEnv *jni, jvmtiEnv *jvmti, + const JVMPI_CallFrame &frame, string *file_name, + string *class_name, string *method_name, + string *signature, int *line_number); // Fill the file_name, class_name, method_name, and line_number parameters using // the information provided by the frame and using the JVMTI environment. 
diff --git a/third_party/javaprofiler/globals.h b/third_party/javaprofiler/globals.h index 1ea3fd27f..7ceeb4034 100644 --- a/third_party/javaprofiler/globals.h +++ b/third_party/javaprofiler/globals.h @@ -26,6 +26,7 @@ #include #include +using std::hash; using std::string; #define DISALLOW_COPY_AND_ASSIGN(TypeName) \ @@ -73,8 +74,6 @@ class JvmtiScopedPtr { T *Get() { return ref_; } - void AbandonBecauseOfError() { ref_ = NULL; } - private: jvmtiEnv *jvmti_; T *ref_; @@ -82,49 +81,24 @@ class JvmtiScopedPtr { DISALLOW_IMPLICIT_CONSTRUCTORS(JvmtiScopedPtr); }; -// Accessors for a JNIEnv for this thread. -class Accessors { +template +class ScopedLocalRef { public: - static void SetCurrentJniEnv(JNIEnv *env) { env_ = env; } - - static JNIEnv *CurrentJniEnv() { return env_; } - - static void Init() {} - - static void Destroy() {} - - static void SetAttribute(int64_t value) { attr_ = value; } + ScopedLocalRef(JNIEnv *jni, T ref) : jni_(jni), ref_(ref) {} - static int64_t GetAttribute() { return attr_; } - - template - static inline FunctionType GetJvmFunction(const char *function_name) { - // get handle to library - static void *handle = dlopen("libjvm.so", RTLD_LAZY); - if (handle == NULL) { - return NULL; + ~ScopedLocalRef() { + if (NULL != ref_) { + jni_->DeleteLocalRef(ref_); } - - // get address of function, return null if not found - return bit_cast(dlsym(handle, function_name)); } + T Get() { return ref_; } + private: - // This is dangerous. TLS accesses are by default not async safe, as - // they can call malloc for lazy initialization. The initial-exec - // TLS mode avoids this potential allocation, with the limitation - // that there is a fixed amount of space to hold all TLS variables - // referenced in the module. This should be OK for the cloud - // profiler agent, which is relatively small. We do provide a way - // to override the TLS model for compilation environments where the - // TLS access is async-safe. 
-#ifdef JAVAPROFILER_GLOBAL_DYNAMIC_TLS - static __thread JNIEnv *env_ __attribute__((tls_model("global-dynamic"))); - static __thread int64_t attr_ __attribute__((tls_model("global-dynamic"))); -#else - static __thread JNIEnv *env_ __attribute__((tls_model("initial-exec"))); - static __thread int64_t attr_ __attribute__((tls_model("initial-exec"))); -#endif + JNIEnv *jni_; + T ref_; + + DISALLOW_IMPLICIT_CONSTRUCTORS(ScopedLocalRef); }; } // namespace javaprofiler diff --git a/third_party/javaprofiler/heap_sampler.cc b/third_party/javaprofiler/heap_sampler.cc new file mode 100644 index 000000000..e2a69138c --- /dev/null +++ b/third_party/javaprofiler/heap_sampler.cc @@ -0,0 +1,365 @@ +/* + * Copyright 2018 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "third_party/javaprofiler/heap_sampler.h" +#include "third_party/javaprofiler/profile_proto_builder.h" + +namespace { + +std::unique_ptr> +TransformFrames(jvmtiFrameInfo *stack_frames, int count) { + auto frames = + std::unique_ptr>( + new std::vector(count)); + + for (int i = 0; i < count; i++) { + // Note that technically this is not the line number; it is the location but + // our CPU profiler piggy-backs on JVMPI_CallFrame and uses lineno as a + // jlocation as well... 
+ (*frames)[i].lineno = stack_frames[i].location; + (*frames)[i].method_id = stack_frames[i].method; + } + + return frames; +} + +extern "C" JNIEXPORT void SampledObjectAlloc(jvmtiEnv *jvmti_env, + JNIEnv *jni_env, jthread thread, + jobject object, + jclass object_klass, jlong size) { + google::javaprofiler::HeapMonitor::AddSample(jni_env, thread, object, + object_klass, size); +} + +extern "C" JNIEXPORT void GarbageCollectionFinish(jvmtiEnv *jvmti_env) { + google::javaprofiler::HeapMonitor::NotifyGCWaitingThread(); +} + +} // namespace + +namespace google { +namespace javaprofiler { +std::atomic HeapMonitor::jvmti_; +std::atomic HeapMonitor::sampling_interval_; + +HeapEventStorage::HeapEventStorage(jvmtiEnv *jvmti, ProfileFrameCache *cache, + int max_garbage_size) + : max_garbage_size_(max_garbage_size), + cur_garbage_pos_(0), + jvmti_(jvmti), cache_(cache) { +} + +void HeapEventStorage::HeapObjectTrace::AddToProto( + const std::unique_ptr &builder) { + JVMPI_CallTrace call_trace{nullptr, static_cast(frames_->size()), + frames_->data()}; + ProfileStackTrace trace = {&call_trace, size_}; + builder->AddTraces(&trace, 1); +} + +void HeapEventStorage::Add(JNIEnv *jni, jthread thread, jobject object, + jclass klass, jlong size) { + const int kMaxFrames = 128; + jint count = 0; + jvmtiFrameInfo stack_frames[kMaxFrames]; + jvmtiError err = + jvmti_->GetStackTrace(thread, 0, kMaxFrames, stack_frames, &count); + + if (err == JVMTI_ERROR_NONE && count > 0) { + auto frames = TransformFrames(stack_frames, count); + + jweak weak_ref = jni->NewWeakGlobalRef(object); + if (jni->ExceptionCheck()) { + LOG(WARNING) << "Failed to create NewWeakGlobalRef, skipping heap sample"; + return; + } + + auto live_object = std::unique_ptr( + new HeapObjectTrace(weak_ref, size, std::move(frames))); + + // Only now lock and get things done quickly. 
+ std::lock_guard lock(storage_lock_); + newly_allocated_objects_.push_back(std::move(live_object)); + } +} + +void HeapEventStorage::AddToGarbage(std::unique_ptr obj) { + if (garbage_objects_.size() >= max_garbage_size_) { + garbage_objects_[cur_garbage_pos_] = std::move(obj); + cur_garbage_pos_ = (cur_garbage_pos_ + 1) % max_garbage_size_; + } else { + garbage_objects_.push_back(std::move(obj)); + } +} + +void HeapEventStorage::MoveLiveObjects( + JNIEnv *env, std::vector> *objects, + std::vector> *still_live_objects) { + for (auto &elem : *objects) { + if (elem->IsLive(env)) { + still_live_objects->push_back(std::move(elem)); + } else { + elem->DeleteWeakReference(env); + AddToGarbage(std::move(elem)); + } + } +} + +void HeapEventStorage::CompactSamples(JNIEnv *env) { + std::lock_guard lock(storage_lock_); + + std::vector> still_live; + + MoveLiveObjects(env, &newly_allocated_objects_, &still_live); + MoveLiveObjects(env, &live_objects_, &still_live); + + // Live objects are the objects still alive. + live_objects_ = std::move(still_live); + // Newly allocated objects is now reset, those still alive are now in + // live_objects. 
+ newly_allocated_objects_.clear(); +} + +std::unique_ptr HeapEventStorage::GetProfiles( + JNIEnv *env, int sampling_interval, bool force_gc, bool get_live) { + auto builder = + ProfileProtoBuilder::ForHeap(env, jvmti_, sampling_interval, cache_); + + if (force_gc) { + if (jvmti_->ForceGarbageCollection() != JVMTI_ERROR_NONE) { + LOG(WARNING) << "Failed to force GC, returning empty heap profile proto"; + return builder->CreateProto(); + } + + CompactSamples(env); + } + + { + std::lock_guard lock(storage_lock_); + + if (get_live) { + for (auto &elem : live_objects_) { + elem->AddToProto(builder); + } + } else { + for (auto &elem : garbage_objects_) { + elem->AddToProto(builder); + } + } + } + return builder->CreateProto(); +} + +bool HeapMonitor::CreateGCWaitingThread(jvmtiEnv* jvmti, JNIEnv* jni) { + jclass cls = jni->FindClass("java/lang/Thread"); + jmethodID constructor = jni->GetMethodID(cls, "", "()V"); + jobject thread = jni->NewGlobalRef(jni->NewObject(cls, constructor)); + if (thread == nullptr) { + LOG(WARNING) << "Failed to construct the GC waiting thread"; + return false; + } + + if (jvmti->RunAgentThread(thread, GCWaitingThread, nullptr, + JVMTI_THREAD_MIN_PRIORITY) != JVMTI_ERROR_NONE) { + LOG(WARNING) << "Failed to start the GC waiting thread"; + return false; + } + + return true; +} + +bool HeapMonitor::Supported(jvmtiEnv *jvmti) { +#ifdef ENABLE_HEAP_SAMPLING + jvmtiCapabilities caps; + memset(&caps, 0, sizeof(caps)); + if (jvmti->GetPotentialCapabilities(&caps) != JVMTI_ERROR_NONE) { + LOG(WARNING) << "Failed to get potential capabilities, disabling the heap " + << "sampling monitor"; + return false; + } + + // If ever this was run with a JDK before JDK11, it would not set this bit as + // it was added at the end of the structure. Therefore this is a cheap way to + // check for a runtime "are we running with JDK11+". 
+ return caps.can_generate_sampled_object_alloc_events + && caps.can_generate_garbage_collection_events; +#else + return false; +#endif +} + +void HeapMonitor::AddCallback(jvmtiEventCallbacks *callbacks) { +#ifdef ENABLE_HEAP_SAMPLING + callbacks->SampledObjectAlloc = &SampledObjectAlloc; + callbacks->GarbageCollectionFinish = &GarbageCollectionFinish; +#endif +} + +// Currently, we enable once and forget about it. +bool HeapMonitor::Enable(jvmtiEnv *jvmti, JNIEnv* jni, int sampling_interval) { +#ifdef ENABLE_HEAP_SAMPLING + if (!Supported(jvmti)) { + LOG(INFO) << "Heap sampling is not supported by the JVM, disabling the " + << " heap sampling monitor"; + return false; + } + + jvmtiCapabilities caps; + memset(&caps, 0, sizeof(caps)); + // Get line numbers, sample events, and filename for the tests. + caps.can_get_line_numbers = 1; + caps.can_get_source_file_name = 1; + caps.can_generate_sampled_object_alloc_events = 1; + caps.can_generate_garbage_collection_events = 1; + + if (jvmti->AddCapabilities(&caps) != JVMTI_ERROR_NONE) { + LOG(WARNING) << "Failed to add capabilities, disabling the heap " + << "sampling monitor"; + return false; + } + + if (jvmti->SetHeapSamplingInterval(sampling_interval) != JVMTI_ERROR_NONE) { + LOG(WARNING) << "Failed to set the heap sampling interval, disabling the " + << "heap sampling monitor"; + return false; + } + + jvmti_.store(jvmti); + sampling_interval_.store(sampling_interval); + + if (!GetInstance()->CreateGCWaitingThread(jvmti, jni)) { + return false; + } + + if (jvmti->SetEventNotificationMode(JVMTI_ENABLE, + JVMTI_EVENT_SAMPLED_OBJECT_ALLOC, + nullptr) != JVMTI_ERROR_NONE) { + LOG(WARNING) << "Failed to enable sampled object alloc event, disabling the" + << " heap sampling monitor"; + return false; + } + + if (jvmti->SetEventNotificationMode(JVMTI_ENABLE, + JVMTI_EVENT_GARBAGE_COLLECTION_FINISH, + nullptr) != JVMTI_ERROR_NONE) { + jvmti->SetEventNotificationMode(JVMTI_DISABLE, + JVMTI_EVENT_SAMPLED_OBJECT_ALLOC, + nullptr); 
+ LOG(WARNING) << "Failed to enable garbage collection finish event, " + << "disabling the heap sampling monitor"; + return false; + } + + return true; +#else + return false; +#endif +} + +void HeapMonitor::Disable() { +#ifdef ENABLE_HEAP_SAMPLING + jvmtiEnv *jvmti = jvmti_.load(); + if (!jvmti) { + return; + } + + jvmti->SetEventNotificationMode(JVMTI_DISABLE, + JVMTI_EVENT_SAMPLED_OBJECT_ALLOC, nullptr); + jvmti->SetEventNotificationMode(JVMTI_DISABLE, + JVMTI_EVENT_GARBAGE_COLLECTION_FINISH, + nullptr); + jvmti_.store(nullptr); + + // Notify the agent thread that we are done. + google::javaprofiler::HeapMonitor::GetInstance()->NotifyGCWaitingThread(); + +#else + // Do nothing: we never enabled ourselves. +#endif +} + +std::unique_ptr HeapMonitor::GetHeapProfiles( + JNIEnv* env, bool force_gc) { +#ifdef ENABLE_HEAP_SAMPLING + // Note: technically this means that you cannot disable the sampler and then + // get the profile afterwards; this could be changed if needed. + if (jvmti_) { + return GetInstance()->storage_.GetHeapProfiles(env, sampling_interval_, + force_gc); + } +#endif + return EmptyHeapProfile(env); +} + +std::unique_ptr +HeapMonitor::GetGarbageHeapProfiles(JNIEnv* env, bool force_gc) { +#ifdef ENABLE_HEAP_SAMPLING + // Note: technically this means that you cannot disable the sampler and then + // get the profile afterwards; this could be changed if needed. 
+ if (jvmti_) { + return GetInstance()->storage_.GetGarbageHeapProfiles( + env, sampling_interval_, force_gc); + } +#endif + return EmptyHeapProfile(env); +} + +std::unique_ptr HeapMonitor::EmptyHeapProfile( + JNIEnv *jni_env) { + return ProfileProtoBuilder::ForHeap(jni_env, jvmti_, sampling_interval_) + ->CreateProto(); +} + +void HeapMonitor::NotifyGCWaitingThreadInternal() { + std::unique_lock lock(gc_waiting_mutex_); + gc_notified_ = true; + gc_waiting_cv_.notify_all(); +} + +void HeapMonitor::WaitForGC() { + std::unique_lock lock(gc_waiting_mutex_); + gc_notified_ = false; + + // If we are woken up without having been notified, just go back to sleep. + gc_waiting_cv_.wait(lock, [this] { return gc_notified_; } ); +} + +void HeapMonitor::GCWaitingThread(jvmtiEnv* jvmti_env, JNIEnv* jni_env, + void* arg) { + GetInstance()->GCWaitingThreadRun(jni_env); +} + +void HeapMonitor::GCWaitingThreadRun(JNIEnv* jni_env) { + while (true) { + WaitForGC(); + + // Was the heap monitor disabled? + if (!Enabled()) { + break; + } + + CompactData(jni_env); + } + + LOG(INFO) << "Heap sampling GC waiting thread finished"; +} + +void HeapMonitor::CompactData(JNIEnv* jni_env) { + storage_.CompactSamples(jni_env); +} + +} // namespace javaprofiler +} // namespace google diff --git a/third_party/javaprofiler/heap_sampler.h b/third_party/javaprofiler/heap_sampler.h new file mode 100644 index 000000000..e99088f0e --- /dev/null +++ b/third_party/javaprofiler/heap_sampler.h @@ -0,0 +1,205 @@ +/* + * Copyright 2018 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef THIRD_PARTY_JAVAPROFILER_HEAP_SAMPLER_H_ +#define THIRD_PARTY_JAVAPROFILER_HEAP_SAMPLER_H_ + +#include +#include + +#include // NOLINT +#include +#include // NOLINT +#include + +#include "third_party/javaprofiler/globals.h" +#include "third_party/javaprofiler/profile_proto_builder.h" + +namespace google { +namespace javaprofiler { + +// Storage for the sampled heap objects recorded from the heap sampling JVMTI +// API callbacks. +class HeapEventStorage { + public: + HeapEventStorage(jvmtiEnv *jvmti, ProfileFrameCache *cache = nullptr, + int max_garbage_size = 200); + + // TODO: establish correct shutdown sequence: how do we ensure that + // things are not going to go awfully wrong at shutdown, is it this class' job + // or should it be the owner of this class' instance's job? + + // Adds an object to the storage system. + void Add(JNIEnv *jni, jthread thread, jobject object, jclass klass, + jlong size); + + // Returns a perftools::profiles::Profile with the objects stored via + // calls to the Add method. + // force_gc provides a means to force GC before returning the sampled heap + // profiles; + // setting force_gc to true has a performance impact and is discouraged. + std::unique_ptr GetHeapProfiles( + JNIEnv *env, int sampling_interval, bool force_gc = false) { + return GetProfiles(env, sampling_interval, force_gc, true); + } + + // Returns a perftools::profiles::Profile with the objects that have been + // GC'd. + // force_gc provides a means to force GC before returning the sampled heap + // profiles; + // setting force_gc to true has a performance impact and is discouraged. + std::unique_ptr GetGarbageHeapProfiles( + JNIEnv* env, int sampling_interval, bool force_gc = false) { + return GetProfiles(env, sampling_interval, force_gc, false); + } + + void CompactSamples(JNIEnv *env); + + // Not copyable or movable. 
+ HeapEventStorage(const HeapEventStorage&) = delete; + HeapEventStorage& operator=(const HeapEventStorage&) = delete; + + private: + // A sampled heap object, defined by the object, its size, and the stack + // frame. + class HeapObjectTrace { + public: + // This object owns the jweak object parameter. It is freed when the object + // is sent to the garbage list, and the object is set to nullptr. + HeapObjectTrace(jweak object, jlong size, + std::unique_ptr> frames) + : object_(object), size_(size), frames_(std::move(frames)) {} + + void AddToProto(const std::unique_ptr &builder); + + void DeleteWeakReference(JNIEnv* env) { + env->DeleteWeakGlobalRef(object_); + object_ = nullptr; + } + + + bool IsLive(JNIEnv *env) { + // When GC collects the object, the object represented by the weak + // reference will be considered as the same object as NULL. + return !env->IsSameObject(object_, NULL); + } + + // Not copyable or movable. + HeapObjectTrace(const HeapObjectTrace&) = delete; + HeapObjectTrace& operator=(const HeapObjectTrace&) = delete; + + private: + jweak object_; + int size_; + std::unique_ptr> frames_; + }; + + std::unique_ptr GetProfiles( + JNIEnv* env, int sampling_interval, bool force_gc, bool get_live); + + // Add object to the garbage list: it uses a queue with a max size of + // max_garbage_size, provided via the constructor. + void AddToGarbage(std::unique_ptr obj); + + // Moves live objects from objects to still_live_objects; live elements from + // the objects vector are replaced with nullptr via std::move. + void MoveLiveObjects( + JNIEnv *env, std::vector> *objects, + std::vector> *still_live_objects); + + std::vector> newly_allocated_objects_; + std::vector> live_objects_; + + // Though a queue really would be nice, we need a way to iterate when + // requested. 
+ int max_garbage_size_; + int cur_garbage_pos_; + std::vector> garbage_objects_; + + std::mutex storage_lock_; + jvmtiEnv *jvmti_; + ProfileFrameCache *cache_; +}; + +// Due to the JVMTI callback, everything here is static. +class HeapMonitor { + public: + static bool Enable(jvmtiEnv *jvmti, JNIEnv* jni, int sampling_interval); + static void Disable(); + + static bool Enabled() { return jvmti_ != nullptr; } + + // Returns a perftools::profiles::Profile with the objects provided by the + // HeapEventStorage. + static std::unique_ptr GetHeapProfiles( + JNIEnv* env, bool force_gc); + + // Returns a perftools::profiles::Profile with the GC'd objects provided by + // the HeapEventStorage. + static std::unique_ptr GetGarbageHeapProfiles( + JNIEnv* env, bool force_gc); + + static void AddSample(JNIEnv *jni_env, jthread thread, jobject object, + jclass object_klass, jlong size) { + GetInstance()->storage_.Add(jni_env, thread, object, object_klass, size); + } + + static void AddCallback(jvmtiEventCallbacks *callbacks); + + static void NotifyGCWaitingThread() { + GetInstance()->NotifyGCWaitingThreadInternal(); + } + + // Not copyable or movable. + HeapMonitor(const HeapMonitor &) = delete; + HeapMonitor &operator=(const HeapMonitor &) = delete; + + private: + HeapMonitor() : gc_notified_(false), storage_(jvmti_.load()) {} + + // We construct the heap_monitor at the first call to GetInstance, so ensure + // Enable was called at least once before to initialize jvmti_. 
+ static HeapMonitor *GetInstance() { + static HeapMonitor heap_monitor; + return &heap_monitor; + } + + static bool Supported(jvmtiEnv* jvmti); + + bool CreateGCWaitingThread(jvmtiEnv* jvmti, JNIEnv* jni); + static void GCWaitingThread(jvmtiEnv *jvmti_env, JNIEnv *jni_env, void *arg); + void GCWaitingThreadRun(JNIEnv* jni_env); + void WaitForGC(); + void NotifyGCWaitingThreadInternal(); + + void CompactData(JNIEnv* jni_env); + + static std::unique_ptr EmptyHeapProfile( + JNIEnv *jni_env); + + static std::atomic jvmti_; + static std::atomic sampling_interval_; + + bool gc_notified_; + std::condition_variable gc_waiting_cv_; + std::mutex gc_waiting_mutex_; + HeapEventStorage storage_; +}; + +} // namespace javaprofiler +} // namespace google + +#endif // THIRD_PARTY_JAVAPROFILER_HEAP_SAMPLER_H_ diff --git a/third_party/javaprofiler/native.h b/third_party/javaprofiler/native.h index c71883376..58acab8ce 100644 --- a/third_party/javaprofiler/native.h +++ b/third_party/javaprofiler/native.h @@ -34,7 +34,7 @@ class NativeProcessInfo { explicit NativeProcessInfo(const string &procmaps_filename); struct Mapping { - uint64_t start, limit; + uint64 start, limit; string name; }; diff --git a/third_party/javaprofiler/profile_proto_builder.cc b/third_party/javaprofiler/profile_proto_builder.cc index aaea2f86c..a44593165 100644 --- a/third_party/javaprofiler/profile_proto_builder.cc +++ b/third_party/javaprofiler/profile_proto_builder.cc @@ -24,13 +24,16 @@ namespace javaprofiler { const int kCount = 0; const int kMetric = 1; -ProfileProtoBuilder::ProfileProtoBuilder(jvmtiEnv *jvmti_env, +ProfileProtoBuilder::ProfileProtoBuilder(JNIEnv *jni_env, jvmtiEnv *jvmti_env, ProfileFrameCache *native_cache, int64 sampling_rate, const SampleType &count_type, const SampleType &metric_type) - : jvmti_env_(jvmti_env), native_cache_(native_cache), - location_builder_(&builder_), sampling_rate_(sampling_rate) { + : sampling_rate_(sampling_rate), + jni_env_(jni_env), + jvmti_env_(jvmti_env), + 
native_cache_(native_cache), + location_builder_(&builder_) { AddSampleType(count_type); AddSampleType(metric_type); SetPeriodType(metric_type); @@ -38,7 +41,9 @@ ProfileProtoBuilder::ProfileProtoBuilder(jvmtiEnv *jvmti_env, void ProfileProtoBuilder::AddTraces(const ProfileStackTrace *traces, int num_traces) { - native_cache_->ProcessTraces(traces, num_traces); + if (native_cache_) { + native_cache_->ProcessTraces(traces, num_traces); + } for (int i = 0; i < num_traces; ++i) { AddTrace(traces[i], 1); @@ -48,7 +53,9 @@ void ProfileProtoBuilder::AddTraces(const ProfileStackTrace *traces, void ProfileProtoBuilder::AddTraces(const ProfileStackTrace *traces, const int32 *counts, int num_traces) { - native_cache_->ProcessTraces(traces, num_traces); + if (native_cache_) { + native_cache_->ProcessTraces(traces, num_traces); + } for (int i = 0; i < num_traces; ++i) { AddTrace(traces[i], counts[i]); @@ -63,7 +70,8 @@ void ProfileProtoBuilder::AddArtificialTrace(const string& name, int count, auto profile = builder_.mutable_profile(); auto sample = profile->add_sample(); sample->add_location_id(location->id()); - InitSampleValues(sample, count, count * sampling_rate); + // Move count * sampling rate to 64-bit. 
+ InitSampleValues(sample, count, static_cast(count) * sampling_rate); } void ProfileProtoBuilder::UnsampleMetrics() { @@ -107,18 +115,18 @@ void ProfileProtoBuilder::SetPeriodType(const SampleType &metric_type) { } void ProfileProtoBuilder::UpdateSampleValues( - perftools::profiles::Sample *sample, jint count, jint size) { + perftools::profiles::Sample *sample, int64 count, int64 size) { sample->set_value(kCount, sample->value(kCount) + count); sample->set_value(kMetric, sample->value(kMetric) + size); } void ProfileProtoBuilder::InitSampleValues( - perftools::profiles::Sample *sample, jint metric) { + perftools::profiles::Sample *sample, int64 metric) { InitSampleValues(sample, 1, metric); } void ProfileProtoBuilder::InitSampleValues( - perftools::profiles::Sample *sample, jint count, jint metric) { + perftools::profiles::Sample *sample, int64 count, int64 metric) { sample->add_value(count); sample->add_value(metric); } @@ -155,7 +163,7 @@ void ProfileProtoBuilder::AddTrace(const ProfileStackTrace &trace, } void ProfileProtoBuilder::AddJavaInfo( - const google::javaprofiler::JVMPI_CallFrame &jvm_frame, + const JVMPI_CallFrame &jvm_frame, perftools::profiles::Profile *profile, perftools::profiles::Sample *sample, StackState *stack_state) { @@ -163,7 +171,7 @@ void ProfileProtoBuilder::AddJavaInfo( if (!jvm_frame.method_id) { perftools::profiles::Location *location = location_builder_.LocationFor( - "", "Unknown method", "", 0); + "", "[Unknown method]", "", 0); sample->add_location_id(location->id()); return; } @@ -173,12 +181,10 @@ void ProfileProtoBuilder::AddJavaInfo( string method_name; string signature; int line_number; - google::javaprofiler::GetStackFrameElements(jvmti_env_, jvm_frame, - &file_name, &class_name, - &method_name, &signature, - &line_number); + GetStackFrameElements(jni_env_, jvmti_env_, jvm_frame, &file_name, + &class_name, &method_name, &signature, &line_number); - ::google::javaprofiler::FixMethodParameters(&signature); + 
FixMethodParameters(&signature); string full_method_name = class_name + "." + method_name + signature; perftools::profiles::Location *location = location_builder_.LocationFor( @@ -187,11 +193,17 @@ void ProfileProtoBuilder::AddJavaInfo( sample->add_location_id(location->id()); } -void ProfileProtoBuilder::AddNativeInfo( - const google::javaprofiler::JVMPI_CallFrame &jvm_frame, - perftools::profiles::Profile *profile, - perftools::profiles::Sample *sample, - StackState *stack_state) { +void ProfileProtoBuilder::AddNativeInfo(const JVMPI_CallFrame &jvm_frame, + perftools::profiles::Profile *profile, + perftools::profiles::Sample *sample, + StackState *stack_state) { + if (!native_cache_) { + perftools::profiles::Location *location = location_builder_.LocationFor( + "", "[Unknown non-Java frame]", "", 0); + sample->add_location_id(location->id()); + return; + } + string function_name = native_cache_->GetFunctionName(jvm_frame); perftools::profiles::Location *location = native_cache_->GetLocation(jvm_frame, @@ -206,6 +218,21 @@ void ProfileProtoBuilder::AddNativeInfo( } } +void ContentionProfileProtoBuilder::MultiplyBySamplingRate() { + auto profile = builder_.mutable_profile(); + + for (int i = 0; i < profile->sample_size(); ++i) { + auto sample = profile->mutable_sample(i); + + auto count = sample->value(kCount); + auto metric_value = sample->value(kMetric); + + sample->set_value(kCount, static_cast(count) * sampling_rate_); + sample->set_value(kMetric, + static_cast(metric_value) * sampling_rate_); + } +} + size_t LocationBuilder::LocationInfoHash::operator()( const LocationInfo &info) const { @@ -251,9 +278,9 @@ perftools::profiles::Location *LocationBuilder::LocationFor( auto line = location->add_line(); - // TODO: Handle library name etc too... 
+ auto simplified_name = SimplifyFunctionName(function_name); auto function_id = builder_->FunctionId( - function_name.c_str(), "", file_name.c_str(), 0); + simplified_name.c_str(), function_name.c_str(), file_name.c_str(), 0); line->set_function_id(function_id); line->set_line(line_number); @@ -335,21 +362,38 @@ double CalculateSamplingRatio(int64 rate, int64 count, int64 metric_value) { } std::unique_ptr ProfileProtoBuilder::ForHeap( - jvmtiEnv *jvmti_env, int64 sampling_rate, ProfileFrameCache *cache) { - return std::unique_ptr(new HeapProfileProtoBuilder( - jvmti_env, sampling_rate, cache)); + JNIEnv *jni_env, jvmtiEnv *jvmti_env, int64 sampling_rate, + ProfileFrameCache *cache) { + // Cache can be nullptr because the heap sampler can be using a JVMTI + // Java-only stackframe gatherer. + return std::unique_ptr( + new HeapProfileProtoBuilder(jni_env, jvmti_env, sampling_rate, cache)); +} + +std::unique_ptr ProfileProtoBuilder::ForNativeHeap( + JNIEnv *jni_env, jvmtiEnv *jvmti_env, int64 sampling_rate, + ProfileFrameCache *cache) { + assert(cache != nullptr); + return std::unique_ptr(new NativeHeapProfileProtoBuilder( + jni_env, jvmti_env, sampling_rate, cache)); } std::unique_ptr ProfileProtoBuilder::ForCpu( - jvmtiEnv *jvmti_env, int64 sampling_rate, ProfileFrameCache *cache) { + JNIEnv *jni_env, jvmtiEnv *jvmti_env, int64 sampling_rate, + ProfileFrameCache *cache) { + CHECK (cache != nullptr) + << "CPU profiles may have native frames, cache must be provided"; return std::unique_ptr( - new CpuProfileProtoBuilder(jvmti_env, sampling_rate, cache)); + new CpuProfileProtoBuilder(jni_env, jvmti_env, sampling_rate, cache)); } std::unique_ptr ProfileProtoBuilder::ForContention( - jvmtiEnv *jvmti_env, int64 sampling_rate, ProfileFrameCache *cache) { - return std::unique_ptr( - new ContentionProfileProtoBuilder(jvmti_env, sampling_rate, cache)); + JNIEnv *jni_env, jvmtiEnv *jvmti_env, int64 sampling_rate, + ProfileFrameCache *cache) { + CHECK (cache != nullptr) + << 
"Contention profiles may have native frames, cache must be provided"; + return std::unique_ptr(new ContentionProfileProtoBuilder( + jni_env, jvmti_env, sampling_rate, cache)); } } // namespace javaprofiler diff --git a/third_party/javaprofiler/profile_proto_builder.h b/third_party/javaprofiler/profile_proto_builder.h index a94f2e5e2..f68ab2d1c 100644 --- a/third_party/javaprofiler/profile_proto_builder.h +++ b/third_party/javaprofiler/profile_proto_builder.h @@ -17,13 +17,16 @@ #ifndef THIRD_PARTY_JAVAPROFILER_PROFILE_PROTO_BUILDER_H__ #define THIRD_PARTY_JAVAPROFILER_PROFILE_PROTO_BUILDER_H__ +#include #include + +#include #include #include +#include #include #include "perftools/profiles/proto/builder.h" -#include "third_party/java/jdk/include/jvmti.h" #include "third_party/javaprofiler/stacktrace_decls.h" namespace google { @@ -50,9 +53,8 @@ class TraceSamples { const JVMPI_CallTrace &trace2) const; }; - __gnu_cxx::hash_map - traces_; + std::unordered_map traces_; }; // Store locations previously seen so that the profile is only @@ -88,9 +90,8 @@ class LocationBuilder { perftools::profiles::Builder *builder_; - __gnu_cxx::hash_map - locations_; + std::unordered_map locations_; }; // Remember traces and use the information to create locations with native @@ -113,11 +114,11 @@ class ProfileProtoBuilder { public: virtual ~ProfileProtoBuilder() {} - // Add traces to the proto. + // Add traces to the proto. The elements of the array are copied over. void AddTraces(const ProfileStackTrace *traces, int num_traces); // Add traces to the proto, where each trace has a defined count - // of occurrences. + // of occurrences. The elements of the arrays are copied over. void AddTraces(const ProfileStackTrace *traces, const int32 *counts, int num_traces); @@ -130,14 +131,29 @@ class ProfileProtoBuilder { // this has undefined behavior. virtual std::unique_ptr CreateProto() = 0; + // Create a heap profile. 
+ // jvmti_env can be null as well but then all calls to AddTraces will return + // unknown. + // ForHeap/ForNativeHeap is the only case where we accept a null cache since + // the heap profiles can be using JVMTI's GetStackTrace and remain in pure + // Java land frames. Other ForX methods will fail an assertion when attempting + // a nullptr cache. static std::unique_ptr ForHeap( - jvmtiEnv *jvmti_env, int64 sampling_rate, ProfileFrameCache *cache); + JNIEnv *jni_env, jvmtiEnv *jvmti_env, int64 sampling_rate, + ProfileFrameCache *cache = nullptr); + + static std::unique_ptr ForNativeHeap( + JNIEnv *jni_env, jvmtiEnv *jvmti_env, int64 sampling_rate, + ProfileFrameCache *cache = nullptr); - static std::unique_ptr ForCpu( - jvmtiEnv *jvmti_env, int64 sampling_rate, ProfileFrameCache *cache); + static std::unique_ptr ForCpu(JNIEnv *jni_env, + jvmtiEnv *jvmti_env, + int64 sampling_rate, + ProfileFrameCache *cache); static std::unique_ptr ForContention( - jvmtiEnv *jvmti_env, int64 sampling_rate, ProfileFrameCache *cache); + JNIEnv *jni_env, jvmtiEnv *jvmti_env, int64 sampling_rate, + ProfileFrameCache *cache); protected: struct SampleType { @@ -148,9 +164,11 @@ class ProfileProtoBuilder { string unit; }; - ProfileProtoBuilder(jvmtiEnv *jvmti_env, - ProfileFrameCache *native_cache, - int64 sampling_rate, + // Create the profile proto builder, if native_cache is nullptr, then no + // information about native frames can be provided. The proto buffer will then + // contain "Unknown native method" frames. 
+ ProfileProtoBuilder(JNIEnv *env, jvmtiEnv *jvmti_env, + ProfileFrameCache *native_cache, int64 sampling_rate, const SampleType &count_type, const SampleType &metric_type); @@ -166,6 +184,7 @@ class ProfileProtoBuilder { std::unique_ptr CreateSampledProto(); perftools::profiles::Builder builder_; + int64 sampling_rate_ = 0; private: // Track progress through a stack as we traverse it, in order to determine @@ -207,28 +226,28 @@ class ProfileProtoBuilder { void AddSampleType(const SampleType &sample_type); void SetPeriodType(const SampleType &metric_type); - void InitSampleValues(perftools::profiles::Sample *sample, jint metric); - void InitSampleValues(perftools::profiles::Sample *sample, jint count, - jint metric); - void UpdateSampleValues(perftools::profiles::Sample *sample, jint count, - jint size); + void InitSampleValues(perftools::profiles::Sample *sample, int64 metric); + void InitSampleValues(perftools::profiles::Sample *sample, int64 count, + int64 metric); + void UpdateSampleValues(perftools::profiles::Sample *sample, int64 count, + int64 size); void AddTrace(const ProfileStackTrace &trace, int32 count); - void AddJavaInfo(const google::javaprofiler::JVMPI_CallFrame &jvm_frame, + void AddJavaInfo(const JVMPI_CallFrame &jvm_frame, perftools::profiles::Profile *profile, perftools::profiles::Sample *sample, StackState *stack_state); - void AddNativeInfo(const google::javaprofiler::JVMPI_CallFrame &jvm_frame, + void AddNativeInfo(const JVMPI_CallFrame &jvm_frame, perftools::profiles::Profile *profile, perftools::profiles::Sample *sample, StackState *stack_state); void UnsampleMetrics(); + JNIEnv *jni_env_; jvmtiEnv *jvmti_env_; ProfileFrameCache *native_cache_; TraceSamples trace_samples_; LocationBuilder location_builder_; - int64 sampling_rate_ = 0; }; // Computes the ratio to use to scale heap data to unsample it. 
@@ -241,13 +260,12 @@ double CalculateSamplingRatio(int64 rate, int64 count, int64 metric_value); class CpuProfileProtoBuilder : public ProfileProtoBuilder { public: - CpuProfileProtoBuilder(jvmtiEnv *jvmti_env, - int64 sampling_rate, - ProfileFrameCache *cache) - : ProfileProtoBuilder(jvmti_env, cache, sampling_rate, - ProfileProtoBuilder::SampleType("samples", "count"), - ProfileProtoBuilder::SampleType("cpu", - "nanoseconds")) { + CpuProfileProtoBuilder(JNIEnv *jni_env, jvmtiEnv *jvmti_env, + int64 sampling_rate, ProfileFrameCache *cache) + : ProfileProtoBuilder( + jni_env, jvmti_env, cache, sampling_rate, + ProfileProtoBuilder::SampleType("samples", "count"), + ProfileProtoBuilder::SampleType("cpu", "nanoseconds")) { builder_.mutable_profile()->set_period(sampling_rate); } @@ -261,52 +279,83 @@ class CpuProfileProtoBuilder : public ProfileProtoBuilder { class HeapProfileProtoBuilder : public ProfileProtoBuilder { public: - HeapProfileProtoBuilder(jvmtiEnv *jvmti_env, - int64 sampling_rate, - ProfileFrameCache *cache) - : ProfileProtoBuilder(jvmti_env, cache, sampling_rate, - ProfileProtoBuilder::SampleType("inuse_objects", - "count"), - ProfileProtoBuilder::SampleType("inuse_space", - "bytes")) { - } + HeapProfileProtoBuilder(JNIEnv *jni_env, jvmtiEnv *jvmti_env, + int64 sampling_rate, ProfileFrameCache *cache) + : ProfileProtoBuilder( + jni_env, jvmti_env, cache, sampling_rate, + ProfileProtoBuilder::SampleType("inuse_objects", "count"), + ProfileProtoBuilder::SampleType("inuse_space", "bytes")) {} std::unique_ptr CreateProto() override { return CreateUnsampledProto(); } protected: + // Depending on the JDK or how we got the frames (e.g. internal JVM or + // JVMTI), we might have native frames on top of the Java frames + // (basically where the JVM internally allocated the object). + // Returns the first Java frame index or 0 if none were found. 
int SkipTopNativeFrames(const JVMPI_CallTrace &trace) override { - for (int i = 0; i < trace.num_frames; ++i) { - if (trace.frames[i].lineno != - google::javaprofiler::kNativeFrameLineNum) { + // Skip until we see the first Java frame. + for (int i = 0; i < trace.num_frames; i++) { + if (trace.frames[i].lineno != kNativeFrameLineNum) { return i; } } - return trace.num_frames; + // All are native is weird for Java heap samples but do nothing in this + // case. + return 0; + } +}; + +class NativeHeapProfileProtoBuilder : public HeapProfileProtoBuilder { + public: + NativeHeapProfileProtoBuilder(JNIEnv *jni_env, jvmtiEnv *jvmti_env, + int64 sampling_rate, ProfileFrameCache *cache) + : HeapProfileProtoBuilder(jni_env, jvmti_env, sampling_rate, cache) {} + + protected: + // In cases of long native frames, we really only want to skip the + // frames_to_skip first frames, which would be something akin to: + // absl::base_internal::MallocHook::InvokeNewHookSlow + // absl::base_internal::MallocHook::InvokeNewHook + // calloc + int SkipTopNativeFrames(const JVMPI_CallTrace &trace) override { + // If frames_to_skip changes, change the number of frames checked against + // kNativeFrameLineNum below to check the correct number of frames. + const int frames_to_skip = 3; + return (trace.num_frames >= frames_to_skip + && trace.frames[0].lineno == kNativeFrameLineNum + && trace.frames[1].lineno == kNativeFrameLineNum + && trace.frames[2].lineno == kNativeFrameLineNum) + ? 
frames_to_skip : 0; } }; class ContentionProfileProtoBuilder : public ProfileProtoBuilder { public: - ContentionProfileProtoBuilder(jvmtiEnv *jvmti_env, - int64 sampling_rate, - ProfileFrameCache *cache) - : ProfileProtoBuilder(jvmti_env, cache, sampling_rate, - ProfileProtoBuilder::SampleType("contentions", - "count"), - ProfileProtoBuilder::SampleType("delay", - "microseconds")) { + ContentionProfileProtoBuilder(JNIEnv *jni_env, jvmtiEnv *jvmti_env, + int64 sampling_rate, ProfileFrameCache *cache) + : ProfileProtoBuilder( + jni_env, jvmti_env, cache, sampling_rate, + ProfileProtoBuilder::SampleType("contentions", "count"), + ProfileProtoBuilder::SampleType("delay", "microseconds")) { builder_.mutable_profile()->set_period(sampling_rate); } std::unique_ptr CreateProto() { - return CreateSampledProto(); + MultiplyBySamplingRate(); + builder_.Finalize(); + return std::unique_ptr(builder_.Consume()); } protected: int SkipTopNativeFrames(const JVMPI_CallTrace &trace) override { return 0; } + + private: + // Multiply the (count, metric) values by the sampling_rate. 
+ void MultiplyBySamplingRate(); }; } // namespace javaprofiler diff --git a/third_party/javaprofiler/profile_test_lib.cc b/third_party/javaprofiler/profile_test_lib.cc index 2683f28cb..9d7d6de07 100644 --- a/third_party/javaprofiler/profile_test_lib.cc +++ b/third_party/javaprofiler/profile_test_lib.cc @@ -56,6 +56,10 @@ static jvmtiError GetMethodName(jvmtiEnv* jvmti, jmethodID method_id, CreateJvmtiString(jvmti, "thirdMethodName", name_str); CreateJvmtiString(jvmti, "()V", sig_str); break; + case 4: + CreateJvmtiString(jvmti, "fourthMethodName$$Lambda$42.42", name_str); + CreateJvmtiString(jvmti, "()V", sig_str); + break; default: ADD_FAILURE() << "Unknown method id in test."; } @@ -75,6 +79,9 @@ static jvmtiError GetClassSignature(jvmtiEnv* jvmti, jclass declaring_class, case 3: CreateJvmtiString(jvmti, "Lcom/google/ThirdClass;", sig_str); break; + case 4: + CreateJvmtiString(jvmti, "Lcom/google/FourthClass;", sig_str); + break; default: ADD_FAILURE() << "Unknown class id in test."; } @@ -101,6 +108,9 @@ static jvmtiError GetSourceFileName(jvmtiEnv *env, jclass klass, case 3: *source_name_ptr = strdup("ThirdClass.java"); break; + case 4: + *source_name_ptr = strdup("FourthClass.java"); + break; default: ADD_FAILURE() << "Unknown class id in test."; } @@ -130,6 +140,39 @@ static jvmtiError GetLineNumberTable(jvmtiEnv *env, jmethodID method, return JVMTI_ERROR_NONE; } +static jvmtiError GetStackTrace(jvmtiEnv* env, jthread thread, jint start_depth, + jint max_frame_count, + jvmtiFrameInfo* frame_buffer, jint* count_ptr) { + uint64 thread_num = reinterpret_cast(thread); + + if (thread_num < 0 || thread_num >= JvmProfileTestLib::GetMaxThreads()) { + *count_ptr = 0; + return JVMTI_ERROR_NONE; + } + + std::vector frames; + + switch (thread_num) { + case 0: + frames.push_back({reinterpret_cast(1), 30}); + frames.push_back({reinterpret_cast(2), 64}); + break; + case 1: + frames.push_back({reinterpret_cast(3), 128}); + break; + } + + jint count = 
std::min(max_frame_count, (jint) frames.size()); + memcpy(frame_buffer, &frames[0], sizeof(*frame_buffer) * count); + *count_ptr = count; + + return JVMTI_ERROR_NONE; +} + +static jvmtiError ForceGarbageCollection(jvmtiEnv* env) { + return JVMTI_ERROR_NONE; +} + struct jvmtiInterface_1_ JvmProfileTestLib::GetDispatchTable() { struct jvmtiInterface_1_ jvmti_dispatch_table; jvmti_dispatch_table.GetMethodName = &GetMethodName; @@ -139,8 +182,20 @@ struct jvmtiInterface_1_ JvmProfileTestLib::GetDispatchTable() { jvmti_dispatch_table.GetLineNumberTable = &GetLineNumberTable; jvmti_dispatch_table.Allocate = &Allocate; jvmti_dispatch_table.Deallocate = &Deallocate; + jvmti_dispatch_table.GetStackTrace = &GetStackTrace; + jvmti_dispatch_table.ForceGarbageCollection = &ForceGarbageCollection; return jvmti_dispatch_table; } +int JvmProfileTestLib::GetMaxThreads() { return 2; } + +jthread JvmProfileTestLib::GetThread(int thread_id) { + if (thread_id < 0 || thread_id >= GetMaxThreads()) { + return 0; + } + + return reinterpret_cast(thread_id); +} + } // namespace javaprofiler } // namespace google diff --git a/third_party/javaprofiler/profile_test_lib.h b/third_party/javaprofiler/profile_test_lib.h index 2f2e32881..f663ef31a 100644 --- a/third_party/javaprofiler/profile_test_lib.h +++ b/third_party/javaprofiler/profile_test_lib.h @@ -17,11 +17,30 @@ #ifndef THIRD_PARTY_JAVAPROFILER_PROFILE_TEST_H__ #define THIRD_PARTY_JAVAPROFILER_PROFILE_TEST_H__ -#include "third_party/java/jdk/include/jvmti.h" +#include + +#include "third_party/javaprofiler/profile_proto_builder.h" +#include "third_party/javaprofiler/stacktrace_decls.h" namespace google { namespace javaprofiler { +class TestProfileFrameCache : public ProfileFrameCache { + void ProcessTraces(const ProfileStackTrace *traces, int num_traces) override { + } + + perftools::profiles::Location *GetLocation( + const JVMPI_CallFrame &jvm_frame, + LocationBuilder *location_builder) override { + return &nop_; + } + + string 
GetFunctionName(const JVMPI_CallFrame &jvm_frame) { return ""; } + + private: + perftools::profiles::Location nop_; +}; + class JvmProfileTestLib { public: static jmethodID GetDroppedFrameMethodId() { @@ -29,6 +48,27 @@ class JvmProfileTestLib { } static struct jvmtiInterface_1_ GetDispatchTable(); + + /** + * Returns a jthread object that can be used by the various JvmProfileTestLib + * methods. It is not a real jthread object but is an identifier to a "fake" + * thread that is mocked to be at a given point in the code. + * ie. GetStackTrace will return something sane. + * + * thread_id should be an integer with 0 < thread_id < N; + * N being the value returned by GetMaxThreads. + * + * implementation dependent on the test framework. + * A return of 0 signifies the thread_id is not supported. + */ + static jthread GetThread(int thread_id); + + /** + * Returns the number of threads that are supported by GetThread and + * subsequent methods that would take a thread as argument to fake + * various threads in action. 
+ */ + static int GetMaxThreads(); }; } // namespace javaprofiler diff --git a/third_party/javaprofiler/stacktraces.cc b/third_party/javaprofiler/stacktraces.cc index d25f727a9..48d7cc450 100644 --- a/third_party/javaprofiler/stacktraces.cc +++ b/third_party/javaprofiler/stacktraces.cc @@ -17,8 +17,6 @@ namespace google { namespace javaprofiler { -__thread JNIEnv *Accessors::env_; -__thread int64_t Accessors::attr_; ASGCTType Asgct::asgct_; std::mutex *AttributeTable::mutex_; @@ -26,23 +24,23 @@ std::unordered_map *AttributeTable::string_map_; std::vector *AttributeTable::strings_; bool AsyncSafeTraceMultiset::Add(int attr, JVMPI_CallTrace *trace) { - uint64_t hash_val = CalculateHash(attr, trace->num_frames, &trace->frames[0]); + uint64 hash_val = CalculateHash(attr, trace->num_frames, &trace->frames[0]); - active_insertions_.fetch_add(1, std::memory_order_acquire); - for (int64_t i = 0; i < MaxEntries(); i++) { - int64_t idx = (i + hash_val) % MaxEntries(); + for (int64 i = 0; i < MaxEntries(); i++) { + int64 idx = (i + hash_val) % MaxEntries(); auto &entry = traces_[idx]; - int64_t count_zero = 0; - int64_t count = entry.count.load(std::memory_order_acquire); + int64 count_zero = 0; + entry.active_updates.fetch_add(1, std::memory_order_acquire); + int64 count = entry.count.load(std::memory_order_acquire); switch (count) { case 0: if (entry.count.compare_exchange_weak(count_zero, kTraceCountLocked, std::memory_order_relaxed)) { // This entry is reserved, there is no danger of interacting - // with Extract, so decrement active_insertions early. - active_insertions_.fetch_add(-1, std::memory_order_release); + // with Extract, so decrement active_updates early. 
+ entry.active_updates.fetch_sub(1, std::memory_order_release); // memcpy is not async safe - JVMPI_CallFrame *fb = frame_buffer_[idx]; + JVMPI_CallFrame *fb = entry.frame_buffer; int num_frames = trace->num_frames; for (int frame_num = 0; frame_num < num_frames; ++frame_num) { fb[frame_num].lineno = trace->frames[frame_num].lineno; @@ -51,7 +49,7 @@ bool AsyncSafeTraceMultiset::Add(int attr, JVMPI_CallTrace *trace) { entry.trace.frames = fb; entry.trace.num_frames = num_frames; entry.attr = attr; - entry.count.store(int64_t(1), std::memory_order_release); + entry.count.store(static_cast(1), std::memory_order_release); return true; } break; @@ -70,25 +68,25 @@ bool AsyncSafeTraceMultiset::Add(int attr, JVMPI_CallTrace *trace) { if (count != kTraceCountLocked && entry.count.compare_exchange_weak(count, count + 1, std::memory_order_relaxed)) { - active_insertions_.fetch_add(-1, std::memory_order_release); + entry.active_updates.fetch_sub(1, std::memory_order_release); return true; } } } + // Did nothing, but we still need storage ordering between this + // store and preceding loads. + entry.active_updates.fetch_sub(1, std::memory_order_release); } - // Did nothing, but we still need storage ordering between this - // store and preceding loads. - active_insertions_.fetch_add(-1, std::memory_order_release); return false; } -int AsyncSafeTraceMultiset::Extract(int location, int64_t *attr, int max_frames, - JVMPI_CallFrame *frames, int64_t *count) { +int AsyncSafeTraceMultiset::Extract(int location, int64 *attr, int max_frames, + JVMPI_CallFrame *frames, int64 *count) { if (location < 0 || location >= MaxEntries()) { return 0; } auto &entry = traces_[location]; - int64_t c = entry.count.load(std::memory_order_acquire); + int64 c = entry.count.load(std::memory_order_acquire); if (c <= 0) { // Unused or in process of being updated, skip for now. 
return 0; @@ -101,22 +99,15 @@ int AsyncSafeTraceMultiset::Extract(int location, int64_t *attr, int max_frames, c = entry.count.exchange(kTraceCountLocked, std::memory_order_acquire); *attr = entry.attr; - bool all_quiet = false; for (int i = 0; i < num_frames; ++i) { frames[i].lineno = entry.trace.frames[i].lineno; frames[i].method_id = entry.trace.frames[i].method_id; - if (all_quiet == false && - active_insertions_.load(std::memory_order_acquire) == 0) { - all_quiet = true; - } } - if (!all_quiet) { - while (active_insertions_.load(std::memory_order_acquire) != 0) { - // spin - // TODO: Introduce a limit to detect and break - // deadlock - } + while (entry.active_updates.load(std::memory_order_acquire) != 0) { + // spin + // TODO: Introduce a limit to detect and break + // deadlock } entry.count.store(0, std::memory_order_release); @@ -124,8 +115,8 @@ int AsyncSafeTraceMultiset::Extract(int location, int64_t *attr, int max_frames, return num_frames; } -void TraceMultiset::Add(int64_t attr, int num_frames, JVMPI_CallFrame *frames, - int64_t count) { +void TraceMultiset::Add(int64 attr, int num_frames, JVMPI_CallFrame *frames, + int64 count) { CallTrace t; t.attr = attr; t.frames = std::vector(frames, frames + num_frames); @@ -140,10 +131,10 @@ void TraceMultiset::Add(int64_t attr, int num_frames, JVMPI_CallFrame *frames, int HarvestSamples(AsyncSafeTraceMultiset *from, TraceMultiset *to) { int trace_count = 0; - int64_t num_traces = from->MaxEntries(); - for (int64_t i = 0; i < num_traces; i++) { + int64 num_traces = from->MaxEntries(); + for (int64 i = 0; i < num_traces; i++) { JVMPI_CallFrame frame[kMaxFramesToCapture]; - int64_t attr, count; + int64 attr, count; int num_frames = from->Extract(i, &attr, kMaxFramesToCapture, &frame[0], &count); @@ -155,10 +146,10 @@ int HarvestSamples(AsyncSafeTraceMultiset *from, TraceMultiset *to) { return trace_count; } -uint64_t CalculateHash(int64_t attr, int num_frames, +uint64 CalculateHash(int64 attr, int num_frames, 
const JVMPI_CallFrame *frame) { // Make hash-value - uint64_t h = attr; + uint64 h = attr; h += h << 10; h ^= h >> 6; for (int i = 0; i < num_frames; i++) { diff --git a/third_party/javaprofiler/stacktraces.h b/third_party/javaprofiler/stacktraces.h index 05427d142..9adbbec2e 100644 --- a/third_party/javaprofiler/stacktraces.h +++ b/third_party/javaprofiler/stacktraces.h @@ -33,7 +33,7 @@ namespace javaprofiler { // Maximum number of frames to store from the stack traces sampled. const int kMaxFramesToCapture = 128; -uint64_t CalculateHash(int64_t attr, int num_frames, +uint64 CalculateHash(int64 attr, int num_frames, const JVMPI_CallFrame *frame); bool Equal(int num_frames, const JVMPI_CallFrame *f1, const JVMPI_CallFrame *f2); @@ -121,8 +121,6 @@ class AsyncSafeTraceMultiset { void Reset() { memset(traces_, 0, sizeof(traces_)); - memset(frame_buffer_, 0, sizeof(frame_buffer_)); - active_insertions_ = 0; } // Add a trace to the set. If it is already present, increment its @@ -135,10 +133,10 @@ class AsyncSafeTraceMultiset { // there is no valid trace at this location. This operation is // thread safe with respect to Add() but only a single call to // Extract can be done at a time. - int Extract(int location, int64_t *attr, int max_frames, - JVMPI_CallFrame *frames, int64_t *count); + int Extract(int location, int64 *attr, int max_frames, + JVMPI_CallFrame *frames, int64 *count); - int64_t MaxEntries() const { return kMaxStackTraces; } + int64 MaxEntries() const { return kMaxStackTraces; } private: struct TraceData { @@ -146,12 +144,16 @@ class AsyncSafeTraceMultiset { // this will represent a sample label. int attr; // trace is a triple containing the JNIEnv and the individual call frames. - // The frames are stored in AsyncSafeTraceMultiset::frame_buffer_ + // The frames are stored in frame_buffer. JVMPI_CallTrace trace; + // frame_buffer is the storage for stack frames. 
+ JVMPI_CallFrame frame_buffer[kMaxFramesToCapture]; // Number of times a trace has been encountered. // 0 indicates that the trace is unused // <0 values are reserved, used for concurrency control. - std::atomic count; + std::atomic count; + // Number of active attempts to increase the counter on the trace. + std::atomic active_updates; }; // TODO: Re-evaluate MaxStackTraces, to minimize storage @@ -160,13 +162,9 @@ class AsyncSafeTraceMultiset { static const int kMaxStackTraces = 2048; // Sentinel to use as trace count while the frames are being updated. - static const int64_t kTraceCountLocked = -1; - - // Number of calls to Add() currently in progress. - std::atomic active_insertions_; + static const int64 kTraceCountLocked = -1; TraceData traces_[kMaxStackTraces]; - JVMPI_CallFrame frame_buffer_[kMaxStackTraces][kMaxFramesToCapture]; DISALLOW_COPY_AND_ASSIGN(AsyncSafeTraceMultiset); }; @@ -179,7 +177,7 @@ class TraceMultiset { private: typedef struct { std::vector frames; - int64_t attr; + int64 attr; } CallTrace; struct CallTraceHash { @@ -201,7 +199,7 @@ class TraceMultiset { } }; - typedef std::unordered_map + typedef std::unordered_map CountMap; public: @@ -209,8 +207,8 @@ class TraceMultiset { // Add a trace to the array. If it is already in the array, // increment its count. - void Add(int64_t attr, int num_frames, JVMPI_CallFrame *frames, - int64_t count); + void Add(int64 attr, int num_frames, JVMPI_CallFrame *frames, + int64 count); typedef CountMap::iterator iterator; typedef CountMap::const_iterator const_iterator; diff --git a/third_party/javaprofiler/tags.cc b/third_party/javaprofiler/tags.cc new file mode 100644 index 000000000..65d8aaff8 --- /dev/null +++ b/third_party/javaprofiler/tags.cc @@ -0,0 +1,195 @@ +/* + * Copyright 2018 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "third_party/javaprofiler/tags.h" + +#include +#include // NOLINT +#include + +#include "third_party/javaprofiler/globals.h" + +namespace google { +namespace javaprofiler { +namespace { + +// Protects the global variables keys and key_to_id. +std::mutex mutex; +const Tags *empty_tags = nullptr; +const AsyncRefCountedString *empty_async_string = nullptr; +// Stores all the keys. +std::vector *keys = nullptr; +// Maps key to its id. The index of a key stored in keys (called id or key_id) +// is used to retrieve a key or value quickly as it is very friendly to vectors. +std::unordered_map *key_to_id = nullptr; + +// Returns the id of the key if the key has been registered. Otherwise, it tries +// to add the key and returns its id upon success or returns -1 if there is no +// extra space. +int32_t RegisterKey(const string &key) { + std::lock_guard lock(mutex); + const auto &target = key_to_id->find(key); + if (target != key_to_id->end()) { + // Key is found and directly return its id. + return target->second; + } + if (keys->size() >= kMaxNumTags) { + // No space to register a new key. + return -1; + } + // Enough space to register a new key. + keys->emplace_back(key); + int32_t id = keys->size() - 1; + (*key_to_id)[key] = id; + return id; +} + +} // namespace + +// The key stored in tags storage aims to support GetAttribute() and +// SetAttribute(). +const char kAttrKey[] = "attr"; + +// Returns the id of the key if the key has been registered; otherwise, returns +// -1. 
+int32_t Tags::GetIdByKey(const string &key) { + std::lock_guard lock(mutex); + auto target = key_to_id->find(key); + if (target == key_to_id->end()) { + return -1; + } + return target->second; +} + +void Tags::AsyncSafeCopy(const Tags &from) { + for (int i = 0; i < kMaxNumTags; i++) { + values_[i].AsyncSafeCopy(from.values_[i]); + } +} + +bool Tags::Set(const string &key, const AsyncRefCountedString &value) { + // All values are stored in the vector "values_" and "key_id" (resolved by + // "key") is used to locate the right position. + int32_t key_id = RegisterKey(key); + if (key_id < 0) { + return false; + } + values_[key_id] = value; + return true; +} + +void Tags::ClearAll() { + for (auto &val : values_) { + val.Reset(); + } +} + +void Tags::AsyncSafeClearAll() { + for (auto &val : values_) { + val.AsyncSafeReset(); + } +} + +bool Tags::operator==(const Tags &other) const { + for (int i = 0; i < kMaxNumTags; i++) { + if (values_[i] != other.values_[i]) { + return false; + } + } + return true; +} + +uint64 Tags::Hash() const { + uint64 h = 0; + for (const auto &val : values_) { + h += val.Hash(); + h += h << 10; + h ^= h >> 6; + } + return h; +} + +const AsyncRefCountedString &Tags::Get(const string &key) const { + int32_t key_id = GetIdByKey(key); + if (key_id >= 0) { + return values_[key_id]; + } + return *empty_async_string; +} + +std::vector> Tags::GetAll() const { + std::lock_guard lock(mutex); + std::vector> all_pairs(keys->size()); + for (int i = 0; i < keys->size(); i++) { + all_pairs[i].first = (*keys)[i]; + all_pairs[i].second = values_[i]; + } + return all_pairs; +} + +void Tags::SetAttribute(int64 value) { + // TODO: Check whether the conversion between integer and string is a + // performance concern. 
+ Set(kAttrKey, AsyncRefCountedString(std::to_string(value))); +} + +int64 Tags::GetAttribute() const { + const string *value = Get(kAttrKey).Get(); + if (value == nullptr) { + return 0; + } + return static_cast(std::stol(*value)); +} + +const Tags &Tags::Empty() { return *empty_tags; } + +bool Tags::Init() { + { + std::lock_guard lock(mutex); + if (empty_tags != nullptr || empty_async_string != nullptr || + keys != nullptr || key_to_id != nullptr) { + return false; + } + empty_async_string = new AsyncRefCountedString(); + keys = new std::vector(); + key_to_id = new std::unordered_map(); + empty_tags = new Tags(); + } + // Register the key "Accessors::kAttrKey" to support Accessors::SetAttribute() + // and Accessors::GetAttribute(). + RegisterKey(kAttrKey); + return true; +} + +bool Tags::Destroy() { + std::lock_guard lock(mutex); + if (empty_tags == nullptr || empty_async_string == nullptr || + keys == nullptr || key_to_id == nullptr) { + return false; + } + delete empty_tags; + empty_tags = nullptr; + delete key_to_id; + key_to_id = nullptr; + delete keys; + keys = nullptr; + delete empty_async_string; + empty_async_string = nullptr; + return true; +} + +} // namespace javaprofiler +} // namespace google diff --git a/third_party/javaprofiler/tags.h b/third_party/javaprofiler/tags.h new file mode 100644 index 000000000..0523c83a1 --- /dev/null +++ b/third_party/javaprofiler/tags.h @@ -0,0 +1,92 @@ +/* + * Copyright 2018 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef THIRD_PARTY_JAVAPROFILER_TAGS_H_
+#define THIRD_PARTY_JAVAPROFILER_TAGS_H_
+#include <string>
+#include <vector>
+
+#include "third_party/javaprofiler/async_ref_counted_string.h"
+
+namespace google {
+namespace javaprofiler {
+
+constexpr int kMaxNumTags = 16;
+
+extern const char kAttrKey[];
+
+// Stores additional attributes as <key, value> pairs for profiles.
+class Tags {
+ public:
+  // Returns the value (in AsyncRefCountedString reference) indexed by "key". Be
+  // cautious that the referenced AsyncRefCountedString may change if modifiers
+  // (e.g., Set(), Clear()) are invoked later.
+  const AsyncRefCountedString &Get(const string &key) const;
+  // Sets the value indexed by "key" to "value". It returns true on success;
+  // otherwise returns false. The failure usually comes from that "key" does not
+  // exist in the key list while there is no space to accommodate this new
+  // "key".
+  bool Set(const string &key, const AsyncRefCountedString &value);
+  // Resets all values to nullptr.
+  void ClearAll();
+  // Returns all the key-value pairs stored in this Tags.
+  std::vector<std::pair<string, AsyncRefCountedString>> GetAll() const;
+
+  // Legacy interfaces. The attribute value is stored internally indexed by a
+  // special key (kAttrKey).
+  void SetAttribute(int64 value);
+  int64 GetAttribute() const;
+
+  // Async-signal-safe version of operator=(const Tags &other). It requires that
+  // the instance should be empty (does not refer to any string). Otherwise, it
+  // asserts an error.
+  void AsyncSafeCopy(const Tags &from);
+  // Async-signal-safe version of ClearAll(). It will succeed if no entry is
+  // removed from the AsyncRefCountedString internal string table. Otherwise, it
+  // asserts an error.
+  void AsyncSafeClearAll();
+  // Async-signal-safe.
+  bool operator==(const Tags &other) const;
+  // Async-signal-safe.
+  uint64 Hash() const;
+
+  // Async-signal-safe.
+  static const Tags &Empty();
+  // Initializes the internal storage for the key table and empty Tags and
+  // AsyncRefCountedString instances. Must be called after
+  // AsyncRefCountedString::Init(), and before using Tags to store any key-value
+  // pair or calling any other static method. Should only be called once,
+  // subsequent calls have no effect and return false. Destroy() should be
+  // called to free the storage.
+  static bool Init();
+  // Frees the internal storage allocated by Init(). Must be called before
+  // AsyncRefCountedString::Destroy(), and after all outstanding Tags objects
+  // are gone. No key-value pair can be stored after Destroy() is called.
+  // Returns false if the storage is not currently allocated.
+  static bool Destroy();
+
+ private:
+  // Returns the id of "key" if "key" is present; otherwise, returns -1.
+  static int32_t GetIdByKey(const string &key);
+
+  AsyncRefCountedString values_[kMaxNumTags];
+
+  friend class TagsTest;
+};
+
+}  // namespace javaprofiler
+}  // namespace google
+#endif  // THIRD_PARTY_JAVAPROFILER_TAGS_H_
diff --git a/third_party/perftools/profiles/proto/builder.cc b/third_party/perftools/profiles/proto/builder.cc
index ec0eac129..fbcfd28db 100644
--- a/third_party/perftools/profiles/proto/builder.cc
+++ b/third_party/perftools/profiles/proto/builder.cc
@@ -18,10 +18,11 @@
 #include <stdio.h>
 #include <string.h>
-#include <unordered_map>
-#include <unordered_set>
 #include <string>
 #include <vector>
+
+#include <unordered_map>
+#include <unordered_set>
 
 #include "glog/logging.h"
 #include "google/protobuf/io/gzip_stream.h"
 #include "google/protobuf/io/zero_copy_stream_impl.h"
@@ -31,6 +32,13 @@
 using google::protobuf::io::GzipOutputStream;
 using google::protobuf::io::FileOutputStream;
 using google::protobuf::RepeatedField;
+namespace perftools {
+namespace profiles {
+typedef std::unordered_map<uint64, int64> IndexMap;
+typedef std::unordered_set<int64> IndexSet;
+}  // namespace profiles
+}  // namespace perftools
+
 
 namespace perftools {
 namespace profiles {
@@ -60,7 +68,8 @@ uint64 Builder::FunctionId(const char *name, const char *system_name,
   int64 system_name_index = StringId(system_name);
   int64 file_index = StringId(file);
 
-  Function fn(name_index, system_name_index, file_index, start_line);
+  auto fn =
+      std::make_tuple(name_index, system_name_index, file_index, start_line);
 
   int64 index = profile_->function_size() + 1;
   const auto inserted = functions_.insert(std::make_pair(fn, index));
@@ -122,7 +131,8 @@ bool Builder::MarshalToFile(const Profile &profile, const char *filename) {
 
 // Returns a bool indicating if the profile is valid. It logs any
 // errors it encounters.
 bool Builder::CheckValid(const Profile &profile) {
-  std::unordered_set<int64> mapping_ids;
+  IndexSet mapping_ids;
+  mapping_ids.reserve(profile.mapping_size());
   for (const auto &mapping : profile.mapping()) {
     const int64 id = mapping.id();
     if (id != 0) {
@@ -134,7 +144,8 @@ bool Builder::CheckValid(const Profile &profile) {
     }
   }
 
-  std::unordered_set<int64> function_ids;
+  IndexSet function_ids;
+  function_ids.reserve(profile.function_size());
   for (const auto &function : profile.function()) {
     const int64 id = function.id();
     if (id != 0) {
@@ -146,7 +157,8 @@ bool Builder::CheckValid(const Profile &profile) {
     }
   }
 
-  std::unordered_set<int64> location_ids;
+  IndexSet location_ids;
+  location_ids.reserve(profile.location_size());
   for (const auto &location : profile.location()) {
     const int64 id = location.id();
     if (id != 0) {
@@ -211,7 +223,8 @@ bool Builder::CheckValid(const Profile &profile) {
 // - Associates locations to the corresponding mappings.
 bool Builder::Finalize() {
   if (profile_->location_size() == 0) {
-    std::unordered_map<uint64, int64> address_to_id;
+    IndexMap address_to_id;
+    address_to_id.reserve(profile_->sample_size());
     for (auto &sample : *profile_->mutable_sample()) {
       // Copy sample locations into a temp vector, and then clear and
       // repopulate it with the corresponding location IDs.
diff --git a/third_party/perftools/profiles/proto/builder.h b/third_party/perftools/profiles/proto/builder.h
index 810f6792f..e0f1915bb 100644
--- a/third_party/perftools/profiles/proto/builder.h
+++ b/third_party/perftools/profiles/proto/builder.h
@@ -20,7 +20,9 @@
 #include <memory>
 #include <string>
 #include <tuple>
+
 #include <unordered_map>
+
 
 namespace perftools {
 namespace profiles {
@@ -28,8 +30,26 @@
 typedef int64_t int64;
 typedef uint64_t uint64;
 typedef std::string string;
-}
-}
+typedef std::unordered_map<string, int64> StringIndexMap;
+
+class FunctionHasher {
+ public:
+  size_t operator()(const std::tuple<int64, int64, int64, int64> &f) const {
+    int64 hash = std::get<0>(f);
+    hash = hash + ((hash << 8) ^ std::get<1>(f));
+    hash = hash + ((hash << 8) ^ std::get<2>(f));
+    hash = hash + ((hash << 8) ^ std::get<3>(f));
+    return static_cast<size_t>(hash);
+  }
+};
+
+typedef std::unordered_map<std::tuple<int64, int64, int64, int64>, int64,
+                           FunctionHasher>
+    FunctionIndexMap;
+
+}  // namespace profiles
+}  // namespace perftools
+
 #include "perftools/profiles/proto/profile.pb.h"
 
 namespace perftools {
@@ -100,22 +120,9 @@
   Profile *mutable_profile() { return profile_.get(); }
 
  private:
-  // Holds the information about a function to facilitate deduplication.
-  typedef std::tuple<int64, int64, int64, int64> Function;
-  class FunctionHasher {
-   public:
-    size_t operator()(const Function &f) const {
-      int64 hash = std::get<0>(f);
-      hash = hash + ((hash << 8) ^ std::get<1>(f));
-      hash = hash + ((hash << 8) ^ std::get<2>(f));
-      hash = hash + ((hash << 8) ^ std::get<3>(f));
-      return static_cast<size_t>(hash);
-    }
-  };
-
-  // Hashes to deduplicate strings and functions.
-  std::unordered_map<string, int64> strings_;
-  std::unordered_map<Function, int64, FunctionHasher> functions_;
+  // Maps to deduplicate strings and functions.
+  StringIndexMap strings_;
+  FunctionIndexMap functions_;
 
   // Actual profile being updated.
std::unique_ptr<Profile> profile_; diff --git a/third_party/perftools/profiles/proto/profile.proto b/third_party/perftools/profiles/proto/profile.proto index ed6fb2b83..e59e4562b 100644 --- a/third_party/perftools/profiles/proto/profile.proto +++ b/third_party/perftools/profiles/proto/profile.proto @@ -69,7 +69,7 @@ message Profile { repeated string string_table = 6; // frames with Function.function_name fully matching the following // regexp will be dropped from the samples, along with their successors. - int64 drop_frames = 7; // Index into string table. + int64 drop_frames = 7; // Index into string table. // frames with Function.function_name fully matching the following // regexp will be kept, even if it matches drop_functions. int64 keep_frames = 8; // Index into string table. @@ -81,13 +81,13 @@ int64 time_nanos = 9; // Duration of the profile, if a duration makes sense. int64 duration_nanos = 10; - // The kind of events between sampled ocurrences. + // The kind of events between sampled occurrences. // e.g [ "cpu","cycles" ] or [ "heap","bytes" ] ValueType period_type = 11; // The number of events between sampled occurrences. int64 period = 12; // Freeform text associated to the profile. - repeated int64 comment = 13; // Indices into string table. + repeated int64 comment = 13; // Indices into string table. // Index into the string table of the type of the preferred sample // value. If unset, clients should default to the last sample value. int64 default_sample_type = 14; @@ -95,8 +95,8 @@ // ValueType describes the semantics and measurement units of a value. message ValueType { - int64 type = 1; // Index into string table. - int64 unit = 2; // Index into string table. + int64 type = 1; // Index into string table. + int64 unit = 2; // Index into string table. } // Each Sample records values encountered in some program message Sample { // entry in Profile.sample_type.
All samples must have the same // number of values, the same as the length of Profile.sample_type. // When aggregating multiple samples into a single sample, the - // result has a list of values that is the elemntwise sum of the + // result has a list of values that is the element-wise sum of the // lists of the originals. repeated int64 value = 2; // label includes additional context for this sample. It can include @@ -120,10 +120,10 @@ message Sample { } message Label { - int64 key = 1; // Index into string table + int64 key = 1; // Index into string table // At most one of the following must be present - int64 str = 2; // Index into string table + int64 str = 2; // Index into string table int64 num = 3; // Should only be present when num is present. @@ -203,12 +203,12 @@ message Function { // Unique nonzero id for the function. uint64 id = 1; // Name of the function, in human-readable form if available. - int64 name = 2; // Index into string table + int64 name = 2; // Index into string table // Name of the function, as identified by the system. // For instance, it can be a C++ mangled name. - int64 system_name = 3; // Index into string table + int64 system_name = 3; // Index into string table // Source file containing the function. - int64 filename = 4; // Index into string table + int64 filename = 4; // Index into string table // Line number in source file. int64 start_line = 5; }