diff --git a/dev/archery/archery/benchmark/compare.py b/dev/archery/archery/benchmark/compare.py
index bf9811f2c8d..8bacac02420 100644
--- a/dev/archery/archery/benchmark/compare.py
+++ b/dev/archery/archery/benchmark/compare.py
@@ -105,18 +105,17 @@ def __init__(self, contender, baseline, threshold=DEFAULT_THRESHOLD):
         self.baseline = baseline
         self.threshold = threshold
 
-    def comparisons(self, suite_filter=None, benchmark_filter=None):
-        """
-        """
-        contender = self.contender.suites(suite_filter, benchmark_filter)
-        baseline = self.baseline.suites(suite_filter, benchmark_filter)
+    @property
+    def comparisons(self):
+        contender = self.contender.suites
+        baseline = self.baseline.suites
         suites = pairwise_compare(contender, baseline)
 
         for suite_name, (suite_cont, suite_base) in suites:
             benchmarks = pairwise_compare(
                 suite_cont.benchmarks, suite_base.benchmarks)
 
-            for bench_name, (bench_cont, bench_base) in benchmarks:
+            for _, (bench_cont, bench_base) in benchmarks:
                 yield BenchmarkComparator(bench_cont, bench_base,
                                           threshold=self.threshold,
                                           suite_name=suite_name)
diff --git a/dev/archery/archery/benchmark/google.py b/dev/archery/archery/benchmark/google.py
index d6efb77dc22..bd2793eb4e8 100644
--- a/dev/archery/archery/benchmark/google.py
+++ b/dev/archery/archery/benchmark/google.py
@@ -18,6 +18,7 @@ from itertools import filterfalse, groupby, tee
 import json
 import subprocess
+from tempfile import NamedTemporaryFile
 
 from .core import Benchmark
 from ..utils.command import Command
@@ -49,13 +50,16 @@ def list_benchmarks(self):
         return str.splitlines(result.stdout.decode("utf-8"))
 
     def results(self):
-        argv = ["--benchmark_format=json", "--benchmark_repetitions=20"]
+        with NamedTemporaryFile() as out:
+            argv = ["--benchmark_repetitions=20",
+                    f"--benchmark_out={out.name}",
+                    "--benchmark_out_format=json"]
 
-        if self.benchmark_filter:
-            argv.append(f"--benchmark_filter={self.benchmark_filter}")
+            if self.benchmark_filter:
+                argv.append(f"--benchmark_filter={self.benchmark_filter}")
 
-        return json.loads(self.run(*argv, stdout=subprocess.PIPE,
-                                   stderr=subprocess.PIPE).stdout)
+            self.run(*argv, check=True)
+            return json.load(out)
 
 
 class GoogleBenchmarkObservation:
diff --git a/dev/archery/archery/benchmark/runner.py b/dev/archery/archery/benchmark/runner.py
index 7dc56bdf096..dbbb3f5713c 100644
--- a/dev/archery/archery/benchmark/runner.py
+++ b/dev/archery/archery/benchmark/runner.py
@@ -16,6 +16,7 @@
 # under the License.
 
 import glob
+import json
 import os
 import re
@@ -34,14 +35,94 @@ def regex_filter(re_expr):
 
 
 class BenchmarkRunner:
-    def suites(self, suite_filter=None, benchmark_filter=None):
+    def __init__(self, suite_filter=None, benchmark_filter=None):
+        self.suite_filter = suite_filter
+        self.benchmark_filter = benchmark_filter
+
+    @property
+    def suites(self):
         raise NotImplementedError("BenchmarkRunner must implement suites")
 
+    @staticmethod
+    def from_rev_or_path(src, root, rev_or_path, cmake_conf, **kwargs):
+        """ Returns a BenchmarkRunner from a path or a git revision.
+
+        First, it checks if `rev_or_path` is a valid path (or string) of a
+        json object that deserializes to a BenchmarkRunner. If so, it
+        initializes a StaticBenchmarkRunner from it. This allows memoizing
+        the result of a run in a file or a string.
+
+        Second, it checks if `rev_or_path` points to a valid CMake build
+        directory. If so, it creates a CppBenchmarkRunner with this existing
+        CMakeBuild.
+
+        Otherwise, it assumes `rev_or_path` is a revision; it clones/checks
+        out the given revision and creates a fresh CMakeBuild.
+ """ + build = None + if StaticBenchmarkRunner.is_json_result(rev_or_path): + return StaticBenchmarkRunner.from_json(rev_or_path, **kwargs) + elif CMakeBuild.is_build_dir(rev_or_path): + build = CMakeBuild.from_path(rev_or_path) + return CppBenchmarkRunner(build, **kwargs) + else: + root_rev = os.path.join(root, rev_or_path) + os.mkdir(root_rev) + + clone_dir = os.path.join(root_rev, "arrow") + # Possibly checkout the sources at given revision, no need to + # perform cleanup on cloned repository as root_rev is reclaimed. + src_rev, _ = src.at_revision(rev_or_path, clone_dir) + cmake_def = CppCMakeDefinition(src_rev.cpp, cmake_conf) + build_dir = os.path.join(root_rev, "build") + return CppBenchmarkRunner(cmake_def.build(build_dir), **kwargs) + + +class StaticBenchmarkRunner(BenchmarkRunner): + """ Run suites from a (static) set of suites. """ + + def __init__(self, suites, **kwargs): + self._suites = suites + super().__init__(**kwargs) + + @property + def suites(self): + suite_fn = regex_filter(self.suite_filter) + benchmark_fn = regex_filter(self.benchmark_filter) + + for suite in (s for s in self._suites if suite_fn(s.name)): + benchmarks = [b for b in suite.benchmarks if benchmark_fn(b.name)] + yield BenchmarkSuite(suite.name, benchmarks) + + @classmethod + def is_json_result(cls, path_or_str): + builder = None + try: + builder = cls.from_json(path_or_str) + except BaseException: + pass + + return builder is not None + + @staticmethod + def from_json(path_or_str, **kwargs): + # breaks recursive imports + from ..utils.codec import BenchmarkRunnerCodec + path_or_str, json_load = (open(path_or_str), json.load) \ + if os.path.isfile(path_or_str) else (path_or_str, json.loads) + return BenchmarkRunnerCodec.decode(json_load(path_or_str), **kwargs) + + def __repr__(self): + return f"BenchmarkRunner[suites={list(self.suites)}]" + class CppBenchmarkRunner(BenchmarkRunner): - def __init__(self, build): + """ Run suites from a CMakeBuild. """ + + def __init__(self, build, **kwargs): """ Initialize a CppBenchmarkRunner. """ self.build = build + super().__init__(**kwargs) @property def suites_binaries(self): @@ -52,9 +133,9 @@ def suites_binaries(self): glob_expr = os.path.join(self.build.binaries_dir, "*-benchmark") return {os.path.basename(b): b for b in glob.glob(glob_expr)} - def suite(self, name, suite_bin, benchmark_filter): + def suite(self, name, suite_bin): """ Returns the resulting benchmarks for a given suite. """ - suite_cmd = GoogleBenchmarkCommand(suite_bin, benchmark_filter) + suite_cmd = GoogleBenchmarkCommand(suite_bin, self.benchmark_filter) # Ensure there will be data benchmark_names = suite_cmd.list_benchmarks() @@ -65,9 +146,10 @@ def suite(self, name, suite_bin, benchmark_filter): benchmarks = GoogleBenchmark.from_json(results.get("benchmarks")) return BenchmarkSuite(name, benchmarks) - def suites(self, suite_filter=None, benchmark_filter=None): + @property + def suites(self): """ Returns all suite for a runner. 
""" - suite_matcher = regex_filter(suite_filter) + suite_matcher = regex_filter(self.suite_filter) suite_and_binaries = self.suites_binaries for suite_name in suite_and_binaries: @@ -76,8 +158,7 @@ def suites(self, suite_filter=None, benchmark_filter=None): continue suite_bin = suite_and_binaries[suite_name] - suite = self.suite(suite_name, suite_bin, - benchmark_filter=benchmark_filter) + suite = self.suite(suite_name, suite_bin) # Filter may exclude all benchmarks if not suite: @@ -85,30 +166,3 @@ def suites(self, suite_filter=None, benchmark_filter=None): continue yield suite - - @staticmethod - def from_rev_or_path(src, root, rev_or_path, cmake_conf): - """ Returns a CppBenchmarkRunner from a path or a git revision. - - First, it checks if `rev_or_path` points to a valid CMake build - directory. If so, it creates a CppBenchmarkRunner with this existing - CMakeBuild. - - Otherwise, it assumes `rev_or_path` is a revision and clone/checkout - the given revision and create a fresh CMakeBuild. - """ - build = None - if CMakeBuild.is_build_dir(rev_or_path): - build = CMakeBuild.from_path(rev_or_path) - else: - root_rev = os.path.join(root, rev_or_path) - os.mkdir(root_rev) - - clone_dir = os.path.join(root_rev, "arrow") - # Possibly checkout the sources at given revision, no need to - # perform cleanup on cloned repository as root_rev is reclaimed. - src_rev, _ = src.at_revision(rev_or_path, clone_dir) - cmake_def = CppCMakeDefinition(src_rev.cpp, cmake_conf) - build = cmake_def.build(os.path.join(root_rev, "build")) - - return CppBenchmarkRunner(build) diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index 4fa88966080..b7b077ab87c 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -24,7 +24,7 @@ from tempfile import mkdtemp, TemporaryDirectory from .benchmark.compare import RunnerComparator, DEFAULT_THRESHOLD -from .benchmark.runner import CppBenchmarkRunner +from .benchmark.runner import BenchmarkRunner from .lang.cpp import CppCMakeDefinition, CppConfiguration from .utils.codec import JsonEncoder from .utils.logger import logger, ctx as log_ctx @@ -167,7 +167,77 @@ def benchmark(ctx): pass -@benchmark.command(name="diff", short_help="Run the C++ benchmark suite") +@benchmark.command(name="run", short_help="Run benchmark suite") +@click.option("--src", metavar="", show_default=True, + default=ArrowSources.find(), + callback=validate_arrow_sources, + help="Specify Arrow source directory") +@click.option("--suite-filter", metavar="", show_default=True, + type=str, default=None, help="Regex filtering benchmark suites.") +@click.option("--benchmark-filter", metavar="", show_default=True, + type=str, default=DEFAULT_BENCHMARK_FILTER, + help="Regex filtering benchmark suites.") +@click.option("--preserve", type=bool, default=False, show_default=True, + is_flag=True, help="Preserve workspace for investigation.") +@click.option("--output", metavar="", + type=click.File("w", encoding="utf8"), default="-", + help="Capture output result into file.") +@click.option("--cmake-extras", type=str, multiple=True, + help="Extra flags/options to pass to cmake invocation. " + "Can be stacked") +@click.argument("baseline", metavar="[]]", default="master", + required=False) +@click.pass_context +def benchmark_run(ctx, src, preserve, suite_filter, benchmark_filter, + output, cmake_extras, baseline): + """ Run benchmark suite. + + This command will run the benchmark suite for a single build. This is + used to capture (and/or publish) the results. 
diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py
index 4fa88966080..b7b077ab87c 100644
--- a/dev/archery/archery/cli.py
+++ b/dev/archery/archery/cli.py
@@ -24,7 +24,7 @@ from tempfile import mkdtemp, TemporaryDirectory
 
 from .benchmark.compare import RunnerComparator, DEFAULT_THRESHOLD
-from .benchmark.runner import CppBenchmarkRunner
+from .benchmark.runner import BenchmarkRunner
 from .lang.cpp import CppCMakeDefinition, CppConfiguration
 from .utils.codec import JsonEncoder
 from .utils.logger import logger, ctx as log_ctx
@@ -167,7 +167,77 @@ def benchmark(ctx):
     pass
 
 
-@benchmark.command(name="diff", short_help="Run the C++ benchmark suite")
+@benchmark.command(name="run", short_help="Run benchmark suite")
+@click.option("--src", metavar="<arrow_src>", show_default=True,
+              default=ArrowSources.find(),
+              callback=validate_arrow_sources,
+              help="Specify Arrow source directory")
+@click.option("--suite-filter", metavar="<regex>", show_default=True,
+              type=str, default=None, help="Regex filtering benchmark suites.")
+@click.option("--benchmark-filter", metavar="<regex>", show_default=True,
+              type=str, default=DEFAULT_BENCHMARK_FILTER,
+              help="Regex filtering benchmarks.")
+@click.option("--preserve", type=bool, default=False, show_default=True,
+              is_flag=True, help="Preserve workspace for investigation.")
+@click.option("--output", metavar="<output>",
+              type=click.File("w", encoding="utf8"), default="-",
+              help="Capture output result into file.")
+@click.option("--cmake-extras", type=str, multiple=True,
+              help="Extra flags/options to pass to cmake invocation. "
+              "Can be stacked")
+@click.argument("baseline", metavar="[<baseline>]", default="master",
+                required=False)
+@click.pass_context
+def benchmark_run(ctx, src, preserve, suite_filter, benchmark_filter,
+                  output, cmake_extras, baseline):
+    """ Run benchmark suite.
+
+    This command will run the benchmark suite for a single build. This is
+    used to capture (and/or publish) the results.
+
+    The caller can optionally specify a target which is either a git revision
+    (commit, tag, special values like HEAD) or a CMake build directory.
+
+
+    When a commit is referenced, a local clone of the arrow sources (specified
+    via --src) is performed and the proper branch is created. This is done in
+    a temporary directory which can be left intact with the `--preserve` flag.
+
+    The special token "WORKSPACE" is reserved to specify the current git
+    workspace. This implies that no clone will be performed.
+
+    Examples:
+
+    \b
+    # Run the benchmarks on the current git workspace
+    \b
+    archery benchmark run
+
+    \b
+    # Run the benchmarks on the previous commit
+    \b
+    archery benchmark run HEAD~1
+
+    \b
+    # Run the benchmarks and save the results into `run.json`
+    \b
+    archery benchmark run --output=run.json
+    """
+    with tmpdir(preserve) as root:
+        logger.debug(f"Running benchmark {baseline}")
+
+        conf = CppConfiguration(
+            build_type="release", with_tests=True, with_benchmarks=True,
+            with_python=False, cmake_extras=cmake_extras)
+
+        runner_base = BenchmarkRunner.from_rev_or_path(
+            src, root, baseline, conf,
+            suite_filter=suite_filter, benchmark_filter=benchmark_filter)
+
+        json.dump(runner_base, output, cls=JsonEncoder)
+
+
+@benchmark.command(name="diff", short_help="Compare benchmark suites")
 @click.option("--src", metavar="<arrow_src>", show_default=True,
               default=ArrowSources.find(),
               callback=validate_arrow_sources,
               help="Specify Arrow source directory")
@@ -182,6 +252,9 @@
 @click.option("--threshold", type=float, default=DEFAULT_THRESHOLD,
               show_default=True,
               help="Regression failure threshold in percentage.")
+@click.option("--output", metavar="<output>",
+              type=click.File("w", encoding="utf8"), default="-",
+              help="Capture output result into file.")
 @click.option("--cmake-extras", type=str, multiple=True,
               help="Extra flags/options to pass to cmake invocation. "
               "Can be stacked")
@@ -191,7 +264,7 @@
               required=False)
 @click.pass_context
 def benchmark_diff(ctx, src, preserve, suite_filter, benchmark_filter,
-                   threshold, cmake_extras, contender, baseline):
+                   threshold, output, cmake_extras, contender, baseline):
     """ Compare (diff) benchmark runs.
 
     This command acts like git-diff but for benchmark results.
@@ -245,6 +318,22 @@ def benchmark_diff(ctx, src, preserve, suite_filter, benchmark_filter,
     \b
     archery benchmark diff --suite-filter="^arrow-compute-aggregate" \\
         --benchmark-filter="(Sum|Mean)Kernel"
+
+    \b
+    # Capture the result in file `result.json`
+    \b
+    archery benchmark diff --output=result.json
+    \b
+    # Equivalently, with no stdout clutter.
+    archery --quiet benchmark diff > result.json
+
+    \b
+    # Compare with cached results from `archery benchmark run`
+    \b
+    archery benchmark run --output=run.json HEAD~1
+    \b
+    # This should not recompute the benchmarks from run.json
+    archery --quiet benchmark diff WORKSPACE run.json > result.json
     """
     with tmpdir(preserve) as root:
         logger.debug(f"Comparing {contender} (contender) with "
@@ -254,18 +343,18 @@ def benchmark_diff(ctx, src, preserve, suite_filter, benchmark_filter,
             build_type="release", with_tests=True, with_benchmarks=True,
             with_python=False, cmake_extras=cmake_extras)
 
-        runner_cont = CppBenchmarkRunner.from_rev_or_path(
-            src, root, contender, conf)
-        runner_base = CppBenchmarkRunner.from_rev_or_path(
-            src, root, baseline, conf)
-
-        runner_comp = RunnerComparator(runner_cont, runner_base, threshold)
-        comparisons = runner_comp.comparisons(suite_filter, benchmark_filter)
+        runner_cont = BenchmarkRunner.from_rev_or_path(
+            src, root, contender, conf,
+            suite_filter=suite_filter, benchmark_filter=benchmark_filter)
+        runner_base = BenchmarkRunner.from_rev_or_path(
+            src, root, baseline, conf,
+            suite_filter=suite_filter, benchmark_filter=benchmark_filter)
 
         regressions = 0
-        for comparator in comparisons:
+        runner_comp = RunnerComparator(runner_cont, runner_base, threshold)
+        for comparator in runner_comp.comparisons:
             regressions += comparator.regression
-            print(json.dumps(comparator, cls=JsonEncoder))
+            json.dump(comparator, output, cls=JsonEncoder)
 
         sys.exit(regressions)
diff --git a/dev/archery/archery/utils/codec.py b/dev/archery/archery/utils/codec.py
index 612f2df5605..abb91a075b4 100644
--- a/dev/archery/archery/utils/codec.py
+++ b/dev/archery/archery/utils/codec.py
@@ -18,26 +18,85 @@
 
 import json
 
+from ..benchmark.core import Benchmark, BenchmarkSuite
+from ..benchmark.runner import BenchmarkRunner, StaticBenchmarkRunner
 from ..benchmark.compare import BenchmarkComparator
 
 
 class JsonEncoder(json.JSONEncoder):
     def default(self, o):
+        if isinstance(o, Benchmark):
+            return BenchmarkCodec.encode(o)
+
+        if isinstance(o, BenchmarkSuite):
+            return BenchmarkSuiteCodec.encode(o)
+
+        if isinstance(o, BenchmarkRunner):
+            return BenchmarkRunnerCodec.encode(o)
+
         if isinstance(o, BenchmarkComparator):
-            comparator = {
-                "benchmark": o.name,
-                "change": o.change,
-                "regression": o.regression,
-                "baseline": o.baseline.value,
-                "contender": o.contender.value,
-                "unit": o.unit,
-                "less_is_better": o.less_is_better,
-            }
-
-            suite_name = o.suite_name
-            if suite_name:
-                comparator["suite"] = suite_name
-
-            return comparator
+            return BenchmarkComparatorCodec.encode(o)
 
         return json.JSONEncoder.default(self, o)
+
+
+class BenchmarkCodec:
+    @staticmethod
+    def encode(b):
+        return {
+            "name": b.name,
+            "unit": b.unit,
+            "less_is_better": b.less_is_better,
+            "values": b.values.tolist(),
+        }
+
+    @staticmethod
+    def decode(dct, **kwargs):
+        return Benchmark(**dct, **kwargs)
+
+
+class BenchmarkSuiteCodec:
+    @staticmethod
+    def encode(bs):
+        return {
+            "name": bs.name,
+            "benchmarks": [BenchmarkCodec.encode(b) for b in bs.benchmarks]
+        }
+
+    @staticmethod
+    def decode(dct, **kwargs):
+        benchmarks = [BenchmarkCodec.decode(b)
+                      for b in dct.pop("benchmarks", [])]
+        return BenchmarkSuite(benchmarks=benchmarks, **dct, **kwargs)
+
+
+class BenchmarkRunnerCodec:
+    @staticmethod
+    def encode(br):
+        return {"suites": [BenchmarkSuiteCodec.encode(s) for s in br.suites]}
+
+    @staticmethod
+    def decode(dct, **kwargs):
+        suites = [BenchmarkSuiteCodec.decode(s)
+                  for s in dct.pop("suites", [])]
+        return StaticBenchmarkRunner(suites=suites, **dct, **kwargs)
+
+
+class BenchmarkComparatorCodec:
+    @staticmethod
+    def encode(bc):
+        comparator = {
+            "benchmark": bc.name,
+            "change": bc.change,
+            "regression": bc.regression,
+            "baseline": bc.baseline.value,
+            "contender": bc.contender.value,
+            "unit": bc.unit,
+            "less_is_better": bc.less_is_better,
+        }
+
+        suite_name = bc.suite_name
+        if suite_name:
+            comparator["suite"] = suite_name
+
+        return comparator
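The encode/decode pairs above are symmetric, which is what lets `StaticBenchmarkRunner.from_json` replay a previous run; a minimal round-trip sketch (the `runner` variable is assumed to be any `BenchmarkRunner`, e.g. one obtained via `from_rev_or_path`):

.. code-block:: python

   import json

   from archery.benchmark.runner import StaticBenchmarkRunner
   from archery.utils.codec import BenchmarkRunnerCodec, JsonEncoder

   # Encoding iterates `runner.suites`, so for a CppBenchmarkRunner this
   # is the point where the benchmarks actually execute.
   payload = json.dumps(runner, cls=JsonEncoder)

   # Decoding always yields a StaticBenchmarkRunner, which serves the
   # memoized suites without rebuilding or re-running anything.
   replayed = BenchmarkRunnerCodec.decode(json.loads(payload))
   assert isinstance(replayed, StaticBenchmarkRunner)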
diff --git a/docs/source/developers/benchmarks.rst b/docs/source/developers/benchmarks.rst
index d0e6f1b8aa4..2ae767bdda6 100644
--- a/docs/source/developers/benchmarks.rst
+++ b/docs/source/developers/benchmarks.rst
@@ -42,12 +42,31 @@ the module files but uses the actual sources.
    # optional: enable bash/zsh autocompletion
    eval "$(_ARCHERY_COMPLETE=source archery)"
 
+Running the benchmark suite
+===========================
+
+The benchmark suites can be run with the ``benchmark run`` sub-command.
+
+.. code-block:: shell
+
+   # Run benchmarks in the current git workspace
+   archery benchmark run
+   # Store the results in a file
+   archery benchmark run --output=run.json
+
+Sometimes, it is required to pass custom CMake flags, e.g.
+
+.. code-block:: shell
+
+   export CC=clang-8 CXX=clang++-8
+   archery benchmark run --cmake-extras="-DARROW_USE_SIMD=ON"
+
 Comparison
 ==========
 
 One goal with benchmarking is to detect performance regressions. To this end,
 ``archery`` implements a benchmark comparison facility via the ``benchmark
-diff`` command.
+diff`` sub-command.
 
 In the default invocation, it will compare the current source (known as the
 current workspace in git) with local master branch.
@@ -59,8 +78,8 @@ Iterating efficiently
 ~~~~~~~~~~~~~~~~~~~~~
 
 Iterating with benchmark development can be a tedious process due to long
-build time and long run times. ``archery benchmark diff`` provides 2 methods
-to reduce this overhead.
+build times and long run times. Multiple tricks can be used with
+``archery benchmark diff`` to reduce this overhead.
 
 First, the benchmark command supports comparing existing build
 directories. This can be paired with the ``--preserve`` flag to
 avoid rebuilding sources from zero.
@@ -77,7 +96,18 @@ avoid rebuilding sources from zero.
    # Re-run benchmark in the previously created build directory.
    archery benchmark diff /tmp/arrow-bench*/{WORKSPACE,master}/build
 
-Second, the benchmark command supports filtering suites (``--suite-filter``)
+Second, a benchmark run result can be saved in a json file. This avoids not
+only rebuilding the sources, but also re-executing the (sometimes) heavy
+benchmarks. This technique can be used as a poor man's cache.
+
+.. code-block:: shell
+
+   # Run the benchmarks on a given commit and save the result
+   archery benchmark run --output=run-head-1.json HEAD~1
+   # Compare the previously captured result with HEAD
+   archery benchmark diff HEAD run-head-1.json
+
+Third, the benchmark command supports filtering suites (``--suite-filter``)
 and benchmarks (``--benchmark-filter``), both options support regular
 expressions.
@@ -89,8 +119,6 @@ expressions.
        --suite-filter=compute-aggregate --benchmark-filter=Kernel \
        /tmp/arrow-bench*/{WORKSPACE,master}/build
 
-Both methods can be combined.
-
 Regression detection
 ====================
@@ -110,6 +138,16 @@ Writing a benchmark
    does not fit in memory (L2/L3), the benchmark will be memory bound
    instead of CPU bound. In this case, the input can be downsized.
 
+4. By default, Google's benchmark library reports the cputime metric, which
+   is the total time spent on the CPU by all threads of the process. By
+   contrast, realtime is the wall-clock time, i.e. the difference
+   end_time - start_time. In a single-threaded benchmark, cputime is
+   preferable since it is less affected by context switching. In a
+   multi-threaded benchmark, cputime gives misleading results since it is
+   inflated by the number of threads and can be far off the realtime. Thus,
+   if the benchmark is multi-threaded, it might be better to use
+   ``UseRealTime()``, see this `example `.
+
 Scripting
 =========
 
@@ -125,3 +163,10 @@ output. This can be controlled/avoided with the ``--quiet`` option, e.g.
    {"benchmark": "BenchSumKernel/32768/0", "change": -0.6498, "regression": true, ...
    {"benchmark": "BenchSumKernel/32768/1", "change": 0.01553, "regression": false, ...
    ...
+
+or the ``--output`` option can be used, e.g.
+
+.. code-block:: shell
+
+   archery benchmark diff --benchmark-filter=Kernel --output=compare.json
+   ...
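Downstream tooling can then consume the captured file directly. A minimal sketch (the ``iter_comparators`` helper and the ``compare.json`` path are illustrative only, not part of archery; the decoder loop tolerates both newline-separated and concatenated JSON objects):

.. code-block:: python

   import json

   def iter_comparators(path):
       """ Yield each comparator object stored in an archery output file. """
       decoder = json.JSONDecoder()
       with open(path) as f:
           buf = f.read()
       pos, end = 0, len(buf)
       while pos < end:
           # skip whitespace between consecutive objects
           while pos < end and buf[pos].isspace():
               pos += 1
           if pos == end:
               break
           obj, pos = decoder.raw_decode(buf, pos)
           yield obj

   regressions = [c for c in iter_comparators("compare.json")
                  if c["regression"]]
   print(f"{len(regressions)} regression(s) detected")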