apache · kou · Nov 28, 2021 · Nov 30, 2021 · Nov 30, 2021 · Nov 30, 2021
diff --git a/.clang-format b/.clang-format
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 ---
-BasedOnStyle:  Google
-DerivePointerAlignment: false
+BasedOnStyle: Google
 ColumnLimit: 90
+DerivePointerAlignment: false
+IncludeBlocks: Preserve
diff --git a/.env b/.env
@@ -51,7 +51,7 @@ FEDORA=33
 UBUNTU=20.04
 
 # Default versions for various dependencies
-CLANG_TOOLS=8
+CLANG_TOOLS=12
 CUDA=9.1
 DASK=latest
 DOTNET=3.1

diff --git a/ci/docker/ubuntu-20.04-cpp.dockerfile b/ci/docker/ubuntu-20.04-cpp.dockerfile
@@ -29,20 +29,27 @@ RUN echo "debconf debconf/frontend select Noninteractive" | \
 # while debugging package list with docker build.
 ARG clang_tools
 ARG llvm
-RUN if [ "${llvm}" -gt "10" ]; then \
+RUN latest_system_llvm=10 && \
+    if [ ${llvm} -gt ${latest_system_llvm} -o \
+         ${clang_tools} -gt ${latest_system_llvm} ]; then \
       apt-get update -y -q && \
       apt-get install -y -q --no-install-recommends \
           apt-transport-https \
           ca-certificates \
           gnupg \
+          lsb-release \
           wget && \
       wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - && \
-      echo "deb https://apt.llvm.org/focal/ llvm-toolchain-focal-${llvm} main" > \
-         /etc/apt/sources.list.d/llvm.list && \
-      if [ "${clang_tools}" != "${llvm}" -a "${clang_tools}" -gt 10 ]; then \
-        echo "deb https://apt.llvm.org/focal/ llvm-toolchain-focal-${clang_tools} main" > \
+      code_name=$(lsb_release --codename --short) && \
+      if [ ${llvm} -gt 10 ]; then \
+        echo "deb https://apt.llvm.org/${code_name}/ llvm-toolchain-${code_name}-${llvm} main" > \
+           /etc/apt/sources.list.d/llvm.list; \
+      fi && \
+      if [ ${clang_tools} -ne ${llvm} -a \
+           ${clang_tools} -gt ${latest_system_llvm} ]; then \
+        echo "deb https://apt.llvm.org/${code_name}/ llvm-toolchain-${code_name}-${clang_tools} main" > \
            /etc/apt/sources.list.d/clang-tools.list; \
-      fi \
+      fi; \
     fi && \
     apt-get update -y -q && \
     apt-get install -y -q --no-install-recommends \

diff --git a/ci/docker/ubuntu-21.04-cpp.dockerfile b/ci/docker/ubuntu-21.04-cpp.dockerfile
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
-ARG base=amd64/ubuntu:20.04
+ARG base=amd64/ubuntu:21.04
 FROM ${base}
 
 SHELL ["/bin/bash", "-o", "pipefail", "-c"]
@@ -29,20 +29,27 @@ RUN echo "debconf debconf/frontend select Noninteractive" | \
 # while debugging package list with docker build.
 ARG clang_tools
 ARG llvm
-RUN if [ "${llvm}" -gt "10" ]; then \
+RUN latest_system_llvm=12 && \
+    if [ ${llvm} -gt ${latest_system_llvm} -o \
+         ${clang_tools} -gt ${latest_system_llvm} ]; then \
       apt-get update -y -q && \
       apt-get install -y -q --no-install-recommends \
           apt-transport-https \
           ca-certificates \
           gnupg \
+          lsb-release \
           wget && \
       wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - && \
-      echo "deb https://apt.llvm.org/hirsute/ llvm-toolchain-hirsute-${llvm} main" > \
-         /etc/apt/sources.list.d/llvm.list && \
-      if [ "${clang_tools}" != "${llvm}" -a "${clang_tools}" -gt 10 ]; then \
-        echo "deb https://apt.llvm.org/hirsute/ llvm-toolchain-hirsute-${clang_tools} main" > \
+      code_name=$(lsb_release --codename --short) && \
+      if [ ${llvm} -gt 10 ]; then \
+        echo "deb https://apt.llvm.org/${code_name}/ llvm-toolchain-${code_name}-${llvm} main" > \
+           /etc/apt/sources.list.d/llvm.list; \
+      fi && \
+      if [ ${clang_tools} -ne ${llvm} -a \
+           ${clang_tools} -gt ${latest_system_llvm} ]; then \
+        echo "deb https://apt.llvm.org/${code_name}/ llvm-toolchain-${code_name}-${clang_tools} main" > \
            /etc/apt/sources.list.d/clang-tools.list; \
-      fi \
+      fi; \
     fi && \
     apt-get update -y -q && \
     apt-get install -y -q --no-install-recommends \

diff --git a/cpp/src/arrow/compute/exec/expression_test.cc b/cpp/src/arrow/compute/exec/expression_test.cc
@@ -1222,8 +1222,9 @@ TEST(Expression, SingleComparisonGuarantees) {
             all = false;
           }
         }
-        Simplify{filter}.WithGuarantee(guarantee).Expect(
-            all ? literal(true) : none ? literal(false) : filter);
+        Simplify{filter}.WithGuarantee(guarantee).Expect(all    ? literal(true)
+                                                         : none ? literal(false)
+                                                                : filter);
       }
     }
   }

diff --git a/cpp/src/arrow/compute/exec/hash_join_node_test.cc b/cpp/src/arrow/compute/exec/hash_join_node_test.cc
@@ -281,8 +281,10 @@ struct RandomDataTypeConstraints {
 
   void OnlyInt(int int_size, bool allow_nulls) {
     Default();
-    data_type_enabled_mask =
-        int_size == 8 ? kInt8 : int_size == 4 ? kInt4 : int_size == 2 ? kInt2 : kInt1;
+    data_type_enabled_mask = int_size == 8   ? kInt8
+                             : int_size == 4 ? kInt4
+                             : int_size == 2 ? kInt2
+                                             : kInt1;
     if (!allow_nulls) {
       max_null_probability = 0.0;
     }
@@ -1166,12 +1168,12 @@ void TestHashJoinDictionaryHelper(
     int expected_num_r_no_match,
     // Whether to swap two inputs to the hash join
     bool swap_sides) {
-  int64_t l_length = l_key.is_array()
-                         ? l_key.array()->length
-                         : l_payload.is_array() ? l_payload.array()->length : -1;
-  int64_t r_length = r_key.is_array()
-                         ? r_key.array()->length
-                         : r_payload.is_array() ? r_payload.array()->length : -1;
+  int64_t l_length = l_key.is_array()       ? l_key.array()->length
+                     : l_payload.is_array() ? l_payload.array()->length
+                                            : -1;
+  int64_t r_length = r_key.is_array()       ? r_key.array()->length
+                     : r_payload.is_array() ? r_payload.array()->length
+                                            : -1;
   ARROW_DCHECK(l_length >= 0 && r_length >= 0);
 
   constexpr int batch_multiplicity_for_parallel = 2;

diff --git a/cpp/src/arrow/compute/exec/key_encode.cc b/cpp/src/arrow/compute/exec/key_encode.cc
@@ -572,10 +572,14 @@ void KeyEncoder::EncoderBinaryPair::Decode(uint32_t start_row, uint32_t num_rows
 
   uint32_t col_width1 = col_prep[0].metadata().fixed_length;
   uint32_t col_width2 = col_prep[1].metadata().fixed_length;
-  int log_col_width1 =
-      col_width1 == 8 ? 3 : col_width1 == 4 ? 2 : col_width1 == 2 ? 1 : 0;
-  int log_col_width2 =
-      col_width2 == 8 ? 3 : col_width2 == 4 ? 2 : col_width2 == 2 ? 1 : 0;
+  int log_col_width1 = col_width1 == 8   ? 3
+                       : col_width1 == 4 ? 2
+                       : col_width1 == 2 ? 1
+                                         : 0;
+  int log_col_width2 = col_width2 == 8   ? 3
+                       : col_width2 == 4 ? 2
+                       : col_width2 == 2 ? 1
+                                         : 0;
 
   bool is_row_fixed_length = rows.metadata().is_fixed_length;
 

diff --git a/cpp/src/arrow/compute/exec/key_map.h b/cpp/src/arrow/compute/exec/key_map.h
@@ -153,8 +153,10 @@ class SwissTable {
 
   static int num_groupid_bits_from_log_blocks(int log_blocks) {
     int required_bits = log_blocks + 3;
-    return required_bits <= 8 ? 8
-                              : required_bits <= 16 ? 16 : required_bits <= 32 ? 32 : 64;
+    return required_bits <= 8    ? 8
+           : required_bits <= 16 ? 16
+           : required_bits <= 32 ? 32
+                                 : 64;
   }
 
   // Use 32-bit hash for now

diff --git a/cpp/src/arrow/io/memory_benchmark.cc b/cpp/src/arrow/io/memory_benchmark.cc
@@ -49,10 +49,10 @@ using VectorType = __m512i;
 #define VectorSet _mm512_set1_epi32
 #define VectorLoad _mm512_stream_load_si512
 #define VectorLoadAsm(SRC, DST) \
-  asm volatile("vmovaps %[src], %[dst]" : [ dst ] "=v"(DST) : [ src ] "m"(SRC) :)
+  asm volatile("vmovaps %[src], %[dst]" : [dst] "=v"(DST) : [src] "m"(SRC) :)
 #define VectorStreamLoad _mm512_stream_load_si512
 #define VectorStreamLoadAsm(SRC, DST) \
-  asm volatile("vmovntdqa %[src], %[dst]" : [ dst ] "=v"(DST) : [ src ] "m"(SRC) :)
+  asm volatile("vmovntdqa %[src], %[dst]" : [dst] "=v"(DST) : [src] "m"(SRC) :)
 #define VectorStreamWrite _mm512_stream_si512
 
 #else
@@ -63,10 +63,10 @@ using VectorType = __m256i;
 #define VectorSet _mm256_set1_epi32
 #define VectorLoad _mm256_stream_load_si256
 #define VectorLoadAsm(SRC, DST) \
-  asm volatile("vmovaps %[src], %[dst]" : [ dst ] "=v"(DST) : [ src ] "m"(SRC) :)
+  asm volatile("vmovaps %[src], %[dst]" : [dst] "=v"(DST) : [src] "m"(SRC) :)
 #define VectorStreamLoad _mm256_stream_load_si256
 #define VectorStreamLoadAsm(SRC, DST) \
-  asm volatile("vmovntdqa %[src], %[dst]" : [ dst ] "=v"(DST) : [ src ] "m"(SRC) :)
+  asm volatile("vmovntdqa %[src], %[dst]" : [dst] "=v"(DST) : [src] "m"(SRC) :)
 #define VectorStreamWrite _mm256_stream_si256
 
 #else  // ARROW_HAVE_AVX2 not set
@@ -75,10 +75,10 @@ using VectorType = __m128i;
 #define VectorSet _mm_set1_epi32
 #define VectorLoad _mm_stream_load_si128
 #define VectorLoadAsm(SRC, DST) \
-  asm volatile("movaps %[src], %[dst]" : [ dst ] "=x"(DST) : [ src ] "m"(SRC) :)
+  asm volatile("movaps %[src], %[dst]" : [dst] "=x"(DST) : [src] "m"(SRC) :)
 #define VectorStreamLoad _mm_stream_load_si128
 #define VectorStreamLoadAsm(SRC, DST) \
-  asm volatile("movntdqa %[src], %[dst]" : [ dst ] "=x"(DST) : [ src ] "m"(SRC) :)
+  asm volatile("movntdqa %[src], %[dst]" : [dst] "=x"(DST) : [src] "m"(SRC) :)
 #define VectorStreamWrite _mm_stream_si128
 
 #endif  // ARROW_HAVE_AVX2
@@ -164,22 +164,22 @@ using VectorTypeDual = uint8x16x2_t;
 
 static void armv8_stream_load_pair(VectorType* src, VectorType* dst) {
   asm volatile("LDNP %[reg1], %[reg2], [%[from]]\n\t"
-               : [ reg1 ] "+r"(*dst), [ reg2 ] "+r"(*(dst + 1))
-               : [ from ] "r"(src));
+               : [reg1] "+r"(*dst), [reg2] "+r"(*(dst + 1))
+               : [from] "r"(src));
 }
 
 static void armv8_stream_store_pair(VectorType* src, VectorType* dst) {
   asm volatile("STNP %[reg1], %[reg2], [%[to]]\n\t"
-               : [ to ] "+r"(dst)
-               : [ reg1 ] "r"(*src), [ reg2 ] "r"(*(src + 1))
+               : [to] "+r"(dst)
+               : [reg1] "r"(*src), [reg2] "r"(*(src + 1))
                : "memory");
 }
 
 static void armv8_stream_ldst_pair(VectorType* src, VectorType* dst) {
   asm volatile(
       "LDNP q1, q2, [%[from]]\n\t"
       "STNP q1, q2, [%[to]]\n\t"
-      : [ from ] "+r"(src), [ to ] "+r"(dst)
+      : [from] "+r"(src), [to] "+r"(dst)
       :
       : "memory", "v0", "v1", "v2", "v3");
 }

diff --git a/cpp/src/arrow/testing/util.cc b/cpp/src/arrow/testing/util.cc
@@ -38,8 +38,8 @@
 #endif
 
 #include "arrow/table.h"
-#include "arrow/type.h"
 #include "arrow/testing/random.h"
+#include "arrow/type.h"
 #include "arrow/util/io_util.h"
 #include "arrow/util/logging.h"
 #include "arrow/util/pcg_random.h"

diff --git a/cpp/src/arrow/util/basic_decimal.cc b/cpp/src/arrow/util/basic_decimal.cc
@@ -1353,9 +1353,9 @@ bool operator<(const BasicDecimal256& left, const BasicDecimal256& right) {
   const auto rhs_le = bit_util::little_endian::Make(right.native_endian_array());
   return lhs_le[3] != rhs_le[3]
              ? static_cast<int64_t>(lhs_le[3]) < static_cast<int64_t>(rhs_le[3])
-             : lhs_le[2] != rhs_le[2] ? lhs_le[2] < rhs_le[2]
-                                      : lhs_le[1] != rhs_le[1] ? lhs_le[1] < rhs_le[1]
-                                                               : lhs_le[0] < rhs_le[0];
+         : lhs_le[2] != rhs_le[2] ? lhs_le[2] < rhs_le[2]
+         : lhs_le[1] != rhs_le[1] ? lhs_le[1] < rhs_le[1]
+                                  : lhs_le[0] < rhs_le[0];
 }
 
 BasicDecimal256 operator-(const BasicDecimal256& operand) {

diff --git a/cpp/src/gandiva/expression_registry.cc b/cpp/src/gandiva/expression_registry.cc
@@ -60,8 +60,8 @@ FunctionSignature ExpressionRegistry::FunctionSignatureIterator::operator*() {
   return *func_sig_it_;
 }
 
-ExpressionRegistry::func_sig_iterator_type ExpressionRegistry::FunctionSignatureIterator::
-operator++(int increment) {
+ExpressionRegistry::func_sig_iterator_type
+ExpressionRegistry::FunctionSignatureIterator::operator++(int increment) {
   ++func_sig_it_;
   // point func_sig_it_ to first signature of next nativefunction if func_sig_it_ is
   // pointing to end

diff --git a/cpp/src/parquet/arrow/reconstruct_internal_test.cc b/cpp/src/parquet/arrow/reconstruct_internal_test.cc
@@ -113,9 +113,10 @@ class FileBuilder {
     auto typed_writer =
         checked_cast<TypedColumnWriter<PhysicalType<TYPE>>*>(column_writer);
 
-    const int64_t num_values = static_cast<int64_t>(
-        (max_def_level > 0) ? def_levels.size()
-                            : (max_rep_level > 0) ? rep_levels.size() : values.size());
+    const int64_t num_values =
+        static_cast<int64_t>((max_def_level > 0)   ? def_levels.size()
+                             : (max_rep_level > 0) ? rep_levels.size()
+                                                   : values.size());
     const int64_t values_written = typed_writer->WriteBatch(
         num_values, LevelPointerOrNull(def_levels, max_def_level),
         LevelPointerOrNull(rep_levels, max_rep_level), values.data());

diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml
@@ -1186,7 +1186,6 @@ tasks:
     params:
       env:
         UBUNTU: 21.04
-        CLANG_TOOLS: 9 # can remove this when >=9 is the default
         R_PRUNE_DEPS: TRUE
       flags: '-e ARROW_SOURCE_HOME="/arrow" -e FORCE_BUNDLED_BUILD=TRUE -e LIBARROW_BUILD=TRUE'
       image: ubuntu-r-only-r
@@ -1198,7 +1197,6 @@ tasks:
     params:
       env:
         UBUNTU: 21.04
-        CLANG_TOOLS: 9 # can remove this when >=9 is the default
         GCC_VERSION: 11
       # S3 support is not buildable with gcc11 right now
       flags: '-e ARROW_SOURCE_HOME="/arrow" -e FORCE_BUNDLED_BUILD=TRUE -e LIBARROW_BUILD=TRUE -e ARROW_S3=OFF'

diff --git a/docs/source/developers/cpp/building.rst b/docs/source/developers/cpp/building.rst
@@ -398,7 +398,7 @@ CMake:
 LLVM and Clang Tools
 ~~~~~~~~~~~~~~~~~~~~
 
-We are currently using LLVM 8 for library builds and for other developer tools
+We are currently using LLVM for library builds and for other developer tools
 such as code formatting with ``clang-format``. LLVM can be installed via most
 modern package managers (apt, yum, conda, Homebrew, vcpkg, chocolatey).
 

diff --git a/docs/source/developers/cpp/development.rst b/docs/source/developers/cpp/development.rst
@@ -105,9 +105,11 @@ following checks:
   subcommand to :ref:`Archery <archery>`. This can also be fixed locally
   by running ``archery lint --cpplint --fix``.
 
-In order to account for variations in the behavior of ``clang-format`` between
-major versions of LLVM, we pin the version of ``clang-format`` used (current
-LLVM 8).
+In order to account for variations in the behavior of ``clang-format``
+between major versions of LLVM, we pin the version of ``clang-format``
+used. You can confirm the current pinned version by finding
+the ``CLANG_TOOLS`` variable value in `.env
+<https://github.com/apache/arrow/blob/master/.env>`_.
 
 Depending on how you installed clang-format, the build system may not be able
 to find it. You can provide an explicit path to your LLVM installation (or the

diff --git a/r/lint.sh b/r/lint.sh
@@ -26,7 +26,16 @@ SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 CPP_BUILD_SUPPORT=$SOURCE_DIR/../cpp/build-support
 
 # Run clang-format
-: ${CLANG_FORMAT:=$(. "${SOURCE_DIR}/../.env" && echo clang-format-${CLANG_TOOLS})}
+if [ -z "${CLANG_FORMAT:-}" ]; then
+  CLANG_TOOLS=$(. "${SOURCE_DIR}/../.env" && echo ${CLANG_TOOLS})
+  if type clang-format-${CLANG_TOOLS} >/dev/null 2>&1; then
+    CLANG_FORMAT=clang-format-${CLANG_TOOLS}
+  elif type brew >/dev/null 2>&1; then
+    CLANG_FORMAT=$(brew --prefix llvm@${CLANG_TOOLS})/bin/clang-format
+  else
+    CLANG_FORMAT=clang-format
+  fi
+fi
 $CPP_BUILD_SUPPORT/run_clang_format.py \
     --clang_format_binary=$CLANG_FORMAT \
     --exclude_glob=$CPP_BUILD_SUPPORT/lint_exclusions.txt \

diff --git a/r/src/.clang-format b/r/src/.clang-format
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 ---
-BasedOnStyle:  Google
-DerivePointerAlignment: false
+BasedOnStyle: Google
 ColumnLimit: 90
+DerivePointerAlignment: false
+IncludeBlocks: Preserve
diff --git a/r/src/nameof.h b/r/src/nameof.h
@@ -55,9 +55,9 @@ const size_t typename_prefix = search(raw<double>(), "double");
 template <typename T>
 size_t struct_class_prefix() {
 #ifdef _MSC_VER
-  return starts_with(raw<T>() + typename_prefix, "struct ")
-             ? 7
-             : starts_with(raw<T>() + typename_prefix, "class ") ? 6 : 0;
+  return starts_with(raw<T>() + typename_prefix, "struct ")  ? 7
+         : starts_with(raw<T>() + typename_prefix, "class ") ? 6
+                                                             : 0;
 #else
   return 0;
 #endif

diff --git a/r/vignettes/developers/workflow.Rmd b/r/vignettes/developers/workflow.Rmd
@@ -87,16 +87,18 @@ Fix any style issues before committing with
 ./lint.sh --fix
 ```
 
-The lint script requires Python 3 and `clang-format-8`. If the command
+The lint script requires Python 3 and `clang-format`. If the command
 isn't found, you can explicitly provide the path to it like:
 
 ```bash
-CLANG_FORMAT=$(which clang-format-8) ./lint.sh
+CLANG_FORMAT=/opt/llvm/bin/clang-format ./lint.sh
 ```
 
-On macOS, you can get this by installing LLVM via Homebrew and running the script as:
+You can see what version of `clang-format` is required by the following
+command:
+
 ```bash
-CLANG_FORMAT=$(brew --prefix llvm@8)/bin/clang-format ./lint.sh
+(. ../.env && echo ${CLANG_TOOLS})
 ```
 
 _Note_ that the lint script requires Python 3 and the Python dependencies