Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
06b80ce
add bolt backend in gluten
Oct 10, 2025
0fd3370
Add parent project information to pom.xml
taiyang-li Dec 11, 2025
92119e6
test commit
taiyang-li Dec 11, 2025
8ef118c
fix style
taiyang-li Dec 11, 2025
1ad78da
Refine build script
kexianda Dec 15, 2025
0fa674f
register SparkExprToSubfieldFilterParser
guhaiyan0221 Dec 22, 2025
e74e874
fix style
guhaiyan0221 Dec 24, 2025
762b340
Add Dockerfile for bolt backend
kexianda Dec 22, 2025
03ce1b8
Refine Makefile
kexianda Dec 25, 2025
4e8e1cb
Optimize out-of-the-box parameters
guhaiyan0221 Dec 29, 2025
3aaa02c
overwrite batchsize default value
WangGuangxin Dec 30, 2025
d4ee706
add make arrow instruction
taiyang-li Jan 7, 2026
c37a0e6
add docker instructions
taiyang-li Jan 7, 2026
f7c7d30
fix S3 compile error
guhaiyan0221 Jan 7, 2026
bfb8fb1
add Bolt.md
guhaiyan0221 Jan 8, 2026
0988a35
add bolt-spark-configuration.md
guhaiyan0221 Jan 8, 2026
1870bcf
add BoltStageResourceAdj.md
guhaiyan0221 Jan 8, 2026
67ee1d9
bolt-backend-generator-function-support.md
guhaiyan0221 Jan 8, 2026
315ff2b
add aggregate-function/scalar-function/window-function/write-configur…
guhaiyan0221 Jan 8, 2026
4eacd71
add bolt-function-development-guide.md BoltFileSystem.md BoltLocalCac…
guhaiyan0221 Jan 8, 2026
5f0d7f6
add velox-to-bolt-migration-guide.md
guhaiyan0221 Jan 16, 2026
407f7be
add bolt-quick-start.md
guhaiyan0221 Jan 19, 2026
cf64eb3
align gtest version wiht bolt
kexianda Jan 21, 2026
6f75c3d
Remove outputType init logic
WangGuangxin Jan 26, 2026
f021682
fix: symbols conflicts with other JNI libraries
kexianda Feb 11, 2026
68f26b0
fix: avoid flatten twice in write
fzhedu Mar 3, 2026
847a271
fix compilation error caused by renaming RegisterGCSFileSystem.h to R…
guhaiyan0221 Mar 11, 2026
47eff98
[VL] Support mapping columns by position index for ORC and Parquet fi…
kevinwilfong Oct 15, 2025
970ab69
[fix] pass the full tables schema when creating HiveTableHandle for o…
markjin1990 Mar 5, 2026
ad59e35
Remove __cxa_throw hook in gluten
kexianda Mar 11, 2026
6dbdc7c
Add support for paimon from master
ZacBlanco Feb 18, 2026
f54de19
Remove unused data type cases in BoltBackend
VvanFalleaves Mar 13, 2026
faf6d0d
fix: dont push down paimon metadata column filters
ZacBlanco Mar 18, 2026
0229207
remove unused sort_before_repartition for round robin shuffle
zhangxffff Mar 16, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -70,3 +70,9 @@ dist/
metastore_db/

.ipynb_checkpoints
cpp/gluten.conan.graph.html
**/version/version.h
.bolt-build-info.properties
cpp/gluten.conan.graph.html

output/**
4 changes: 2 additions & 2 deletions LICENSE-binary
Original file line number Diff line number Diff line change
Expand Up @@ -241,8 +241,8 @@ BSD 3-Clause
------------

com.thoughtworks.paranamer:paranamer
io.glutenproject:protobuf-java
io.glutenproject:protobuf-java-util
org.apache.gluten:protobuf-java
org.apache.gluten:protobuf-java-util
org.eclipse.collections:eclipse-collections
org.eclipse.collections:eclipse-collections-api

Expand Down
216 changes: 216 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,216 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Absolute directory of this Makefile; $(dir ...) leaves a trailing '/'.
ROOT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
# CMake build tree, and the directory the conan commands are run from.
BUILD_DIR := $(ROOT_DIR)/cpp/build
CONAN_FILE_DIR := $(ROOT_DIR)/cpp/

# Default build type; the release/debug wrapper targets pass their own value.
BUILD_TYPE = Debug

# True/False switches, overridable from the command line or the environment.
# ENABLE_ASAN, BUILD_BENCHMARKS, BUILD_TESTS and BUILD_EXAMPLES are forwarded
# to Conan as gluten options by the `build` target.
ENABLE_ASAN      ?= False
LDB_BUILD        ?= False
BUILD_BENCHMARKS ?= False
BUILD_TESTS      ?= False
BUILD_EXAMPLES   ?= False
BUILD_ORC        ?= False
ENABLE_PROTON    ?= False

# Conan package reference: <name>/<version>@<user>/<channel>.
# Versions default to `main`; user and channel default to empty.
GLUTEN_BUILD_VERSION ?= main
BOLT_BUILD_VERSION   ?= main
BUILD_USER           ?=
BUILD_CHANNEL        ?=

# Storage connectors forwarded to Conan as enable_hdfs / enable_s3.
ENABLE_HDFS ?= True
ENABLE_S3   ?= False
# NOTE(review): RSS_PROFILE is not referenced by any recipe in this Makefile.
RSS_PROFILE ?= ''

# Requesting the benchmarks forces BUILD_ORC on.
ifeq (True,$(BUILD_BENCHMARKS))
  BUILD_ORC = True
endif

# Machine architecture as reported by `arch`, with x86_64 spelled as amd64.
ARCH := $(patsubst x86_64,amd64,$(shell arch))

# True: build the shared library and run the CMake install step.
# False: package the build with `conan export-pkg` instead (see `build`).
SHARED_LIBRARY ?= True

# Number of parallel jobs used for the Conan/CMake build and for ctest.
#
# Precedence (first one that is set wins):
#   1. NUM_THREADS given on the command line or in the environment
#   2. CI_NUM_THREADS
#   3. BOLT_NUM_THREADS, e.g. `export BOLT_NUM_THREADS=50`
#   4. half of the logical CPUs listed in /proc/cpuinfo
ifndef CI_NUM_THREADS
ifdef BOLT_NUM_THREADS
NUM_THREADS ?= $(BOLT_NUM_THREADS)
else
ifeq ($(origin NUM_THREADS),undefined)
# Computed once at parse time (':=') instead of on every expansion.
# The result is clamped to >= 1: a single-CPU host would otherwise yield
# `-j 0`, and a host without /proc/cpuinfo would yield an empty operand.
NUM_THREADS := $(shell n=$$(grep -c ^processor /proc/cpuinfo 2>/dev/null); n=$$(( $${n:-2} / 2 )); [ "$$n" -ge 1 ] || n=1; echo $$n)
endif
endif
else
NUM_THREADS ?= $(CI_NUM_THREADS)
endif

# JDK major versions this Makefile accepts.
ALLOWED_VERSIONS := 11 17

# JAVA_HOME is mandatory for every goal: this check runs at parse time.
ifeq ($(JAVA_HOME),)
  $(error ERROR: JAVA_HOME is not set)
endif

# The version is probed only when JAVA_HOME holds both `java` and `javac`.
# The sed expression extracts the major version and drops a legacy "1." prefix
# (so "1.8.0_322" gives 8 and "17.0.2" gives 17).
ifneq ($(and $(wildcard $(JAVA_HOME)/bin/java),$(wildcard $(JAVA_HOME)/bin/javac)),)
  JDK_VERSION := $(shell $(JAVA_HOME)/bin/java -version 2>&1 | sed -n 's/.*version "\(1\.\)\{0,1\}\([0-9]\+\).*/\2/p')
  ifneq ($(filter $(JDK_VERSION),$(ALLOWED_VERSIONS)),$(JDK_VERSION))
    $(error ERROR: JDK version $(JDK_VERSION) is not supported, only 11 and 17 are allowed now)
  endif
endif

# Every target in this Makefile names a command rather than a file it creates,
# so all of them are declared phony. Without this, a file or directory that
# happens to share a target's name (for example `build` or `test`) would make
# that target look up to date and its recipe would be skipped.
.PHONY: bolt-recipe build release debug RelWithDebInfo \
        clean clean_cpp install_debug install_release \
        release-with-tests debug-with-tests \
        release-with-benchmarks debug-with-benchmarks \
        release-with-tests-and-benchmarks debug-with-tests-and-benchmarks \
        arrow jar jar-skip-check spark32-las fast-jar zip fast-zip \
        jar_spark33 jar_spark34 jar_spark35 \
        test test_spark35 cpp-test-release cpp-test-debug

# Fetch Bolt at BOLT_BUILD_VERSION into ep/bolt (replacing any previous
# checkout), run its install-bolt-deps.sh script, then export the Bolt recipe
# with `conan export` under the configured version/user/channel.
bolt-recipe:
	@echo "Install Bolt recipe into local cache"
	rm -rf ep/bolt
	git clone --depth=1 --branch $(BOLT_BUILD_VERSION) \
		https://github.com/bytedance/bolt.git ep/bolt
	bash ep/bolt/scripts/install-bolt-deps.sh
	conan export ep/bolt/conanfile.py \
		--name=bolt --version=$(BOLT_BUILD_VERSION) \
		--user=$(BUILD_USER) --channel=$(BUILD_CHANNEL)
	@echo "Bolt recipe has been installed"

# Generic native build; the release/debug/... wrapper targets call this with a
# specific BUILD_TYPE. The whole recipe is one `&&` chain in a single shell, so
# the `cd` and `export` below stay in effect for every later step:
#   1. create ${BUILD_DIR} and ${BUILD_DIR}/releases
#   2. cd into ${CONAN_FILE_DIR} and export BOLT_BUILD_VERSION to child processes
#      (presumably read by the Conan recipe -- confirm in cpp/conanfile.py)
#   3. collect the gluten Conan options into the shell variable ALL_CONAN_OPTIONS
#   4. write the dependency graph as HTML to gluten.conan.graph.html
#   5. `conan install`, building missing packages from source; llvm-core is
#      always taken as Release regardless of BUILD_TYPE
#   6. configure with the preset conan-<build type in lower case> and build
#      with NUM_THREADS jobs
#   7. SHARED_LIBRARY=True  -> run the CMake `install` target
#      SHARED_LIBRARY=False -> `conan export-pkg` the build
#   8. cd back to the previous directory
build:
mkdir -p ${BUILD_DIR} && mkdir -p ${BUILD_DIR}/releases &&\
cd ${CONAN_FILE_DIR} && export BOLT_BUILD_VERSION=${BOLT_BUILD_VERSION} &&\
ALL_CONAN_OPTIONS=" -o gluten/*:shared=${SHARED_LIBRARY} \
-o gluten/*:enable_hdfs=${ENABLE_HDFS} \
-o gluten/*:enable_s3=${ENABLE_S3} \
-o gluten/*:enable_asan=${ENABLE_ASAN} \
-o gluten/*:build_benchmarks=${BUILD_BENCHMARKS} \
-o gluten/*:build_tests=${BUILD_TESTS} \
-o gluten/*:build_examples=${BUILD_EXAMPLES} " && \
conan graph info . --name=gluten --version=${GLUTEN_BUILD_VERSION} --user=${BUILD_USER} --channel=${BUILD_CHANNEL} -c "arrow/*:tools.build:download_source=True" $${ALL_CONAN_OPTIONS} --format=html > gluten.conan.graph.html && \
NUM_THREADS=$(NUM_THREADS) conan install . --name=gluten --version=${GLUTEN_BUILD_VERSION} --user=${BUILD_USER} --channel=${BUILD_CHANNEL} \
-s llvm-core/*:build_type=Release -s build_type=${BUILD_TYPE} --build=missing $${ALL_CONAN_OPTIONS} && \
cmake --preset `echo conan-${BUILD_TYPE} | tr A-Z a-z` && \
cmake --build build/${BUILD_TYPE} -j $(NUM_THREADS) && \
if [ "${SHARED_LIBRARY}" = "True" ]; then cmake --build ${BUILD_DIR}/${BUILD_TYPE} --target install ; fi && \
if [ "${SHARED_LIBRARY}" = "False" ]; then \
conan export-pkg . --name=gluten --version=${GLUTEN_BUILD_VERSION} --user=${BUILD_USER} --channel=${BUILD_CHANNEL} -s build_type=${BUILD_TYPE} \
$${ALL_CONAN_OPTIONS} ; \
fi && cd -

# Release build: re-invokes make on `build` with BUILD_TYPE=Release and
# forwards the version and Conan user/channel settings.
release:
	$(MAKE) build BUILD_TYPE=Release \
		GLUTEN_BUILD_VERSION=$(GLUTEN_BUILD_VERSION) \
		BOLT_BUILD_VERSION=$(BOLT_BUILD_VERSION) \
		BUILD_USER=$(BUILD_USER) \
		BUILD_CHANNEL=$(BUILD_CHANNEL)

# Debug build: re-invokes make on `build` with BUILD_TYPE=Debug and forwards
# the version and Conan user/channel settings.
debug:
	$(MAKE) build BUILD_TYPE=Debug \
		GLUTEN_BUILD_VERSION=$(GLUTEN_BUILD_VERSION) \
		BOLT_BUILD_VERSION=$(BOLT_BUILD_VERSION) \
		BUILD_USER=$(BUILD_USER) \
		BUILD_CHANNEL=$(BUILD_CHANNEL)

# Optimized build with debug info: re-invokes make on `build` with
# BUILD_TYPE=RelWithDebInfo. BOLT_BUILD_VERSION is forwarded explicitly, the
# same way the release and debug wrappers do, so all build-type wrappers pass
# an identical set of settings.
RelWithDebInfo:
	$(MAKE) build BUILD_TYPE=RelWithDebInfo \
		GLUTEN_BUILD_VERSION=$(GLUTEN_BUILD_VERSION) \
		BOLT_BUILD_VERSION=$(BOLT_BUILD_VERSION) \
		BUILD_USER=$(BUILD_USER) \
		BUILD_CHANNEL=$(BUILD_CHANNEL)

# Delete the native build tree and the Conan/CMake files left next to it.
# Note: the second command uses paths relative to the current directory.
clean_cpp:
	rm -rf $(ROOT_DIR)/cpp/build
	rm -f cpp/conan.lock cpp/conaninfo.txt cpp/graph_info.json CMakeCache.txt

# Clean native rebuild in Debug mode with SHARED_LIBRARY=False, which makes
# `build` finish with `conan export-pkg` instead of the CMake install step.
install_debug:
	$(MAKE) clean_cpp && $(MAKE) debug SHARED_LIBRARY=False

# Clean native rebuild in Release mode with SHARED_LIBRARY=False, which makes
# `build` finish with `conan export-pkg` instead of the CMake install step.
install_release:
	$(MAKE) clean_cpp && $(MAKE) release SHARED_LIBRARY=False

# Release build with BUILD_TESTS=True.
release-with-tests:
	$(MAKE) build BUILD_TYPE=Release \
		GLUTEN_BUILD_VERSION=$(GLUTEN_BUILD_VERSION) \
		BOLT_BUILD_VERSION=$(BOLT_BUILD_VERSION) \
		BUILD_USER=$(BUILD_USER) \
		BUILD_CHANNEL=$(BUILD_CHANNEL) \
		BUILD_TESTS=True

# Debug build with BUILD_TESTS=True.
debug-with-tests:
	$(MAKE) build BUILD_TYPE=Debug \
		GLUTEN_BUILD_VERSION=$(GLUTEN_BUILD_VERSION) \
		BOLT_BUILD_VERSION=$(BOLT_BUILD_VERSION) \
		BUILD_USER=$(BUILD_USER) \
		BUILD_CHANNEL=$(BUILD_CHANNEL) \
		BUILD_TESTS=True

# Release build with BUILD_BENCHMARKS=True (which also switches BUILD_ORC on,
# see the conditional near the top of this file).
# The variable must be spelled BUILD_USER without a space: `B UILD_USER=...`
# hands the sub-make an extra goal named `B` plus an unrelated variable, and
# the build then stops with "No rule to make target 'B'".
release-with-benchmarks:
	$(MAKE) build BUILD_TYPE=Release \
		GLUTEN_BUILD_VERSION=$(GLUTEN_BUILD_VERSION) \
		BOLT_BUILD_VERSION=$(BOLT_BUILD_VERSION) \
		BUILD_USER=$(BUILD_USER) \
		BUILD_CHANNEL=$(BUILD_CHANNEL) \
		BUILD_BENCHMARKS=True

# Debug build with BUILD_BENCHMARKS=True.
debug-with-benchmarks:
	$(MAKE) build BUILD_TYPE=Debug \
		GLUTEN_BUILD_VERSION=$(GLUTEN_BUILD_VERSION) \
		BOLT_BUILD_VERSION=$(BOLT_BUILD_VERSION) \
		BUILD_USER=$(BUILD_USER) \
		BUILD_CHANNEL=$(BUILD_CHANNEL) \
		BUILD_BENCHMARKS=True

# Release build with both BUILD_BENCHMARKS=True and BUILD_TESTS=True.
release-with-tests-and-benchmarks:
	$(MAKE) build BUILD_TYPE=Release \
		GLUTEN_BUILD_VERSION=$(GLUTEN_BUILD_VERSION) \
		BOLT_BUILD_VERSION=$(BOLT_BUILD_VERSION) \
		BUILD_USER=$(BUILD_USER) \
		BUILD_CHANNEL=$(BUILD_CHANNEL) \
		BUILD_BENCHMARKS=True \
		BUILD_TESTS=True

# Debug build with both BUILD_BENCHMARKS=True and BUILD_TESTS=True.
debug-with-tests-and-benchmarks:
	$(MAKE) build BUILD_TYPE=Debug \
		GLUTEN_BUILD_VERSION=$(GLUTEN_BUILD_VERSION) \
		BOLT_BUILD_VERSION=$(BOLT_BUILD_VERSION) \
		BUILD_USER=$(BUILD_USER) \
		BUILD_CHANNEL=$(BUILD_CHANNEL) \
		BUILD_BENCHMARKS=True \
		BUILD_TESTS=True

# Run the helper script dev/build_bolt_arrow.sh.
arrow:
	bash dev/build_bolt_arrow.sh

# Build the Gluten jar with Maven (tests skipped) and move it into output/,
# replacing any gluten-spark*.jar already there.
# NOTE(review): the Maven profile is spark-3.3 while the artifact is named
# "spark3.2" (jar-skip-check uses -Pspark-3.2) -- confirm which is intended.
jar:
	java -version
	mvn package -Pbackends-bolt -Pspark-3.3 -Pceleborn \
		-DskipTests -Denforcer.skip=true -Pjava-8 -Ppaimon
	mkdir -p output
	rm -rf output/gluten-spark*.jar
	mv package/target/gluten-package-1.6.0-SNAPSHOT.jar \
		output/gluten-spark3.2_2.12-1.0.0-SNAPSHOT-jar-with-dependencies.jar

# Package with the spark-3.2 profile, additionally skipping the checkstyle and
# spotless checks, then move the jar into output/.
jar-skip-check:
	java -version
	mvn package -Pbackends-bolt -Pspark-3.2 -Pceleborn \
		-DskipTests -Denforcer.skip=true -Pjava-8 -Ppaimon \
		-Dcheckstyle.skip=true -Dspotless.check.skip=true
	mkdir -p output
	rm -rf output/gluten-spark*.jar
	mv package/target/gluten-package-1.6.0-SNAPSHOT.jar \
		output/gluten-spark3.2_2.12-1.0.0-SNAPSHOT-jar-with-dependencies.jar

# Package with the spark-3.2-las profile (tests skipped), then move the jar
# into output/ under the spark3.2 artifact name.
spark32-las:
	java -version
	mvn package -Pbackends-bolt -Pspark-3.2-las -Pceleborn \
		-DskipTests -Denforcer.skip=true -Pjava-8 -Ppaimon
	mkdir -p output
	rm -rf output/gluten-spark*.jar
	mv package/target/gluten-package-1.6.0-SNAPSHOT.jar \
		output/gluten-spark3.2_2.12-1.0.0-SNAPSHOT-jar-with-dependencies.jar

# Incremental packaging: when the output jar already exists, only replace
# libbolt_backend.so inside it (taken from cpp/build/releases/); otherwise
# fall back to a full `jar` build.
fast-jar:
	if [ -f "output/gluten-spark3.2_2.12-1.0.0-SNAPSHOT-jar-with-dependencies.jar" ] ; then \
		jar uf output/gluten-spark3.2_2.12-1.0.0-SNAPSHOT-jar-with-dependencies.jar -C cpp/build/releases/ libbolt_backend.so; \
	else \
		$(MAKE) jar; \
	fi

# Full `jar` build, then wrap the resulting jar (paths junked, -j) in a zip
# with the same base name, replacing any gluten-spark*.zip already in output/.
zip: zip_base := output/gluten-spark3.2_2.12-1.0.0-SNAPSHOT-jar-with-dependencies
zip:
	$(MAKE) jar
	rm -rf output/gluten-spark*.zip
	zip -j $(zip_base).zip $(zip_base).jar

# Same as `zip`, but refreshes the jar through `fast-jar` instead of a full
# Maven build.
fast-zip: zip_base := output/gluten-spark3.2_2.12-1.0.0-SNAPSHOT-jar-with-dependencies
fast-zip:
	$(MAKE) fast-jar
	rm -rf output/gluten-spark*.zip
	zip -j $(zip_base).zip $(zip_base).jar

# Clean package for Spark 3.3 (32 Maven threads, iceberg profile enabled, tests
# skipped), then move the jar into output/ under the spark3.3 name.
jar_spark33:
	java -version
	mvn -T32 clean package -Pbackends-bolt -Pspark-3.3 -Pceleborn -Piceberg \
		-DskipTests -Denforcer.skip=true -Ppaimon
	mkdir -p output
	rm -rf output/gluten-spark*.jar
	mv package/target/gluten-package-1.6.0-SNAPSHOT.jar \
		output/gluten-spark3.3_2.12-1.0.0-SNAPSHOT-jar-with-dependencies.jar

# Clean package for Spark 3.4 (iceberg profile enabled, tests skipped), then
# move the jar into output/ under the spark3.4 name.
jar_spark34:
	java -version
	mvn clean package -Pbackends-bolt -Pspark-3.4 -Pceleborn -Piceberg \
		-DskipTests -Denforcer.skip=true -Ppaimon
	mkdir -p output
	rm -rf output/gluten-spark*.jar
	mv package/target/gluten-package-1.6.0-SNAPSHOT.jar \
		output/gluten-spark3.4_2.12-1.0.0-SNAPSHOT-jar-with-dependencies.jar

# Clean package for Spark 3.5 with the hadoop-3.2 and iceberg profiles
# (32 Maven threads, tests skipped), then move the jar into output/ under the
# spark3.5 name.
jar_spark35:
	java -version
	mvn -T32 clean package -Pbackends-bolt -Pspark-3.5 -Phadoop-3.2 -Pceleborn -Piceberg \
		-DskipTests -Denforcer.skip=true -Ppaimon
	mkdir -p output
	rm -rf output/gluten-spark*.jar
	mv package/target/gluten-package-1.6.0-SNAPSHOT.jar \
		output/gluten-spark3.5_2.12-1.0.0-SNAPSHOT-jar-with-dependencies.jar

# Maven `package` for the spark-3.2 profile WITHOUT -DskipTests, so the test
# phase runs as part of packaging.
test:
	mvn -Pbackends-bolt -Pspark-3.2 -Pceleborn -Ppaimon \
		package -Denforcer.skip=true

# Maven `package` for the spark-3.5 profile WITHOUT -DskipTests, so the test
# phase runs as part of packaging.
test_spark35:
	mvn -Pbackends-bolt -Pspark-3.5 -Ppaimon -Phadoop-3.2 -Pceleborn -Piceberg \
		package -Denforcer.skip=true

# Build the Release tree with tests (prerequisite), then run ctest inside it:
# NUM_THREADS parallel jobs, a 7200 s per-test timeout, verbose output.
cpp-test-release: release-with-tests
	cd ${BUILD_DIR}/Release && \
		ctest --timeout 7200 -j ${NUM_THREADS} --output-on-failure -V

# Build the Debug tree with tests (prerequisite), then run ctest inside it:
# NUM_THREADS parallel jobs, a 7200 s per-test timeout, verbose output.
cpp-test-debug: debug-with-tests
	cd ${BUILD_DIR}/Debug && \
		ctest --timeout 7200 -j ${NUM_THREADS} --output-on-failure -V

# Remove everything the build produces: the native build tree (clean_cpp), the
# Maven outputs, and the packaged gluten-*.jar files under output/.
clean:
	$(MAKE) clean_cpp
	mvn clean -Pbackends-bolt -Pspark-3.2 -Pceleborn -Ppaimon -DskipTests -Denforcer.skip=true
	rm -rf $(ROOT_DIR)/output/gluten-*.jar
54 changes: 54 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,60 @@ ClickHouse backend demonstrated an average speedup of 2.12x, with up to 3.48x sp

<sub>Test environment: a 8-nodes AWS cluster with 1TB data, using Spark 3.1.1 as the baseline and with Gluten integrated into the same Spark version.</sub>

### Bolt Backend
#### Prerequisites
* Linux operating system
* GCC 10/11/12 or Clang 16
* python 3 (virtualenv or Conda) for conan

A Linux kernel newer than 5.4 is preferred, since Bolt enables io_uring when the kernel supports it.

If the system GCC version is too old, it is recommended to build and install GCC from source:
```shell
# run with root privilege
bash ./dev/install-gcc.sh 12.5.0
```

Bolt adopts Conan as its package manager. Conan is an open-source, cross-platform package management tool.
We provide dedicated scripts to assist developers in setting up and installing Bolt's dependencies.
```shell
bash ./dev/install-conan.sh
```

We also provide a Dockerfile to build a Docker image for the **Bolt** backend. It includes all the prerequisites required to build Gluten with the Bolt backend.
```shell
docker buildx build -t bolt -f dev/docker/Dockerfile.centos8-bolt .
```

#### Build Bolt Backend
To install the Bolt recipe for Gluten:
```shell
# Install the recipes of Bolt and its third-party dependencies
make bolt-recipe

# specify a version of Bolt (release or branch)
# `main` branch is the default
make bolt-recipe BOLT_BUILD_VERSION=main
```

To build the Bolt backend:
```shell
make release

# or specify the version of Bolt and the version of Gluten
make release BOLT_BUILD_VERSION=main GLUTEN_BUILD_VERSION=main
```
Note that any missing third-party binaries will be built from source the first time.

To build gluten:

```shell
# install arrow dependency for gluten
make arrow

make jar_spark35
```

## 8. Qualification Tool

The [Qualification Tool](./tools/qualification-tool/README.md) is a utility to analyze Spark event log files and assess the compatibility and performance of SQL workloads with Gluten. This tool helps users understand how their workloads can benefit from Gluten.
Expand Down
23 changes: 23 additions & 0 deletions backends-bolt/benchmark/ColumnarTableCacheBenchmark-results.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Mac OS X 13.5
Apple M1 Pro
table cache count: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
disable columnar table cache 16773 17024 401 1.2 838.7 1.0X
enable columnar table cache 9985 10051 65 2.0 499.3 1.0X


OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Mac OS X 13.5
Apple M1 Pro
table cache column pruning: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
disable columnar table cache 16429 16873 688 1.2 821.5 1.0X
enable columnar table cache 15118 15495 456 1.3 755.9 1.0X


OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Mac OS X 13.5
Apple M1 Pro
table cache filter: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
disable columnar table cache 22895 23527 722 0.9 1144.7 1.0X
enable columnar table cache 16673 17462 765 1.2 833.7 1.0X

Loading
Loading