From a0f47991932053512ce4d78edf5da698dcd19d48 Mon Sep 17 00:00:00 2001
From: morningman
Date: Mon, 27 Mar 2023 16:23:24 +0800
Subject: [PATCH 1/8] 1. intro libhdfs

---
 be/CMakeLists.txt                             | 30 ++++++++++--
 be/src/common/config.h                        |  2 -
 be/src/io/fs/broker_file_reader.cpp           |  1 +
 be/src/io/fs/err_utils.cpp                    | 12 ++++-
 be/src/io/fs/hdfs.h                           | 24 +++++++++
 be/src/io/fs/hdfs_file_reader.cpp             |  6 ++-
 be/src/io/fs/hdfs_file_system.h               |  4 +-
 be/src/io/fs/hdfs_file_writer.cpp             |  9 ++--
 be/src/io/hdfs_builder.cpp                    |  5 +-
 be/src/io/hdfs_builder.h                      |  8 +--
 be/src/util/hdfs_util.h                       |  3 +-
 be/src/util/jni-util.cpp                      | 16 ++++--
 be/src/util/jni-util.h                        |  5 ++
 .../exec/format/parquet/parquet_thrift_util.h |  1 +
 .../exec/format/parquet/vparquet_reader.cpp   | 13 +++++
 bin/start_be.sh                               | 49 ++++++++++++++++---
 build.sh                                      |  4 ++
 .../apache/doris/clone/BeLoadRebalancer.java  |  2 +-
 18 files changed, 161 insertions(+), 33 deletions(-)
 create mode 100644 be/src/io/fs/hdfs.h

diff --git a/be/CMakeLists.txt b/be/CMakeLists.txt
index 18b058badd79bc..1037321e454244 100644
--- a/be/CMakeLists.txt
+++ b/be/CMakeLists.txt
@@ -420,9 +420,6 @@ set_target_properties(k5crypto PROPERTIES IMPORTED_LOCATION ${THIRDPARTY_DIR}/li
 add_library(gssapi_krb5 STATIC IMPORTED)
 set_target_properties(gssapi_krb5 PROPERTIES IMPORTED_LOCATION ${THIRDPARTY_DIR}/lib/libgssapi_krb5.a)
 
-add_library(hdfs3 STATIC IMPORTED)
-set_target_properties(hdfs3 PROPERTIES IMPORTED_LOCATION ${THIRDPARTY_DIR}/lib/libhdfs3.a)
-
 find_program(THRIFT_COMPILER thrift ${CMAKE_SOURCE_DIR}/bin)
 
 if (OS_MACOSX)
@@ -770,12 +767,37 @@ set(COMMON_THIRDPARTY
     # put this after lz4 to avoid using lz4 lib in librdkafka
     librdkafka_cpp
     librdkafka
-    hdfs3
     xml2
     lzma
     simdjson
 )
 
+if (ARCH_AMD64)
+    add_library(hadoop_hdfs STATIC IMPORTED)
+    set_target_properties(hadoop_hdfs PROPERTIES IMPORTED_LOCATION ${THIRDPARTY_DIR}/lib/hadoop_hdfs/native/libhdfs.a)
+
+    add_library(jvm SHARED IMPORTED)
+    FILE(GLOB_RECURSE LIB_JVM $ENV{JAVA_HOME}/jre/lib/*/libjvm.so)
+    set_target_properties(jvm PROPERTIES IMPORTED_LOCATION ${LIB_JVM})
+
+    set(COMMON_THIRDPARTY
+        ${COMMON_THIRDPARTY}
+        hadoop_hdfs
+        jvm
+    )
+    add_definitions(-DUSE_HADOOP_HDFS)
+else()
+    add_library(hdfs3 STATIC IMPORTED)
+    set_target_properties(hdfs3 PROPERTIES IMPORTED_LOCATION ${THIRDPARTY_DIR}/lib/libhdfs3.a)
+
+    # TODO: use arm hadoop hdfs to replace this
+    set(COMMON_THIRDPARTY
+        ${COMMON_THIRDPARTY}
+        hdfs3
+    )
+    add_definitions(-DUSE_LIBHDFS3)
+endif()
+
 if (absl_FOUND)
     set(COMMON_THIRDPARTY
         ${COMMON_THIRDPARTY}
diff --git a/be/src/common/config.h b/be/src/common/config.h
index 1d899eccd8259d..1a5529858298f1 100644
--- a/be/src/common/config.h
+++ b/be/src/common/config.h
@@ -870,8 +870,6 @@ CONF_Int32(segcompaction_threshold_segment_num, "10");
 // The segment whose row number above the threshold will be compacted during segcompaction
 CONF_Int32(segcompaction_small_threshold, "1048576");
 
-CONF_String(jvm_max_heap_size, "1024M");
-
 // enable java udf and jdbc scannode
 CONF_Bool(enable_java_support, "true");
 
diff --git a/be/src/io/fs/broker_file_reader.cpp b/be/src/io/fs/broker_file_reader.cpp
index 40ddfe6b4407a5..fc1334eed91564 100644
--- a/be/src/io/fs/broker_file_reader.cpp
+++ b/be/src/io/fs/broker_file_reader.cpp
@@ -35,6 +35,7 @@ BrokerFileReader::BrokerFileReader(const TNetworkAddress& broker_addr, const Pat
           _fd(fd),
           _fs(std::move(fs)) {
     _fs->get_client(&_client);
+    // LOG(INFO) << "yy debug broker reader size: " << _file_size;
     DorisMetrics::instance()->broker_file_open_reading->increment(1);
DorisMetrics::instance()->broker_file_reader_total->increment(1); } diff --git a/be/src/io/fs/err_utils.cpp b/be/src/io/fs/err_utils.cpp index 1e788165e54308..d01c7e748829f6 100644 --- a/be/src/io/fs/err_utils.cpp +++ b/be/src/io/fs/err_utils.cpp @@ -18,10 +18,11 @@ #include "io/fs/err_utils.h" #include -#include #include +#include "io/fs/hdfs.h" + namespace doris { namespace io { @@ -37,8 +38,15 @@ std::string errcode_to_str(const std::error_code& ec) { std::string hdfs_error() { std::stringstream ss; char buf[1024]; - ss << "(" << errno << "), " << strerror_r(errno, buf, 1024); + ss << "(" << errno << "), " << strerror_r(errno, buf, 1024) << ")"; +#ifdef USE_HADOOP_HDFS + char* root_cause = hdfsGetLastExceptionRootCause(); + if (root_cause != nullptr) { + ss << ", reason: " << root_cause; + } +#else ss << ", reason: " << hdfsGetLastError(); +#endif return ss.str(); } diff --git a/be/src/io/fs/hdfs.h b/be/src/io/fs/hdfs.h new file mode 100644 index 00000000000000..5e288e8dbdd2bc --- /dev/null +++ b/be/src/io/fs/hdfs.h @@ -0,0 +1,24 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#if defined(__x86_64__) +#include +#else +#include +#endif diff --git a/be/src/io/fs/hdfs_file_reader.cpp b/be/src/io/fs/hdfs_file_reader.cpp index 219410ac187fc4..77e15d7beea764 100644 --- a/be/src/io/fs/hdfs_file_reader.cpp +++ b/be/src/io/fs/hdfs_file_reader.cpp @@ -17,9 +17,11 @@ #include "io/fs/hdfs_file_reader.h" +#include "io/fs/err_utils.h" #include "io/fs/hdfs_file_system.h" #include "service/backend_options.h" #include "util/doris_metrics.h" + namespace doris { namespace io { HdfsFileReader::HdfsFileReader(Path path, size_t file_size, const std::string& name_node, @@ -66,7 +68,7 @@ Status HdfsFileReader::read_at_impl(size_t offset, Slice result, size_t* bytes_r int res = hdfsSeek(handle->hdfs_fs, _hdfs_file, offset); if (res != 0) { return Status::InternalError("Seek to offset failed. (BE: {}) offset={}, err: {}", - BackendOptions::get_localhost(), offset, hdfsGetLastError()); + BackendOptions::get_localhost(), offset, hdfs_error()); } size_t bytes_req = result.size; @@ -85,7 +87,7 @@ Status HdfsFileReader::read_at_impl(size_t offset, Slice result, size_t* bytes_r return Status::InternalError( "Read hdfs file failed. 
(BE: {}) namenode:{}, path:{}, err: {}", BackendOptions::get_localhost(), _name_node, _path.string(), - hdfsGetLastError()); + hdfs_error()); } if (loop_read == 0) { break; diff --git a/be/src/io/fs/hdfs_file_system.h b/be/src/io/fs/hdfs_file_system.h index 75663bf198ea31..9cec56b86cd9d7 100644 --- a/be/src/io/fs/hdfs_file_system.h +++ b/be/src/io/fs/hdfs_file_system.h @@ -18,13 +18,13 @@ #pragma once #include -#include #include +#include "io/fs/hdfs.h" #include "io/fs/remote_file_system.h" -namespace doris { +namespace doris { namespace io { class HdfsFileSystemHandle { diff --git a/be/src/io/fs/hdfs_file_writer.cpp b/be/src/io/fs/hdfs_file_writer.cpp index 723856fa254102..ffb56e39e0ce45 100644 --- a/be/src/io/fs/hdfs_file_writer.cpp +++ b/be/src/io/fs/hdfs_file_writer.cpp @@ -52,7 +52,8 @@ Status HdfsFileWriter::close() { std::stringstream ss; ss << "failed to flush hdfs file. " << "(BE: " << BackendOptions::get_localhost() << ")" - << "namenode:" << _hdfs_fs->_namenode << " path:" << _path << ", err: " << hdfs_error(); + << "namenode:" << _hdfs_fs->_namenode << " path:" << _path + << ", err: " << hdfs_error(); LOG(WARNING) << ss.str(); return Status::InternalError(ss.str()); } @@ -82,7 +83,8 @@ Status HdfsFileWriter::appendv(const Slice* data, size_t data_cnt) { hdfsWrite(_hdfs_fs->_fs_handle->hdfs_fs, _hdfs_file, p, left_bytes); if (written_bytes < 0) { return Status::InternalError("write hdfs failed. namenode: {}, path: {}, error: {}", - _hdfs_fs->_namenode, _path.native(), hdfs_error()); + _hdfs_fs->_namenode, _path.native(), + hdfs_error()); } left_bytes -= written_bytes; p += written_bytes; @@ -125,7 +127,8 @@ Status HdfsFileWriter::_open() { std::stringstream ss; ss << "open file failed. " << "(BE: " << BackendOptions::get_localhost() << ")" - << " namenode:" << _hdfs_fs->_namenode << " path:" << _path << ", err: " << hdfs_error(); + << " namenode:" << _hdfs_fs->_namenode << " path:" << _path + << ", err: " << hdfs_error(); LOG(WARNING) << ss.str(); return Status::InternalError(ss.str()); } diff --git a/be/src/io/hdfs_builder.cpp b/be/src/io/hdfs_builder.cpp index b08b973860b462..8f3b765dbbd5fe 100644 --- a/be/src/io/hdfs_builder.cpp +++ b/be/src/io/hdfs_builder.cpp @@ -35,6 +35,7 @@ Status HDFSCommonBuilder::init_hdfs_builder() { return Status::InternalError( "failed to init HDFSCommonBuilder, please check check be/conf/hdfs-site.xml"); } + hdfsBuilderSetForceNewInstance(hdfs_builder); return Status::OK(); } @@ -53,7 +54,10 @@ Status HDFSCommonBuilder::run_kinit() { if (!rc) { return Status::InternalError("Kinit failed, errMsg: " + msg); } +#ifdef USE_LIBHDFS3 + hdfsBuilderSetPrincipal(hdfs_builder, hdfs_kerberos_principal.c_str()); hdfsBuilderSetKerbTicketCachePath(hdfs_builder, ticket_path.c_str()); +#endif return Status::OK(); } @@ -100,7 +104,6 @@ Status createHDFSBuilder(const THdfsParams& hdfsParams, HDFSCommonBuilder* build if (hdfsParams.__isset.hdfs_kerberos_principal) { builder->need_kinit = true; builder->hdfs_kerberos_principal = hdfsParams.hdfs_kerberos_principal; - hdfsBuilderSetPrincipal(builder->get(), hdfsParams.hdfs_kerberos_principal.c_str()); } if (hdfsParams.__isset.hdfs_kerberos_keytab) { builder->need_kinit = true; diff --git a/be/src/io/hdfs_builder.h b/be/src/io/hdfs_builder.h index ecc08d5a71fe66..b04b94a7e29077 100644 --- a/be/src/io/hdfs_builder.h +++ b/be/src/io/hdfs_builder.h @@ -17,9 +17,8 @@ #pragma once -#include - #include "common/status.h" +#include "io/fs/hdfs.h" #include "gen_cpp/PlanNodes_types.h" namespace doris { @@ -38,9 +37,12 @@ class 
HDFSCommonBuilder { public: HDFSCommonBuilder() {} ~HDFSCommonBuilder() { +#ifdef USE_LIBHDFS3 + // for hadoop hdfs, the hdfs_builder will be freed in hdfsConnect if (hdfs_builder != nullptr) { hdfsFreeBuilder(hdfs_builder); } +#endif } // Must call this to init hdfs_builder first. @@ -51,7 +53,7 @@ class HDFSCommonBuilder { Status run_kinit(); private: - hdfsBuilder* hdfs_builder; + hdfsBuilder* hdfs_builder = nullptr; bool need_kinit {false}; std::string hdfs_kerberos_keytab; std::string hdfs_kerberos_principal; diff --git a/be/src/util/hdfs_util.h b/be/src/util/hdfs_util.h index f98bdd5ab3687d..2e56181df7932e 100644 --- a/be/src/util/hdfs_util.h +++ b/be/src/util/hdfs_util.h @@ -17,13 +17,12 @@ #pragma once -#include - #include #include #include #include "common/status.h" +#include "io/fs/hdfs.h" #include "io/fs/path.h" #include "io/hdfs_builder.h" diff --git a/be/src/util/jni-util.cpp b/be/src/util/jni-util.cpp index 811738144e4ae9..b274ae08e813da 100644 --- a/be/src/util/jni-util.cpp +++ b/be/src/util/jni-util.cpp @@ -37,10 +37,10 @@ namespace doris { namespace { JavaVM* g_vm; -std::once_flag g_vm_once; +[[maybe_unused]] std::once_flag g_vm_once; const std::string GetDorisJNIClasspath() { - const auto* classpath = getenv("DORIS_JNI_CLASSPATH_PARAMETER"); + const auto* classpath = getenv("DORIS_CLASSPATH"); if (classpath) { return classpath; } else { @@ -66,12 +66,13 @@ const std::string GetDorisJNIClasspath() { } } -void FindOrCreateJavaVM() { +// Only used on non-x86 platform +[[maybe_unused]] void FindOrCreateJavaVM() { int num_vms; int rv = LibJVMLoader::JNI_GetCreatedJavaVMs(&g_vm, 1, &num_vms); if (rv == 0) { auto classpath = GetDorisJNIClasspath(); - std::string heap_size = fmt::format("-Xmx{}", config::jvm_max_heap_size); + std::string heap_size = fmt::format("-Xmx{}", "1024m"); std::string log_path = fmt::format("-DlogPath={}/log/udf-jdbc.log", getenv("DORIS_HOME")); std::string jvm_name = fmt::format("-Dsun.java.command={}", "DorisBE"); @@ -152,6 +153,7 @@ Status JniLocalFrame::push(JNIEnv* env, int max_local_ref) { Status JniUtil::GetJNIEnvSlowPath(JNIEnv** env) { DCHECK(!tls_env_) << "Call GetJNIEnv() fast path"; +#ifdef USE_LIBHDFS3 std::call_once(g_vm_once, FindOrCreateJavaVM); int rc = g_vm->GetEnv(reinterpret_cast(&tls_env_), JNI_VERSION_1_8); if (rc == JNI_EDETACHED) { @@ -160,6 +162,10 @@ Status JniUtil::GetJNIEnvSlowPath(JNIEnv** env) { if (rc != 0 || tls_env_ == nullptr) { return Status::InternalError("Unable to get JVM: {}", rc); } +#else + // the hadoop libhdfs will do all the stuff + tls_env_ = getJNIEnv(); +#endif *env = tls_env_; return Status::OK(); } @@ -219,7 +225,9 @@ Status JniUtil::LocalToGlobalRef(JNIEnv* env, jobject local_ref, jobject* global } Status JniUtil::Init() { +#ifdef USE_LIBHDFS3 RETURN_IF_ERROR(LibJVMLoader::instance().load()); +#endif // Get the JNIEnv* corresponding to current thread. 
JNIEnv* env; diff --git a/be/src/util/jni-util.h b/be/src/util/jni-util.h index 5aa8be9a1fc547..ec5f6abf6e142d 100644 --- a/be/src/util/jni-util.h +++ b/be/src/util/jni-util.h @@ -23,6 +23,11 @@ #include "gutil/macros.h" #include "util/thrift_util.h" +#ifdef USE_HADOOP_HDFS +// defined in hadoop_hdfs/hdfs.h +extern "C" JNIEnv* getJNIEnv(void); +#endif + namespace doris { #define RETURN_ERROR_IF_EXC(env) \ diff --git a/be/src/vec/exec/format/parquet/parquet_thrift_util.h b/be/src/vec/exec/format/parquet/parquet_thrift_util.h index cccbe0f9c2258a..9c540d67692c0e 100644 --- a/be/src/vec/exec/format/parquet/parquet_thrift_util.h +++ b/be/src/vec/exec/format/parquet/parquet_thrift_util.h @@ -63,6 +63,7 @@ static Status parse_thrift_footer(io::FileReaderSPtr file, FileMetaData** file_m RETURN_IF_ERROR( file->read_at(file_size - PARQUET_FOOTER_SIZE - metadata_size, res, &bytes_read)); DCHECK_EQ(bytes_read, metadata_size); + LOG(INFO) << "yy debug bytes_read: " << bytes_read << ", metadata_size: " << metadata_size; RETURN_IF_ERROR(deserialize_thrift_msg(meta_buff.get(), &metadata_size, true, &t_metadata)); *file_metadata = new FileMetaData(t_metadata); RETURN_IF_ERROR((*file_metadata)->init_schema()); diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_reader.cpp index 50f01b6b86f90e..8babca9108c952 100644 --- a/be/src/vec/exec/format/parquet/vparquet_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_reader.cpp @@ -167,21 +167,30 @@ void ParquetReader::close() { } Status ParquetReader::_open_file() { + LOG(INFO) << "yy debug 1"; if (_file_reader == nullptr) { + LOG(INFO) << "yy debug 2"; SCOPED_RAW_TIMER(&_statistics.open_file_time); ++_statistics.open_file_num; RETURN_IF_ERROR(FileFactory::create_file_reader( _profile, _system_properties, _file_description, &_file_system, &_file_reader)); + LOG(INFO) << "yy debug 3"; } + LOG(INFO) << "yy debug 4"; if (_file_metadata == nullptr) { + // LOG(INFO) << "yy debug 5: file size: " << _file_reader->size(); SCOPED_RAW_TIMER(&_statistics.parse_footer_time); if (_file_reader->size() == 0) { + LOG(INFO) << "yy debug 6"; return Status::EndOfFile("open file failed, empty parquet file: " + _scan_range.path); } + LOG(INFO) << "yy debug 7"; if (_kv_cache == nullptr) { + LOG(INFO) << "yy debug 8"; _is_file_metadata_owned = true; RETURN_IF_ERROR(parse_thrift_footer(_file_reader, &_file_metadata)); } else { + LOG(INFO) << "yy debug 9"; _is_file_metadata_owned = false; _file_metadata = _kv_cache->get( _meta_cache_key(_file_reader->path()), [&]() -> FileMetaData* { @@ -194,9 +203,12 @@ Status ParquetReader::_open_file() { } return meta; }); + LOG(INFO) << "yy debug 10"; } + LOG(INFO) << "yy debug 11"; if (_file_metadata == nullptr) { + LOG(INFO) << "yy debug 12"; return Status::InternalError("failed to get file meta data: {}", _file_description.path); } @@ -230,6 +242,7 @@ void ParquetReader::_init_file_description() { _file_description.path = _scan_range.path; _file_description.start_offset = _scan_range.start_offset; _file_description.file_size = _scan_range.__isset.file_size ? 
_scan_range.file_size : 0; + LOG(INFO) << "yy debug _file_description.file_size: " << _scan_range.file_size; } Status ParquetReader::init_reader( diff --git a/bin/start_be.sh b/bin/start_be.sh index 7204d6511441e4..2aa665288a49d4 100755 --- a/bin/start_be.sh +++ b/bin/start_be.sh @@ -70,16 +70,34 @@ if [[ "$(uname -s)" != 'Darwin' ]]; then fi fi -# add libs to CLASSPATH +# add java libs for f in "${DORIS_HOME}/lib"/*.jar; do - if [[ -z "${DORIS_JNI_CLASSPATH_PARAMETER}" ]]; then - export DORIS_JNI_CLASSPATH_PARAMETER="${f}" + if [[ -z "${DORIS_CLASSPATH}" ]]; then + export DORIS_CLASSPATH="${f}" else - export DORIS_JNI_CLASSPATH_PARAMETER="${f}:${DORIS_JNI_CLASSPATH_PARAMETER}" + export DORIS_CLASSPATH="${f}:${DORIS_CLASSPATH}" fi done -# DORIS_JNI_CLASSPATH_PARAMETER is used to configure additional jar path to jvm. e.g. -Djava.class.path=$DORIS_HOME/lib/java-udf.jar -export DORIS_JNI_CLASSPATH_PARAMETER="-Djava.class.path=${DORIS_JNI_CLASSPATH_PARAMETER}" + +if [[ -d "${DORIS_HOME}/lib/hadoop_hdfs/" ]]; then + # add hadoop libs + for f in "${DORIS_HOME}/lib/hadoop_hdfs/common"/*.jar; do + DORIS_CLASSPATH="${f}:${DORIS_CLASSPATH}" + done + for f in "${DORIS_HOME}/lib/hadoop_hdfs/common/lib"/*.jar; do + DORIS_CLASSPATH="${f}:${DORIS_CLASSPATH}" + done + for f in "${DORIS_HOME}/lib/hadoop_hdfs/hdfs"/*.jar; do + DORIS_CLASSPATH="${f}:${DORIS_CLASSPATH}" + done + for f in "${DORIS_HOME}/lib/hadoop_hdfs/hdfs/lib"/*.jar; do + DORIS_CLASSPATH="${f}:${DORIS_CLASSPATH}" + done +fi + +# the CLASSPATH and LIBHDFS_OPTS is used for hadoop libhdfs +# and conf/ dir so that hadoop libhdfs can read .xml config file in conf/ +export CLASSPATH="${DORIS_HOME}/conf/:$DORIS_CLASSPATH" jdk_version() { local java_cmd="${1}" @@ -230,11 +248,28 @@ set_tcmalloc_heap_limit() { # set_tcmalloc_heap_limit || exit 1 -## set hdfs conf +## set hdfs3 conf if [[ -f "${DORIS_HOME}/conf/hdfs-site.xml" ]]; then export LIBHDFS3_CONF="${DORIS_HOME}/conf/hdfs-site.xml" fi +# set jvm library for hadoop libhdfs +if [[ -d "${DORIS_HOME}/lib/hadoop_hdfs/" ]]; then + MACHINE_ARCH=$(uname -m) + if [[ "${MACHINE_ARCH}" == "x86_64" ]]; then + # TODO: for now, only support hadoop libs on x86_64 + jvm_arch=amd64 + export LD_LIBRARY_PATH=$JAVA_HOME/jre/lib/$jvm_arch/server:$JAVA_HOME/jre/lib/$jvm_arch:$LD_LIBRARY_PATH + export LD_LIBRARY_PATH=$DORIS_HOME/lib/hadoop_hdfs/native:$LD_LIBRARY_PATH + export LIBHDFS_OPTS="${JAVA_OPTS}" + fi +fi + +# FIXME: for debug +echo "CLASSPATH: ${CLASSPATH}\n" +echo "LD_LIBRARY_PATH: ${LD_LIBRARY_PATH}\n" +echo "LIBHDFS_OPTS: ${LIBHDFS_OPTS}\n" + # see https://github.com/apache/doris/blob/master/docs/zh-CN/community/developer-guide/debug-tool.md#jemalloc-heap-profile export JEMALLOC_CONF="percpu_arena:percpu,background_thread:true,metadata_thp:auto,muzzy_decay_ms:30000,dirty_decay_ms:30000,oversize_threshold:0,lg_tcache_max:16,prof_prefix:jeprof.out" diff --git a/build.sh b/build.sh index fc1f84749f6514..9bc71ead4c8723 100755 --- a/build.sh +++ b/build.sh @@ -552,6 +552,10 @@ if [[ "${OUTPUT_BE_BINARY}" -eq 1 ]]; then cp -r -p "${DORIS_HOME}/be/output/bin"/* "${DORIS_OUTPUT}/be/bin"/ cp -r -p "${DORIS_HOME}/be/output/conf"/* "${DORIS_OUTPUT}/be/conf"/ + if [[ -d "${DORIS_THIRDPARTY}/installed/lib/hadoop_hdfs/" ]]; then + cp -r -p "${DORIS_THIRDPARTY}/installed/lib/hadoop_hdfs/" "${DORIS_OUTPUT}/be/lib/" + fi + if [[ "${DISABLE_JAVA_UDF_IN_CONF}" -eq 1 ]]; then echo -e "\033[33;1mWARNNING: \033[37;1mDisable Java UDF support in be.conf due to the BE was built without Java UDF.\033[0m" cat 
>>"${DORIS_OUTPUT}/be/conf/be.conf" < Date: Tue, 28 Mar 2023 22:50:35 +0800 Subject: [PATCH 2/8] fix meta tool --- be/src/tools/meta_tool.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/be/src/tools/meta_tool.cpp b/be/src/tools/meta_tool.cpp index 02dffcd9c83054..377f2e85876b59 100644 --- a/be/src/tools/meta_tool.cpp +++ b/be/src/tools/meta_tool.cpp @@ -142,7 +142,7 @@ void delete_meta(DataDir* data_dir) { Status init_data_dir(const std::string& dir, std::unique_ptr* ret) { std::string root_path; - RETURN_IF_ERROR(io::global_local_filesystem()->canonicalize(dir, &root_path)); + RETURN_IF_ERROR(doris::io::global_local_filesystem()->canonicalize(dir, &root_path)); doris::StorePath path; auto res = parse_root_path(root_path, &path); if (!res.ok()) { @@ -156,8 +156,8 @@ Status init_data_dir(const std::string& dir, std::unique_ptr* ret) { std::cout << "new data dir failed" << std::endl; return Status::InternalError("new data dir failed"); } - st = p->init(); - if (!st.ok()) { + res = p->init(); + if (!res.ok()) { std::cout << "data_dir load failed" << std::endl; return Status::InternalError("data_dir load failed"); } @@ -188,7 +188,7 @@ void batch_delete_meta(const std::string& tablet_file) { } // 1. get dir std::string dir; - Status st = io::global_local_filesystem()->canonicalize(v[0], &dir); + Status st = doris::io::global_local_filesystem()->canonicalize(v[0], &dir); if (!st.ok()) { std::cout << "invalid root dir in tablet_file: " << line << std::endl; err_num++; @@ -295,7 +295,7 @@ Status get_segment_footer(doris::io::FileReader* file_reader, SegmentFooterPB* f void show_segment_footer(const std::string& file_name) { doris::io::FileReaderSPtr file_reader; - Status st = doris::io::global_local_filesystem()->open_file(file_name, &file_reader); + Status status = doris::io::global_local_filesystem()->open_file(file_name, &file_reader); if (!status.ok()) { std::cout << "open file failed: " << status << std::endl; return; @@ -327,7 +327,7 @@ int main(int argc, char** argv) { show_meta(); } else if (FLAGS_operation == "batch_delete_meta") { std::string tablet_file; - Status st = io::global_local_filesystem()->canonicalize(FLAGS_tablet_file, &tablet_file); + Status st = doris::io::global_local_filesystem()->canonicalize(FLAGS_tablet_file, &tablet_file); if (!st.ok()) { std::cout << "invalid tablet file: " << FLAGS_tablet_file << ", error: " << st.to_string() << std::endl; From baac14838afa19fa764f0d6b943a19a48effc85f Mon Sep 17 00:00:00 2001 From: morningman Date: Wed, 29 Mar 2023 22:50:55 +0800 Subject: [PATCH 3/8] fix jvm --- be/CMakeLists.txt | 5 -- be/src/io/fs/broker_file_reader.cpp | 1 - be/src/io/fs/hdfs_file_reader.cpp | 3 +- be/src/io/fs/hdfs_file_system.cpp | 7 ++ be/src/io/fs/hdfs_file_writer.cpp | 9 +-- be/src/io/hdfs_builder.h | 2 +- be/src/tools/meta_tool.cpp | 3 +- be/src/util/jni-util.cpp | 81 ++++++++++++++----- be/src/util/libjvm_loader.cpp | 9 +++ .../exec/format/parquet/bool_rle_decoder.cpp | 3 + .../exec/format/parquet/parquet_thrift_util.h | 1 - .../parquet/vparquet_column_chunk_reader.cpp | 3 +- .../exec/format/parquet/vparquet_reader.cpp | 13 --- bin/start_be.sh | 26 +++--- conf/be.conf | 3 + 15 files changed, 107 insertions(+), 62 deletions(-) diff --git a/be/CMakeLists.txt b/be/CMakeLists.txt index 1037321e454244..e394e0b72afc97 100644 --- a/be/CMakeLists.txt +++ b/be/CMakeLists.txt @@ -776,14 +776,9 @@ if (ARCH_AMD64) add_library(hadoop_hdfs STATIC IMPORTED) set_target_properties(hadoop_hdfs PROPERTIES IMPORTED_LOCATION 
${THIRDPARTY_DIR}/lib/hadoop_hdfs/native/libhdfs.a) - add_library(jvm SHARED IMPORTED) - FILE(GLOB_RECURSE LIB_JVM $ENV{JAVA_HOME}/jre/lib/*/libjvm.so) - set_target_properties(jvm PROPERTIES IMPORTED_LOCATION ${LIB_JVM}) - set(COMMON_THIRDPARTY ${COMMON_THIRDPARTY} hadoop_hdfs - jvm ) add_definitions(-DUSE_HADOOP_HDFS) else() diff --git a/be/src/io/fs/broker_file_reader.cpp b/be/src/io/fs/broker_file_reader.cpp index fc1334eed91564..40ddfe6b4407a5 100644 --- a/be/src/io/fs/broker_file_reader.cpp +++ b/be/src/io/fs/broker_file_reader.cpp @@ -35,7 +35,6 @@ BrokerFileReader::BrokerFileReader(const TNetworkAddress& broker_addr, const Pat _fd(fd), _fs(std::move(fs)) { _fs->get_client(&_client); - // LOG(INFO) << "yy debug broker reader size: " << _file_size; DorisMetrics::instance()->broker_file_open_reading->increment(1); DorisMetrics::instance()->broker_file_reader_total->increment(1); } diff --git a/be/src/io/fs/hdfs_file_reader.cpp b/be/src/io/fs/hdfs_file_reader.cpp index 77e15d7beea764..ddd035213befe3 100644 --- a/be/src/io/fs/hdfs_file_reader.cpp +++ b/be/src/io/fs/hdfs_file_reader.cpp @@ -86,8 +86,7 @@ Status HdfsFileReader::read_at_impl(size_t offset, Slice result, size_t* bytes_r if (loop_read < 0) { return Status::InternalError( "Read hdfs file failed. (BE: {}) namenode:{}, path:{}, err: {}", - BackendOptions::get_localhost(), _name_node, _path.string(), - hdfs_error()); + BackendOptions::get_localhost(), _name_node, _path.string(), hdfs_error()); } if (loop_read == 0) { break; diff --git a/be/src/io/fs/hdfs_file_system.cpp b/be/src/io/fs/hdfs_file_system.cpp index 3eb5bea4c45a21..0d33bd30c9f654 100644 --- a/be/src/io/fs/hdfs_file_system.cpp +++ b/be/src/io/fs/hdfs_file_system.cpp @@ -68,6 +68,13 @@ class HdfsFileSystemCache { Status HdfsFileSystem::create(const THdfsParams& hdfs_params, const std::string& path, std::shared_ptr* fs) { +#ifdef USE_HADOOP_HDFS + if (!config::enable_java_support) { + return Status::InternalError( + "hdfs file system is not enabled, you can change be config enable_java_support to " + "true."); + } +#endif (*fs).reset(new HdfsFileSystem(hdfs_params, path)); return (*fs)->connect(); } diff --git a/be/src/io/fs/hdfs_file_writer.cpp b/be/src/io/fs/hdfs_file_writer.cpp index ffb56e39e0ce45..723856fa254102 100644 --- a/be/src/io/fs/hdfs_file_writer.cpp +++ b/be/src/io/fs/hdfs_file_writer.cpp @@ -52,8 +52,7 @@ Status HdfsFileWriter::close() { std::stringstream ss; ss << "failed to flush hdfs file. " << "(BE: " << BackendOptions::get_localhost() << ")" - << "namenode:" << _hdfs_fs->_namenode << " path:" << _path - << ", err: " << hdfs_error(); + << "namenode:" << _hdfs_fs->_namenode << " path:" << _path << ", err: " << hdfs_error(); LOG(WARNING) << ss.str(); return Status::InternalError(ss.str()); } @@ -83,8 +82,7 @@ Status HdfsFileWriter::appendv(const Slice* data, size_t data_cnt) { hdfsWrite(_hdfs_fs->_fs_handle->hdfs_fs, _hdfs_file, p, left_bytes); if (written_bytes < 0) { return Status::InternalError("write hdfs failed. namenode: {}, path: {}, error: {}", - _hdfs_fs->_namenode, _path.native(), - hdfs_error()); + _hdfs_fs->_namenode, _path.native(), hdfs_error()); } left_bytes -= written_bytes; p += written_bytes; @@ -127,8 +125,7 @@ Status HdfsFileWriter::_open() { std::stringstream ss; ss << "open file failed. 
" << "(BE: " << BackendOptions::get_localhost() << ")" - << " namenode:" << _hdfs_fs->_namenode << " path:" << _path - << ", err: " << hdfs_error(); + << " namenode:" << _hdfs_fs->_namenode << " path:" << _path << ", err: " << hdfs_error(); LOG(WARNING) << ss.str(); return Status::InternalError(ss.str()); } diff --git a/be/src/io/hdfs_builder.h b/be/src/io/hdfs_builder.h index b04b94a7e29077..7d448cb1cb53ca 100644 --- a/be/src/io/hdfs_builder.h +++ b/be/src/io/hdfs_builder.h @@ -18,8 +18,8 @@ #pragma once #include "common/status.h" -#include "io/fs/hdfs.h" #include "gen_cpp/PlanNodes_types.h" +#include "io/fs/hdfs.h" namespace doris { diff --git a/be/src/tools/meta_tool.cpp b/be/src/tools/meta_tool.cpp index 377f2e85876b59..452402852f77ba 100644 --- a/be/src/tools/meta_tool.cpp +++ b/be/src/tools/meta_tool.cpp @@ -327,7 +327,8 @@ int main(int argc, char** argv) { show_meta(); } else if (FLAGS_operation == "batch_delete_meta") { std::string tablet_file; - Status st = doris::io::global_local_filesystem()->canonicalize(FLAGS_tablet_file, &tablet_file); + Status st = + doris::io::global_local_filesystem()->canonicalize(FLAGS_tablet_file, &tablet_file); if (!st.ok()) { std::cout << "invalid tablet file: " << FLAGS_tablet_file << ", error: " << st.to_string() << std::endl; diff --git a/be/src/util/jni-util.cpp b/be/src/util/jni-util.cpp index b274ae08e813da..01afce76dd10cc 100644 --- a/be/src/util/jni-util.cpp +++ b/be/src/util/jni-util.cpp @@ -69,38 +69,85 @@ const std::string GetDorisJNIClasspath() { // Only used on non-x86 platform [[maybe_unused]] void FindOrCreateJavaVM() { int num_vms; - int rv = LibJVMLoader::JNI_GetCreatedJavaVMs(&g_vm, 1, &num_vms); + int rv = JNI_GetCreatedJavaVMs(&g_vm, 1, &num_vms); if (rv == 0) { + JavaVMOption* options; auto classpath = GetDorisJNIClasspath(); - std::string heap_size = fmt::format("-Xmx{}", "1024m"); - std::string log_path = fmt::format("-DlogPath={}/log/udf-jdbc.log", getenv("DORIS_HOME")); + // The following 5 opts are default opts, + // they can be override by JAVA_OPTS env var. + std::string heap_size = fmt::format("-Xmx{}", "1g"); + std::string log_path = fmt::format("-DlogPath={}/log/jni.log", getenv("DORIS_HOME")); std::string jvm_name = fmt::format("-Dsun.java.command={}", "DorisBE"); + std::string critical_jni = "-XX:-CriticalJNINatives"; + std::string max_fd_limit = "-XX:-MaxFDLimit"; - JavaVMOption options[] = { - {const_cast(classpath.c_str()), nullptr}, - {const_cast(heap_size.c_str()), nullptr}, - {const_cast(log_path.c_str()), nullptr}, - {const_cast(jvm_name.c_str()), nullptr}, + char* java_opts = getenv("JAVA_OPTS"); + + int no_args; + if (java_opts == nullptr) { + no_args = 5; // classpath, heapsize, log path, jvm_name, critical +#ifdef __APPLE__ + no_args++; // -XX:-MaxFDLimit +#endif + options = (JavaVMOption*)calloc(no_args, sizeof(JavaVMOption)); + options[0].optionString = const_cast(classpath.c_str()); + options[1].optionString = const_cast(heap_size.c_str()); + options[2].optionString = const_cast(log_path.c_str()); + options[3].optionString = const_cast(jvm_name.c_str()); + options[4].optionString = const_cast(critical_jni.c_str()); #ifdef __APPLE__ - // On macOS, we should disable MaxFDLimit, otherwise the RLIMIT_NOFILE - // will be assigned the minimum of OPEN_MAX (10240) and rlim_cur (See src/hotspot/os/bsd/os_bsd.cpp) - // and it can not pass the check performed by storage engine. - // The newer JDK has fixed this issue. 
- {const_cast("-XX:-MaxFDLimit"), nullptr}, + // On macOS, we should disable MaxFDLimit, otherwise the RLIMIT_NOFILE + // will be assigned the minimum of OPEN_MAX (10240) and rlim_cur (See src/hotspot/os/bsd/os_bsd.cpp) + // and it can not pass the check performed by storage engine. + // The newer JDK has fixed this issue. + options[5].optionString = const_cast(max_fd_limit.c_str()); #endif - }; + } else { + // user specified opts + // 1. find the number of args + java_opts = strdup(java_opts); + char *str, *token, *save_ptr; + char jvm_arg_delims[] = " "; + for (no_args = 1, str = java_opts;; no_args++, str = nullptr) { + token = strtok_r(str, jvm_arg_delims, &save_ptr); + if (token == nullptr) { + break; + } + } + free(java_opts); + // 2. set args + options = (JavaVMOption*)calloc(no_args, sizeof(JavaVMOption)); + options[0].optionString = const_cast(classpath.c_str()); + java_opts = getenv("JAVA_OPTS"); + if (java_opts != NULL) { + java_opts = strdup(java_opts); + for (no_args = 1, str = java_opts;; no_args++, str = nullptr) { + token = strtok_r(str, jvm_arg_delims, &save_ptr); + if (token == nullptr) { + break; + } + options[no_args].optionString = token; + } + } + } + JNIEnv* env; JavaVMInitArgs vm_args; vm_args.version = JNI_VERSION_1_8; vm_args.options = options; - vm_args.nOptions = sizeof(options) / sizeof(JavaVMOption); + vm_args.nOptions = no_args; // Set it to JNI_FALSE because JNI_TRUE will let JVM ignore the max size config. vm_args.ignoreUnrecognized = JNI_FALSE; - jint res = LibJVMLoader::JNI_CreateJavaVM(&g_vm, (void**)&env, &vm_args); + jint res = JNI_CreateJavaVM(&g_vm, (void**)&env, &vm_args); if (JNI_OK != res) { DCHECK(false) << "Failed to create JVM, code= " << res; } + + if (java_opts != nullptr) { + free(java_opts); + } + free(options); } else { CHECK_EQ(rv, 0) << "Could not find any created Java VM"; CHECK_EQ(num_vms, 1) << "No VMs returned"; @@ -225,9 +272,7 @@ Status JniUtil::LocalToGlobalRef(JNIEnv* env, jobject local_ref, jobject* global } Status JniUtil::Init() { -#ifdef USE_LIBHDFS3 RETURN_IF_ERROR(LibJVMLoader::instance().load()); -#endif // Get the JNIEnv* corresponding to current thread. JNIEnv* env; diff --git a/be/src/util/libjvm_loader.cpp b/be/src/util/libjvm_loader.cpp index 127d28c2de6efb..6175da6081f264 100644 --- a/be/src/util/libjvm_loader.cpp +++ b/be/src/util/libjvm_loader.cpp @@ -25,6 +25,15 @@ #include "common/status.h" +_JNI_IMPORT_OR_EXPORT_ jint JNICALL JNI_GetCreatedJavaVMs(JavaVM** vm_buf, jsize bufLen, + jsize* numVMs) { + return doris::LibJVMLoader::JNI_GetCreatedJavaVMs(vm_buf, bufLen, numVMs); +} + +_JNI_IMPORT_OR_EXPORT_ jint JNICALL JNI_CreateJavaVM(JavaVM** pvm, void** penv, void* args) { + return doris::LibJVMLoader::JNI_CreateJavaVM(pvm, penv, args); +} + namespace { #ifndef __APPLE__ diff --git a/be/src/vec/exec/format/parquet/bool_rle_decoder.cpp b/be/src/vec/exec/format/parquet/bool_rle_decoder.cpp index 0856687bbf92c3..c954f98b2570b1 100644 --- a/be/src/vec/exec/format/parquet/bool_rle_decoder.cpp +++ b/be/src/vec/exec/format/parquet/bool_rle_decoder.cpp @@ -24,6 +24,7 @@ void BoolRLEDecoder::set_data(Slice* slice) { _data = slice; _num_bytes = slice->size; _offset = 0; + if (_num_bytes < 4) { LOG(FATAL) << "Received invalid length : " + std::to_string(_num_bytes) + " (corrupt data page?)"; @@ -62,6 +63,8 @@ Status BoolRLEDecoder::decode_values(MutableColumnPtr& doris_column, DataTypePtr case ColumnSelectVector::CONTENT: { bool value; // Can't use uint8_t directly, we should correct it. 
for (size_t i = 0; i < run_length; ++i) { + DCHECK(_current_value_idx < max_values) + << _current_value_idx << " vs. " << max_values; value = _values[_current_value_idx++]; column_data[data_index++] = (UInt8)value; } diff --git a/be/src/vec/exec/format/parquet/parquet_thrift_util.h b/be/src/vec/exec/format/parquet/parquet_thrift_util.h index 9c540d67692c0e..cccbe0f9c2258a 100644 --- a/be/src/vec/exec/format/parquet/parquet_thrift_util.h +++ b/be/src/vec/exec/format/parquet/parquet_thrift_util.h @@ -63,7 +63,6 @@ static Status parse_thrift_footer(io::FileReaderSPtr file, FileMetaData** file_m RETURN_IF_ERROR( file->read_at(file_size - PARQUET_FOOTER_SIZE - metadata_size, res, &bytes_read)); DCHECK_EQ(bytes_read, metadata_size); - LOG(INFO) << "yy debug bytes_read: " << bytes_read << ", metadata_size: " << metadata_size; RETURN_IF_ERROR(deserialize_thrift_msg(meta_buff.get(), &metadata_size, true, &t_metadata)); *file_metadata = new FileMetaData(t_metadata); RETURN_IF_ERROR((*file_metadata)->init_schema()); diff --git a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp index b74d9c3db0dbf4..b08e316c22a1e2 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp @@ -233,7 +233,8 @@ void ColumnChunkReader::_reserve_decompress_buf(size_t size) { Status ColumnChunkReader::skip_values(size_t num_values, bool skip_data) { if (UNLIKELY(_remaining_num_values < num_values)) { - return Status::IOError("Skip too many values in current page"); + return Status::IOError("Skip too many values in current page. {} vs. {}", + _remaining_num_values, num_values); } _remaining_num_values -= num_values; if (skip_data) { diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_reader.cpp index 8babca9108c952..50f01b6b86f90e 100644 --- a/be/src/vec/exec/format/parquet/vparquet_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_reader.cpp @@ -167,30 +167,21 @@ void ParquetReader::close() { } Status ParquetReader::_open_file() { - LOG(INFO) << "yy debug 1"; if (_file_reader == nullptr) { - LOG(INFO) << "yy debug 2"; SCOPED_RAW_TIMER(&_statistics.open_file_time); ++_statistics.open_file_num; RETURN_IF_ERROR(FileFactory::create_file_reader( _profile, _system_properties, _file_description, &_file_system, &_file_reader)); - LOG(INFO) << "yy debug 3"; } - LOG(INFO) << "yy debug 4"; if (_file_metadata == nullptr) { - // LOG(INFO) << "yy debug 5: file size: " << _file_reader->size(); SCOPED_RAW_TIMER(&_statistics.parse_footer_time); if (_file_reader->size() == 0) { - LOG(INFO) << "yy debug 6"; return Status::EndOfFile("open file failed, empty parquet file: " + _scan_range.path); } - LOG(INFO) << "yy debug 7"; if (_kv_cache == nullptr) { - LOG(INFO) << "yy debug 8"; _is_file_metadata_owned = true; RETURN_IF_ERROR(parse_thrift_footer(_file_reader, &_file_metadata)); } else { - LOG(INFO) << "yy debug 9"; _is_file_metadata_owned = false; _file_metadata = _kv_cache->get( _meta_cache_key(_file_reader->path()), [&]() -> FileMetaData* { @@ -203,12 +194,9 @@ Status ParquetReader::_open_file() { } return meta; }); - LOG(INFO) << "yy debug 10"; } - LOG(INFO) << "yy debug 11"; if (_file_metadata == nullptr) { - LOG(INFO) << "yy debug 12"; return Status::InternalError("failed to get file meta data: {}", _file_description.path); } @@ -242,7 +230,6 @@ void ParquetReader::_init_file_description() { 
_file_description.path = _scan_range.path; _file_description.start_offset = _scan_range.start_offset; _file_description.file_size = _scan_range.__isset.file_size ? _scan_range.file_size : 0; - LOG(INFO) << "yy debug _file_description.file_size: " << _scan_range.file_size; } Status ParquetReader::init_reader( diff --git a/bin/start_be.sh b/bin/start_be.sh index 2aa665288a49d4..9affdb9296e6d3 100755 --- a/bin/start_be.sh +++ b/bin/start_be.sh @@ -20,6 +20,7 @@ set -eo pipefail curdir="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" +MACHINE_OS=$(uname -s) if [[ "$(uname -s)" == 'Darwin' ]] && command -v brew &>/dev/null; then PATH="$(brew --prefix)/opt/gnu-getopt/bin:${PATH}" export PATH @@ -253,22 +254,21 @@ if [[ -f "${DORIS_HOME}/conf/hdfs-site.xml" ]]; then export LIBHDFS3_CONF="${DORIS_HOME}/conf/hdfs-site.xml" fi -# set jvm library for hadoop libhdfs -if [[ -d "${DORIS_HOME}/lib/hadoop_hdfs/" ]]; then - MACHINE_ARCH=$(uname -m) - if [[ "${MACHINE_ARCH}" == "x86_64" ]]; then - # TODO: for now, only support hadoop libs on x86_64 - jvm_arch=amd64 - export LD_LIBRARY_PATH=$JAVA_HOME/jre/lib/$jvm_arch/server:$JAVA_HOME/jre/lib/$jvm_arch:$LD_LIBRARY_PATH - export LD_LIBRARY_PATH=$DORIS_HOME/lib/hadoop_hdfs/native:$LD_LIBRARY_PATH - export LIBHDFS_OPTS="${JAVA_OPTS}" +if [[ -z $JAVA_OPTS ]]; then + # set default JAVA_OPTS + CUR_DATE=$(date +%Y%m%d-%H%M%S) + JAVA_OPTS="-Xmx1024m -DlogPath=$DORIS_HOME/log/jni.log -Xloggc:$DORIS_HOME/log/be.gc.log.$CUR_DATE -Dsun.java.command=DorisBE -XX:-CriticalJNINatives" + if [[ "${MACHINE_OS}" == "Darwin" ]]; then + JAVA_OPTS="${JAVA_OPTS} -XX:-MaxFDLimit" fi fi -# FIXME: for debug -echo "CLASSPATH: ${CLASSPATH}\n" -echo "LD_LIBRARY_PATH: ${LD_LIBRARY_PATH}\n" -echo "LIBHDFS_OPTS: ${LIBHDFS_OPTS}\n" +# set LIBHDFS_OPTS for hadoop libhdfs +export LIBHDFS_OPTS="${JAVA_OPTS}" + +#echo "CLASSPATH: ${CLASSPATH}" +#echo "LD_LIBRARY_PATH: ${LD_LIBRARY_PATH}" +#echo "LIBHDFS_OPTS: ${LIBHDFS_OPTS}" # see https://github.com/apache/doris/blob/master/docs/zh-CN/community/developer-guide/debug-tool.md#jemalloc-heap-profile export JEMALLOC_CONF="percpu_arena:percpu,background_thread:true,metadata_thp:auto,muzzy_decay_ms:30000,dirty_decay_ms:30000,oversize_threshold:0,lg_tcache_max:16,prof_prefix:jeprof.out" diff --git a/conf/be.conf b/conf/be.conf index 30eee9e0885191..cc1b8f6c593e4b 100644 --- a/conf/be.conf +++ b/conf/be.conf @@ -17,6 +17,9 @@ PPROF_TMPDIR="$DORIS_HOME/log/" +DATE = `date +%Y%m%d-%H%M%S` +JAVA_OPTS="-Xmx1024m -DlogPath=$DORIS_HOME/log/jni.log -Xloggc:$DORIS_HOME/log/be.gc.log.$CUR_DATE -Dsun.java.command=DorisBE -XX:-CriticalJNINatives" + # since 1.2, the JAVA_HOME need to be set to run BE process. 
# JAVA_HOME=/path/to/jdk/ From 3c75e4b213625bdb65cffe2e4fef7eac1dad51a3 Mon Sep 17 00:00:00 2001 From: morningman Date: Thu, 30 Mar 2023 00:09:17 +0800 Subject: [PATCH 4/8] fix libhdfs3 --- be/src/io/fs/hdfs.h | 2 +- be/src/io/hdfs_builder.cpp | 1 + bin/start_be.sh | 2 ++ 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/be/src/io/fs/hdfs.h b/be/src/io/fs/hdfs.h index 5e288e8dbdd2bc..eb9e1b2c079573 100644 --- a/be/src/io/fs/hdfs.h +++ b/be/src/io/fs/hdfs.h @@ -17,7 +17,7 @@ #pragma once -#if defined(__x86_64__) +#ifdef USE_HADOOP_HDFS #include #else #include diff --git a/be/src/io/hdfs_builder.cpp b/be/src/io/hdfs_builder.cpp index 8f3b765dbbd5fe..8647a7450a4752 100644 --- a/be/src/io/hdfs_builder.cpp +++ b/be/src/io/hdfs_builder.cpp @@ -26,6 +26,7 @@ #include "util/string_util.h" #include "util/uid_util.h" #include "util/url_coding.h" + namespace doris { Status HDFSCommonBuilder::init_hdfs_builder() { diff --git a/bin/start_be.sh b/bin/start_be.sh index 9affdb9296e6d3..83be4b30ee6a5f 100755 --- a/bin/start_be.sh +++ b/bin/start_be.sh @@ -99,6 +99,8 @@ fi # the CLASSPATH and LIBHDFS_OPTS is used for hadoop libhdfs # and conf/ dir so that hadoop libhdfs can read .xml config file in conf/ export CLASSPATH="${DORIS_HOME}/conf/:$DORIS_CLASSPATH" +# DORIS_CLASSPATH is for self-managed jni +export DORIS_CLASSPATH="-Djava.class.path=${DORIS_CLASSPATH}" jdk_version() { local java_cmd="${1}" From 1c5107215f46d466a7f74e34080b2bb5ce0ce861 Mon Sep 17 00:00:00 2001 From: morningman Date: Thu, 30 Mar 2023 13:42:40 +0800 Subject: [PATCH 5/8] fix review --- be/src/util/jni-util.cpp | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/be/src/util/jni-util.cpp b/be/src/util/jni-util.cpp index 01afce76dd10cc..61020dcfc48b97 100644 --- a/be/src/util/jni-util.cpp +++ b/be/src/util/jni-util.cpp @@ -28,8 +28,9 @@ #include "common/config.h" #include "gutil/strings/substitute.h" -#include "jni_native_method.h" -#include "libjvm_loader.h" +#include "util/defer_op.h" +#include "util/jni_native_method.h" +#include "util/libjvm_loader.h" using std::string; @@ -71,7 +72,7 @@ const std::string GetDorisJNIClasspath() { int num_vms; int rv = JNI_GetCreatedJavaVMs(&g_vm, 1, &num_vms); if (rv == 0) { - JavaVMOption* options; + JavaVMOption* options = nullptr; auto classpath = GetDorisJNIClasspath(); // The following 5 opts are default opts, // they can be override by JAVA_OPTS env var. 
@@ -82,6 +83,14 @@ const std::string GetDorisJNIClasspath() { std::string max_fd_limit = "-XX:-MaxFDLimit"; char* java_opts = getenv("JAVA_OPTS"); + Defer defer {[&]() { + if (java_opts != nullptr) { + free(java_opts); + } + if (options != nullptr) { + free(options); + } + }}; int no_args; if (java_opts == nullptr) { @@ -143,11 +152,6 @@ const std::string GetDorisJNIClasspath() { if (JNI_OK != res) { DCHECK(false) << "Failed to create JVM, code= " << res; } - - if (java_opts != nullptr) { - free(java_opts); - } - free(options); } else { CHECK_EQ(rv, 0) << "Could not find any created Java VM"; CHECK_EQ(num_vms, 1) << "No VMs returned"; From 8adec90fbcf28d4f435da64f7b958e279b28892a Mon Sep 17 00:00:00 2001 From: morningman Date: Thu, 30 Mar 2023 13:50:34 +0800 Subject: [PATCH 6/8] fix shell --- bin/start_be.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/start_be.sh b/bin/start_be.sh index 83be4b30ee6a5f..b180324205e6b6 100755 --- a/bin/start_be.sh +++ b/bin/start_be.sh @@ -98,7 +98,7 @@ fi # the CLASSPATH and LIBHDFS_OPTS is used for hadoop libhdfs # and conf/ dir so that hadoop libhdfs can read .xml config file in conf/ -export CLASSPATH="${DORIS_HOME}/conf/:$DORIS_CLASSPATH" +export CLASSPATH="${DORIS_HOME}/conf/:${DORIS_CLASSPATH}" # DORIS_CLASSPATH is for self-managed jni export DORIS_CLASSPATH="-Djava.class.path=${DORIS_CLASSPATH}" @@ -256,10 +256,10 @@ if [[ -f "${DORIS_HOME}/conf/hdfs-site.xml" ]]; then export LIBHDFS3_CONF="${DORIS_HOME}/conf/hdfs-site.xml" fi -if [[ -z $JAVA_OPTS ]]; then +if [[ -z ${JAVA_OPTS} ]]; then # set default JAVA_OPTS CUR_DATE=$(date +%Y%m%d-%H%M%S) - JAVA_OPTS="-Xmx1024m -DlogPath=$DORIS_HOME/log/jni.log -Xloggc:$DORIS_HOME/log/be.gc.log.$CUR_DATE -Dsun.java.command=DorisBE -XX:-CriticalJNINatives" + JAVA_OPTS="-Xmx1024m -DlogPath=${DORIS_HOME}/log/jni.log -Xloggc:${DORIS_HOME}/log/be.gc.log.${CUR_DATE} -Dsun.java.command=DorisBE -XX:-CriticalJNINatives" if [[ "${MACHINE_OS}" == "Darwin" ]]; then JAVA_OPTS="${JAVA_OPTS} -XX:-MaxFDLimit" fi From 9d1ffb5df795129d7768a380a323bcf1d70c2fbb Mon Sep 17 00:00:00 2001 From: morningman Date: Thu, 30 Mar 2023 21:31:57 +0800 Subject: [PATCH 7/8] fix jni --- be/src/util/jni-util.cpp | 85 +++++++++++----------------------------- bin/start_be.sh | 7 ++-- 2 files changed, 27 insertions(+), 65 deletions(-) diff --git a/be/src/util/jni-util.cpp b/be/src/util/jni-util.cpp index 61020dcfc48b97..fe5a6014f3635e 100644 --- a/be/src/util/jni-util.cpp +++ b/be/src/util/jni-util.cpp @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include "common/config.h" #include "gutil/strings/substitute.h" @@ -72,79 +74,38 @@ const std::string GetDorisJNIClasspath() { int num_vms; int rv = JNI_GetCreatedJavaVMs(&g_vm, 1, &num_vms); if (rv == 0) { - JavaVMOption* options = nullptr; - auto classpath = GetDorisJNIClasspath(); - // The following 5 opts are default opts, - // they can be override by JAVA_OPTS env var. 
- std::string heap_size = fmt::format("-Xmx{}", "1g"); - std::string log_path = fmt::format("-DlogPath={}/log/jni.log", getenv("DORIS_HOME")); - std::string jvm_name = fmt::format("-Dsun.java.command={}", "DorisBE"); - std::string critical_jni = "-XX:-CriticalJNINatives"; - std::string max_fd_limit = "-XX:-MaxFDLimit"; + std::vector options; char* java_opts = getenv("JAVA_OPTS"); - Defer defer {[&]() { - if (java_opts != nullptr) { - free(java_opts); - } - if (options != nullptr) { - free(options); - } - }}; - - int no_args; if (java_opts == nullptr) { - no_args = 5; // classpath, heapsize, log path, jvm_name, critical -#ifdef __APPLE__ - no_args++; // -XX:-MaxFDLimit -#endif - options = (JavaVMOption*)calloc(no_args, sizeof(JavaVMOption)); - options[0].optionString = const_cast(classpath.c_str()); - options[1].optionString = const_cast(heap_size.c_str()); - options[2].optionString = const_cast(log_path.c_str()); - options[3].optionString = const_cast(jvm_name.c_str()); - options[4].optionString = const_cast(critical_jni.c_str()); + options = { + GetDorisJNIClasspath(), fmt::format("-Xmx{}", "1g"), + fmt::format("-DlogPath={}/log/jni.log", getenv("DORIS_HOME")), + fmt::format("-Dsun.java.command={}", "DorisBE"), "-XX:-CriticalJNINatives", #ifdef __APPLE__ - // On macOS, we should disable MaxFDLimit, otherwise the RLIMIT_NOFILE - // will be assigned the minimum of OPEN_MAX (10240) and rlim_cur (See src/hotspot/os/bsd/os_bsd.cpp) - // and it can not pass the check performed by storage engine. - // The newer JDK has fixed this issue. - options[5].optionString = const_cast(max_fd_limit.c_str()); + // On macOS, we should disable MaxFDLimit, otherwise the RLIMIT_NOFILE + // will be assigned the minimum of OPEN_MAX (10240) and rlim_cur (See src/hotspot/os/bsd/os_bsd.cpp) + // and it can not pass the check performed by storage engine. + // The newer JDK has fixed this issue. + "-XX:-MaxFDLimit" #endif + }; } else { - // user specified opts - // 1. find the number of args - java_opts = strdup(java_opts); - char *str, *token, *save_ptr; - char jvm_arg_delims[] = " "; - for (no_args = 1, str = java_opts;; no_args++, str = nullptr) { - token = strtok_r(str, jvm_arg_delims, &save_ptr); - if (token == nullptr) { - break; - } - } - free(java_opts); - // 2. set args - options = (JavaVMOption*)calloc(no_args, sizeof(JavaVMOption)); - options[0].optionString = const_cast(classpath.c_str()); - java_opts = getenv("JAVA_OPTS"); - if (java_opts != NULL) { - java_opts = strdup(java_opts); - for (no_args = 1, str = java_opts;; no_args++, str = nullptr) { - token = strtok_r(str, jvm_arg_delims, &save_ptr); - if (token == nullptr) { - break; - } - options[no_args].optionString = token; - } - } + std::istringstream stream(java_opts); + options = std::vector(std::istream_iterator {stream}, + std::istream_iterator()); + options.push_back(GetDorisJNIClasspath()); + } + std::unique_ptr jvm_options(new JavaVMOption[options.size()]); + for (int i = 0; i < options.size(); ++i) { + jvm_options[i] = {const_cast(options[i].c_str()), nullptr}; } JNIEnv* env; JavaVMInitArgs vm_args; vm_args.version = JNI_VERSION_1_8; - vm_args.options = options; - vm_args.nOptions = no_args; + vm_args.options = jvm_options.get(); + vm_args.nOptions = options.size(); // Set it to JNI_FALSE because JNI_TRUE will let JVM ignore the max size config. 
vm_args.ignoreUnrecognized = JNI_FALSE; diff --git a/bin/start_be.sh b/bin/start_be.sh index b180324205e6b6..bbaea90c022dcf 100755 --- a/bin/start_be.sh +++ b/bin/start_be.sh @@ -260,9 +260,10 @@ if [[ -z ${JAVA_OPTS} ]]; then # set default JAVA_OPTS CUR_DATE=$(date +%Y%m%d-%H%M%S) JAVA_OPTS="-Xmx1024m -DlogPath=${DORIS_HOME}/log/jni.log -Xloggc:${DORIS_HOME}/log/be.gc.log.${CUR_DATE} -Dsun.java.command=DorisBE -XX:-CriticalJNINatives" - if [[ "${MACHINE_OS}" == "Darwin" ]]; then - JAVA_OPTS="${JAVA_OPTS} -XX:-MaxFDLimit" - fi +fi + +if [[ "${MACHINE_OS}" == "Darwin" ]]; then + JAVA_OPTS="${JAVA_OPTS} -XX:-MaxFDLimit" fi # set LIBHDFS_OPTS for hadoop libhdfs From 0477c3b1e3467e304602ecbb6199daa6bc0bfce2 Mon Sep 17 00:00:00 2001 From: morningman Date: Thu, 30 Mar 2023 23:08:10 +0800 Subject: [PATCH 8/8] 1 --- be/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/be/CMakeLists.txt b/be/CMakeLists.txt index e394e0b72afc97..dd007cb2f253e2 100644 --- a/be/CMakeLists.txt +++ b/be/CMakeLists.txt @@ -772,7 +772,7 @@ set(COMMON_THIRDPARTY simdjson ) -if (ARCH_AMD64) +if (ARCH_AMD64 AND OS_LINUX) add_library(hadoop_hdfs STATIC IMPORTED) set_target_properties(hadoop_hdfs PROPERTIES IMPORTED_LOCATION ${THIRDPARTY_DIR}/lib/hadoop_hdfs/native/libhdfs.a)
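
Note on the JVM options introduced by this series (a sketch only; the specific values below are illustrative assumptions, not part of the patches): patch 3 adds a JAVA_OPTS line to conf/be.conf and makes start_be.sh export it as LIBHDFS_OPTS for the hadoop libhdfs client, while patch 7 tokenizes the same JAVA_OPTS string on whitespace (std::istringstream with std::istream_iterator) to build the option list of the self-managed JVM. Assuming be.conf is sourced by start_be.sh as the series' own be.conf change implies, a minimal override in conf/be.conf could look like:

    # illustrative values; each JVM option must be a single whitespace-separated
    # token, because jni-util.cpp splits JAVA_OPTS on whitespace
    CUR_DATE=$(date +%Y%m%d-%H%M%S)
    JAVA_OPTS="-Xmx2048m -DlogPath=${DORIS_HOME}/log/jni.log -Xloggc:${DORIS_HOME}/log/be.gc.log.${CUR_DATE} -Dsun.java.command=DorisBE -XX:-CriticalJNINatives"

start_be.sh then appends -XX:-MaxFDLimit on macOS and exports LIBHDFS_OPTS="${JAVA_OPTS}", so the JVM embedded by libhdfs sees the same settings as the one created in jni-util.cpp.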