Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions be/src/io/fs/err_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,5 +53,25 @@ std::string hdfs_error() {
return ss.str();
}

std::string glob_err_to_str(int code) {
std::string msg;
// https://sites.uclouvain.be/SystInfo/usr/include/glob.h.html
switch (code) {
case 1:
msg = "Ran out of memory";
break;
case 2:
msg = "read error";
break;
case 3:
msg = "No matches found";
break;
default:
msg = "unknown";
break;
}
return fmt::format("({}), {}", code, msg);
}

} // namespace io
} // namespace doris
1 change: 1 addition & 0 deletions be/src/io/fs/err_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ namespace io {
std::string errno_to_str();
std::string errcode_to_str(const std::error_code& ec);
std::string hdfs_error();
std::string glob_err_to_str(int code);

} // namespace io
} // namespace doris
50 changes: 50 additions & 0 deletions be/src/io/fs/local_file_system.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

#include <fcntl.h>
#include <fmt/format.h>
#include <glob.h>
#include <glog/logging.h>
#include <openssl/md5.h>
#include <sys/mman.h>
Expand Down Expand Up @@ -428,5 +429,54 @@ const std::shared_ptr<LocalFileSystem>& global_local_filesystem() {
return local_fs;
}

Status LocalFileSystem::canonicalize_local_file(const std::string& dir,
const std::string& file_path,
std::string* full_path) {
const std::string absolute_path = dir + "/" + file_path;
std::string canonical_path;
RETURN_IF_ERROR(canonicalize(absolute_path, &canonical_path));
if (!contain_path(dir, canonical_path)) {
return Status::InvalidArgument("file path is not allowed: {}", canonical_path);
}

*full_path = canonical_path;
return Status::OK();
}

Status LocalFileSystem::safe_glob(const std::string& path, std::vector<FileInfo>* res) {
if (path.find("..") != std::string::npos) {
return Status::InvalidArgument("can not contain '..' in path");
}
std::string full_path = config::user_files_secure_path + "/" + path;
std::vector<std::string> files;
RETURN_IF_ERROR(_glob(full_path, &files));
for (auto& file : files) {
FileInfo fi;
fi.is_file = true;
RETURN_IF_ERROR(canonicalize_local_file("", file, &(fi.file_name)));
RETURN_IF_ERROR(file_size_impl(fi.file_name, &(fi.file_size)));
res->push_back(std::move(fi));
}
return Status::OK();
}

Status LocalFileSystem::_glob(const std::string& pattern, std::vector<std::string>* res) {
glob_t glob_result;
memset(&glob_result, 0, sizeof(glob_result));

int rc = glob(pattern.c_str(), GLOB_TILDE, NULL, &glob_result);
if (rc != 0) {
globfree(&glob_result);
return Status::IOError("failed to glob {}: {}", pattern, glob_err_to_str(rc));
}

for (size_t i = 0; i < glob_result.gl_pathc; ++i) {
res->push_back(std::string(glob_result.gl_pathv[i]));
}

globfree(&glob_result);
return Status::OK();
}

} // namespace io
} // namespace doris
11 changes: 11 additions & 0 deletions be/src/io/fs/local_file_system.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,15 @@ class LocalFileSystem final : public FileSystem {
// read local file and save content to "content"
Status read_file_to_string(const Path& file, std::string* content);

Status canonicalize_local_file(const std::string& dir, const std::string& file_path,
std::string* full_path);

// glob list the files match the path pattern.
// the result will be saved in "res", in absolute path with file size.
// "safe" means the path will be concat with the path prefix config::user_files_secure_path,
// so that it can not list any files outside the config::user_files_secure_path
Status safe_glob(const std::string& path, std::vector<FileInfo>* res);

protected:
Status create_file_impl(const Path& file, FileWriterPtr* writer) override;
Status open_file_impl(const FileDescription& file_desc, const Path& abs_path,
Expand All @@ -97,6 +106,8 @@ class LocalFileSystem final : public FileSystem {
Status delete_directory_or_file_impl(const Path& path);

private:
// a wrapper for glob(), return file list in "res"
Status _glob(const std::string& pattern, std::vector<std::string>* res);
LocalFileSystem(Path&& root_path, std::string&& id = "");
};

Expand Down
1 change: 0 additions & 1 deletion be/src/olap/tablet_schema_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,6 @@ void TabletSchemaCache::_recycle() {
}
}
_is_stopped = true;
LOG(INFO) << "xxx yyy stopped ";
}

} // namespace doris
22 changes: 22 additions & 0 deletions be/src/service/internal_service.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
#include "common/status.h"
#include "gutil/integral_types.h"
#include "http/http_client.h"
#include "io/fs/local_file_system.h"
#include "io/fs/stream_load_pipe.h"
#include "io/io_common.h"
#include "olap/data_dir.h"
Expand Down Expand Up @@ -1608,4 +1609,25 @@ void PInternalServiceImpl::get_tablet_rowset_versions(google::protobuf::RpcContr
ExecEnv::GetInstance()->storage_engine()->get_tablet_rowset_versions(request, response);
}

void PInternalServiceImpl::glob(google::protobuf::RpcController* controller,
const PGlobRequest* request, PGlobResponse* response,
google::protobuf::Closure* done) {
bool ret = _heavy_work_pool.try_offer([request, response, done]() {
brpc::ClosureGuard closure_guard(done);
std::vector<io::FileInfo> files;
Status st = io::global_local_filesystem()->safe_glob(request->pattern(), &files);
if (st.ok()) {
for (auto& file : files) {
PGlobResponse_PFileInfo* pfile = response->add_files();
pfile->set_file(file.file_name);
pfile->set_size(file.file_size);
}
}
st.to_protobuf(response->mutable_status());
});
if (!ret) {
offer_failed(response, done, _heavy_work_pool);
}
}

} // namespace doris
3 changes: 3 additions & 0 deletions be/src/service/internal_service.h
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,9 @@ class PInternalServiceImpl : public PBackendService {
PGetTabletVersionsResponse* response,
google::protobuf::Closure* done) override;

void glob(google::protobuf::RpcController* controller, const PGlobRequest* request,
PGlobResponse* response, google::protobuf::Closure* done) override;

private:
void _exec_plan_fragment_in_pthread(google::protobuf::RpcController* controller,
const PExecPlanFragmentRequest* request,
Expand Down
30 changes: 30 additions & 0 deletions be/test/io/fs/local_file_system_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -610,4 +610,34 @@ TEST_F(LocalFileSystemTest, TestRandomWrite) {
EXPECT_TRUE(file_reader->close().ok());
}
}

TEST_F(LocalFileSystemTest, TestGlob) {
std::string path = "./be/ut_build_ASAN/test/file_path/";
EXPECT_TRUE(io::global_local_filesystem()->delete_directory(path).ok());
EXPECT_TRUE(io::global_local_filesystem()
->create_directory("./be/ut_build_ASAN/test/file_path/1")
.ok());
EXPECT_TRUE(io::global_local_filesystem()
->create_directory("./be/ut_build_ASAN/test/file_path/2")
.ok());
EXPECT_TRUE(io::global_local_filesystem()
->create_directory("./be/ut_build_ASAN/test/file_path/3")
.ok());

save_string_file("./be/ut_build_ASAN/test/file_path/1/f1.txt", "just test");
save_string_file("./be/ut_build_ASAN/test/file_path/1/f2.txt", "just test");
save_string_file("./be/ut_build_ASAN/test/file_path/f3.txt", "just test");

std::vector<io::FileInfo> files;
EXPECT_FALSE(io::global_local_filesystem()->safe_glob("./../*.txt", &files).ok());
EXPECT_FALSE(io::global_local_filesystem()->safe_glob("/*.txt", &files).ok());
EXPECT_TRUE(io::global_local_filesystem()->safe_glob("./file_path/1/*.txt", &files).ok());
EXPECT_EQ(2, files.size());
files.clear();
EXPECT_TRUE(io::global_local_filesystem()->safe_glob("./file_path/*/*.txt", &files).ok());
EXPECT_EQ(2, files.size());

EXPECT_TRUE(io::global_local_filesystem()->delete_directory(path).ok());
}

} // namespace doris
5 changes: 5 additions & 0 deletions docs/en/docs/admin-manual/config/be-config.md
Original file line number Diff line number Diff line change
Expand Up @@ -1465,3 +1465,8 @@ Indicates how many tablets failed to load in the data directory. At the same tim

* Description: If true, when the process does not exceed the soft mem limit, the query memory will not be limited; when the process memory exceeds the soft mem limit, the query with the largest ratio between the currently used memory and the exec_mem_limit will be canceled. If false, cancel query when the memory used exceeds exec_mem_limit.
* Default value: true

#### `user_files_secure_path`

* Description: The storage directory for files queried by `local` table valued functions.
* Default value: `${DORIS_HOME}`
150 changes: 150 additions & 0 deletions docs/en/docs/sql-manual/sql-functions/table-functions/local.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
---
{
"title": "local",
"language": "en"
}
---

<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->

## Local

### Name

<version since="dev">

local

</version>

### Description

Local table-valued-function(tvf), allows users to read and access local file contents on be node, just like accessing relational table. Currently supports `csv/csv_with_names/csv_with_names_and_types/json/parquet/orc` file format.

It needs `ADMIN` privilege to use.

#### syntax

```sql
local(
"file_path" = "path/to/file.txt",
"backend_id" = "be_id",
"format" = "csv",
"keyn" = "valuen"
...
);
```

**parameter description**

Related parameters for accessing local file on be node:

- `file_path`:

(required) The path of the file to be read, which is a relative path to the `user_files_secure_path` directory, where `user_files_secure_path` parameter [can be configured on be](../../../admin-manual/config/be-config.md).

Can not contains `..` in path. Support using glob syntax to match multi files, such as `log/*.log`

- `backend_id`:

(required) The backend id where the file resides. The `backend_id` can be obtained by `show backends` command.

File format parameters:

- `format`: (required) Currently support `csv/csv_with_names/csv_with_names_and_types/json/parquet/orc`
- `column_separator`: (optional) default `,`.
- `line_delimiter`: (optional) default `\n`.
- `compress_type`: (optional) Currently support `UNKNOWN/PLAIN/GZ/LZO/BZ2/LZ4FRAME/DEFLATE`. Default value is `UNKNOWN`, it will automatically infer the type based on the suffix of `uri`.

The following 6 parameters are used for loading in json format. For specific usage methods, please refer to: [Json Load](../../../data-operate/import/import-way/load-json-format.md)

- `read_json_by_line`: (optional) default `"true"`
- `strip_outer_array`: (optional) default `"false"`
- `json_root`: (optional) default `""`
- `json_paths`: (optional) default `""`
- `num_as_string`: (optional) default `false`
- `fuzzy_parse`: (optional) default `false`

<version since="dev">The following 2 parameters are used for loading in csv format</version>

- `trim_double_quotes`: Boolean type (optional), the default value is `false`. True means that the outermost double quotes of each field in the csv file are trimmed.
- `skip_lines`: Integer type (optional), the default value is 0. It will skip some lines in the head of csv file. It will be disabled when the format is `csv_with_names` or `csv_with_names_and_types`.

### Examples

Analyze the log file on specified BE:

```sql
mysql> select * from local(
"file_path" = "log/be.out",
"backend_id" = "10006",
"format" = "csv")
where c1 like "%start_time%" limit 10;
+--------------------------------------------------------+
| c1 |
+--------------------------------------------------------+
| start time: 2023年 08月 07日 星期一 23:20:32 CST |
| start time: 2023年 08月 07日 星期一 23:32:10 CST |
| start time: 2023年 08月 08日 星期二 00:20:50 CST |
| start time: 2023年 08月 08日 星期二 00:29:15 CST |
+--------------------------------------------------------+
```

Read and access csv format files located at path `${DORIS_HOME}/student.csv`:

```sql
mysql> select * from local(
"file_path" = "student.csv",
"backend_id" = "10003",
"format" = "csv");
+------+---------+--------+
| c1 | c2 | c3 |
+------+---------+--------+
| 1 | alice | 18 |
| 2 | bob | 20 |
| 3 | jack | 24 |
| 4 | jackson | 19 |
| 5 | liming | d18 |
+------+---------+--------+
```

Can be used with `desc function` :

```sql
mysql> desc function local(
"file_path" = "student.csv",
"backend_id" = "10003",
"format" = "csv");
+-------+------+------+-------+---------+-------+
| Field | Type | Null | Key | Default | Extra |
+-------+------+------+-------+---------+-------+
| c1 | TEXT | Yes | false | NULL | NONE |
| c2 | TEXT | Yes | false | NULL | NONE |
| c3 | TEXT | Yes | false | NULL | NONE |
+-------+------+------+-------+---------+-------+
```

### Keywords

local, table-valued-function, tvf

### Best Practice

For more detailed usage of local tvf, please refer to [S3](./s3.md) tvf, The only difference between them is the way of accessing the storage system.
1 change: 1 addition & 0 deletions docs/sidebars.json
Original file line number Diff line number Diff line change
Expand Up @@ -700,6 +700,7 @@
"sql-manual/sql-functions/table-functions/s3",
"sql-manual/sql-functions/table-functions/hdfs",
"sql-manual/sql-functions/table-functions/iceberg_meta",
"sql-manual/sql-functions/table-functions/local",
"sql-manual/sql-functions/table-functions/backends",
"sql-manual/sql-functions/table-functions/frontends",
"sql-manual/sql-functions/table-functions/workload-group",
Expand Down
5 changes: 5 additions & 0 deletions docs/zh-CN/docs/admin-manual/config/be-config.md
Original file line number Diff line number Diff line change
Expand Up @@ -1494,3 +1494,8 @@ load tablets from header failed, failed tablets size: xxx, path=xxx

* 描述: 如果为true,则当内存未超过 exec_mem_limit 时,查询内存将不受限制;当进程内存超过 exec_mem_limit 且大于 2GB 时,查询会被取消。如果为false,则在使用的内存超过 exec_mem_limit 时取消查询。
* 默认值: true

#### `user_files_secure_path`

* 描述: `local` 表函数查询的文件的存储目录。
* 默认值: `${DORIS_HOME}`
Loading