From ff3dfd3e340618329860fe45a586d763603f17da Mon Sep 17 00:00:00 2001 From: fulmicoton Date: Thu, 12 Feb 2026 11:56:40 +0100 Subject: [PATCH] Remove unused multilang tokenizer feature The multilang feature pulls heavy dependencies (lindera, whichlang) for little apparent usage. Remove it along with associated code, benchmarks, and tests to reduce build times and binary size. --- LICENSE-3rdparty.csv | 32 -- quickwit/Cargo.lock | 373 ------------------ quickwit/Cargo.toml | 12 - quickwit/quickwit-cli/Cargo.toml | 8 - quickwit/quickwit-doc-mapper/Cargo.toml | 3 +- .../src/doc_mapper/field_mapping_entry.rs | 4 +- .../quickwit-doc-mapper/src/doc_mapper/mod.rs | 53 --- .../src/doc_mapper/tokenizer_entry.rs | 6 - quickwit/quickwit-query/Cargo.toml | 17 - .../benches/multilang_tokenizers_bench.rs | 167 -------- quickwit/quickwit-query/src/lib.rs | 2 - .../src/query_ast/wildcard_query.rs | 4 - quickwit/quickwit-query/src/tokenizers/mod.rs | 24 -- .../src/tokenizers/multilang.rs | 334 ---------------- 14 files changed, 3 insertions(+), 1036 deletions(-) delete mode 100644 quickwit/quickwit-query/benches/multilang_tokenizers_bench.rs delete mode 100644 quickwit/quickwit-query/src/tokenizers/multilang.rs diff --git a/LICENSE-3rdparty.csv b/LICENSE-3rdparty.csv index 16b05826e54..594ac812cf7 100644 --- a/LICENSE-3rdparty.csv +++ b/LICENSE-3rdparty.csv @@ -53,7 +53,6 @@ base16ct,https://github.com/RustCrypto/formats/tree/master/base16ct,Apache-2.0 O base64,https://github.com/marshallpierce/rust-base64,MIT OR Apache-2.0,Marshall Pierce base64-simd,https://github.com/Nugine/simd,MIT,The base64-simd Authors base64ct,https://github.com/RustCrypto/formats,Apache-2.0 OR MIT,RustCrypto Developers -bincode,https://github.com/servo/bincode,MIT,"Ty Overby , Francesco Mazzoli , David Tolnay , Zoey Riordan " bit-set,https://github.com/contain-rs/bit-set,Apache-2.0 OR MIT,Alexis Beingessner bit-vec,https://github.com/contain-rs/bit-vec,Apache-2.0 OR MIT,Alexis Beingessner bitflags,https://github.com/bitflags/bitflags,MIT OR Apache-2.0,The Rust Project Developers @@ -108,8 +107,6 @@ crossbeam-utils,https://github.com/crossbeam-rs/crossbeam,MIT OR Apache-2.0,The crunchy,https://github.com/eira-fransham/crunchy,MIT,Eira Fransham crypto-bigint,https://github.com/RustCrypto/crypto-bigint,Apache-2.0 OR MIT,RustCrypto Developers crypto-common,https://github.com/RustCrypto/traits,MIT OR Apache-2.0,RustCrypto Developers -csv,https://github.com/BurntSushi/rust-csv,Unlicense OR MIT,Andrew Gallant -csv-core,https://github.com/BurntSushi/rust-csv,Unlicense OR MIT,Andrew Gallant darling,https://github.com/TedDriggs/darling,MIT,Ted Driggs darling_core,https://github.com/TedDriggs/darling,MIT,Ted Driggs darling_macro,https://github.com/TedDriggs/darling,MIT,Ted Driggs @@ -135,15 +132,7 @@ elliptic-curve,https://github.com/RustCrypto/traits/tree/master/elliptic-curve,A embedded-io,https://github.com/embassy-rs/embedded-io,MIT OR Apache-2.0,The embedded-io Authors embedded-io,https://github.com/rust-embedded/embedded-hal,MIT OR Apache-2.0,The embedded-io Authors encode_unicode,https://github.com/tormol/encode_unicode,Apache-2.0 OR MIT,Torbjørn Birch Moltu -encoding,https://github.com/lifthrasiir/rust-encoding,MIT,Kang Seonghoon -encoding-index-japanese,https://github.com/lifthrasiir/rust-encoding,CC0-1.0,Kang Seonghoon -encoding-index-korean,https://github.com/lifthrasiir/rust-encoding,CC0-1.0,Kang Seonghoon -encoding-index-simpchinese,https://github.com/lifthrasiir/rust-encoding,CC0-1.0,Kang Seonghoon -encoding-index-singlebyte,https://github.com/lifthrasiir/rust-encoding,CC0-1.0,Kang Seonghoon -encoding-index-tradchinese,https://github.com/lifthrasiir/rust-encoding,CC0-1.0,Kang Seonghoon -encoding_index_tests,https://github.com/lifthrasiir/rust-encoding,CC0-1.0,Kang Seonghoon encoding_rs,https://github.com/hsivonen/encoding_rs,(Apache-2.0 OR MIT) AND BSD-3-Clause,Henri Sivonen -encoding_rs_io,https://github.com/BurntSushi/encoding_rs_io,MIT OR Apache-2.0,Andrew Gallant enum-iterator,https://github.com/stephaneyfx/enum-iterator,0BSD,Stephane Raux enum-iterator-derive,https://github.com/stephaneyfx/enum-iterator,0BSD,Stephane Raux env_filter,https://github.com/rust-cli/env_logger,MIT OR Apache-2.0,The env_filter Authors @@ -156,7 +145,6 @@ fail,https://github.com/tikv/fail-rs,Apache-2.0,The TiKV Project Developers fastdivide,https://github.com/fulmicoton/fastdivide,zlib-acknowledgement OR MIT,Paul Masurel fastrand,https://github.com/smol-rs/fastrand,Apache-2.0 OR MIT,Stjepan Glavina ff,https://github.com/zkcrypto/ff,MIT OR Apache-2.0,"Sean Bowe , Jack Grigg " -filetime,https://github.com/alexcrichton/filetime,MIT OR Apache-2.0,Alex Crichton find-msvc-tools,https://github.com/rust-lang/cc-rs,MIT OR Apache-2.0,The find-msvc-tools Authors fixedbitset,https://github.com/petgraph/fixedbitset,MIT OR Apache-2.0,bluss flate2,https://github.com/rust-lang/flate2-rs,MIT OR Apache-2.0,"Alex Crichton , Josh Triplett " @@ -231,8 +219,6 @@ is-terminal,https://github.com/sunfishcode/is-terminal,MIT,"softprops -jiff,https://github.com/BurntSushi/jiff,Unlicense OR MIT,Andrew Gallant -jiff-static,https://github.com/BurntSushi/jiff,Unlicense OR MIT,Andrew Gallant jobserver,https://github.com/rust-lang/jobserver-rs,MIT OR Apache-2.0,Alex Crichton js-sys,https://github.com/wasm-bindgen/wasm-bindgen/tree/master/crates/js-sys,MIT OR Apache-2.0,The wasm-bindgen Developers json_comments,https://github.com/tmccombs/json-comments-rs,Apache-2.0,Thayne McCombs @@ -240,19 +226,6 @@ lazy_static,https://github.com/rust-lang-nursery/lazy-static.rs,MIT OR Apache-2. levenshtein_automata,https://github.com/tantivy-search/levenshtein-automata,MIT,Paul Masurel libc,https://github.com/rust-lang/libc,MIT OR Apache-2.0,The Rust Project Developers libm,https://github.com/rust-lang/compiler-builtins,MIT,Jorge Aparicio -libredox,https://gitlab.redox-os.org/redox-os/libredox,MIT,4lDO2 <4lDO2@protonmail.com> -lindera-cc-cedict,https://github.com/lindera-morphology/lindera,MIT,The lindera-cc-cedict Authors -lindera-cc-cedict-builder,https://github.com/lindera-morphology/lindera,MIT,The lindera-cc-cedict-builder Authors -lindera-core,https://github.com/lindera-morphology/lindera,MIT,The lindera-core Authors -lindera-decompress,https://github.com/lindera-morphology/lindera,MIT,The lindera-decompress Authors -lindera-dictionary,https://github.com/lindera-morphology/lindera,MIT,The lindera-dictionary Authors -lindera-ipadic,https://github.com/lindera-morphology/lindera,MIT,The lindera-ipadic Authors -lindera-ipadic-builder,https://github.com/lindera-morphology/lindera,MIT,The lindera-ipadic-builder Authors -lindera-ipadic-neologd-builder,https://github.com/lindera-morphology/lindera,MIT,The lindera-ipadic-neologd-builder Authors -lindera-ko-dic,https://github.com/lindera-morphology/lindera,MIT,The lindera-ko-dic Authors -lindera-ko-dic-builder,https://github.com/lindera-morphology/lindera,MIT,The lindera-ko-dic-builder Authors -lindera-tokenizer,https://github.com/lindera-morphology/lindera,MIT,The lindera-tokenizer Authors -lindera-unidic-builder,https://github.com/lindera-morphology/lindera,MIT,The lindera-unidic-builder Authors linked-hash-map,https://github.com/contain-rs/linked-hash-map,MIT OR Apache-2.0,"Stepan Koltsov , Andrew Paseltiner " linux-raw-sys,https://github.com/sunfishcode/linux-raw-sys,Apache-2.0 WITH LLVM-exception OR Apache-2.0 OR MIT,Dan Gohman litemap,https://github.com/unicode-org/icu4x,Unicode-3.0,The ICU4X Project Developers @@ -339,7 +312,6 @@ pnet_packet,https://github.com/libpnet/libpnet,MIT OR Apache-2.0,Robert Clipsham pnet_sys,https://github.com/libpnet/libpnet,MIT OR Apache-2.0,"Robert Clipsham , Linus Färnstrand " pnet_transport,https://github.com/libpnet/libpnet,MIT OR Apache-2.0,Robert Clipsham portable-atomic,https://github.com/taiki-e/portable-atomic,Apache-2.0 OR MIT,The portable-atomic Authors -portable-atomic-util,https://github.com/taiki-e/portable-atomic,Apache-2.0 OR MIT,The portable-atomic-util Authors postcard,https://github.com/jamesmunns/postcard,MIT OR Apache-2.0,James Munns potential_utf,https://github.com/unicode-org/icu4x,Unicode-3.0,The ICU4X Project Developers powerfmt,https://github.com/jhpratt/powerfmt,MIT OR Apache-2.0,Jacob Pratt @@ -396,7 +368,6 @@ roxmltree,https://github.com/RazrFalcon/roxmltree,MIT OR Apache-2.0,Evgeniy Reiz rust-embed,https://pyrossh.dev/repos/rust-embed,MIT,pyrossh rust-embed-impl,https://pyrossh.dev/repos/rust-embed,MIT,pyrossh rust-embed-utils,https://pyrossh.dev/repos/rust-embed,MIT,pyrossh -rust-stemmers,https://github.com/CurrySoftware/rust-stemmers,MIT OR BSD-3-Clause,"Jakob Demler , CurrySoftware " rustc-hash,https://github.com/rust-lang/rustc-hash,Apache-2.0 OR MIT,The Rust Project Developers rustix,https://github.com/bytecodealliance/rustix,Apache-2.0 WITH LLVM-exception OR Apache-2.0 OR MIT,"Dan Gohman , Jakub Konka " rustls,https://github.com/rustls/rustls,Apache-2.0 OR ISC OR MIT,The rustls Authors @@ -554,7 +525,6 @@ wasmtimer,https://github.com/whizsid/wasmtimer-rs,MIT,"WhizSid web-sys,https://github.com/wasm-bindgen/wasm-bindgen/tree/master/crates/web-sys,MIT OR Apache-2.0,The wasm-bindgen Developers web-time,https://github.com/daxpedda/web-time,MIT OR Apache-2.0,The web-time Authors webpki-roots,https://github.com/rustls/webpki-roots,CDLA-Permissive-2.0,The webpki-roots Authors -whichlang,https://github.com/quickwit-oss/whichlang,MIT,"Quickwit, Inc. " winapi,https://github.com/retep998/winapi-rs,MIT,Peter Atashian winapi,https://github.com/retep998/winapi-rs,MIT OR Apache-2.0,Peter Atashian winapi-i686-pc-windows-gnu,https://github.com/retep998/winapi-rs,MIT OR Apache-2.0,Peter Atashian @@ -598,9 +568,7 @@ windows_x86_64_msvc,https://github.com/microsoft/windows-rs,MIT OR Apache-2.0,Th winnow,https://github.com/winnow-rs/winnow,MIT,The winnow Authors wit-bindgen,https://github.com/bytecodealliance/wit-bindgen,Apache-2.0 WITH LLVM-exception OR Apache-2.0 OR MIT,Alex Crichton writeable,https://github.com/unicode-org/icu4x,Unicode-3.0,The ICU4X Project Developers -xattr,https://github.com/Stebalien/xattr,MIT OR Apache-2.0,Steven Allen xmlparser,https://github.com/RazrFalcon/xmlparser,MIT OR Apache-2.0,Yevhenii Reizner -yada,https://github.com/takuyaa/yada,MIT OR Apache-2.0,Takuya Asano yansi,https://github.com/SergioBenitez/yansi,MIT OR Apache-2.0,Sergio Benitez yoke,https://github.com/unicode-org/icu4x,Unicode-3.0,Manish Goregaokar yoke-derive,https://github.com/unicode-org/icu4x,Unicode-3.0,Manish Goregaokar diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock index 4a2e43ee09a..21f1a789bec 100644 --- a/quickwit/Cargo.lock +++ b/quickwit/Cargo.lock @@ -1186,15 +1186,6 @@ version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7d809780667f4410e7c41b07f52439b94d2bdf8528eeedc287fa38d3b7f95d82" -[[package]] -name = "bincode" -version = "1.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" -dependencies = [ - "serde", -] - [[package]] name = "bindgen" version = "0.72.1" @@ -2654,70 +2645,6 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" -[[package]] -name = "encoding" -version = "0.2.33" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec" -dependencies = [ - "encoding-index-japanese", - "encoding-index-korean", - "encoding-index-simpchinese", - "encoding-index-singlebyte", - "encoding-index-tradchinese", -] - -[[package]] -name = "encoding-index-japanese" -version = "1.20141219.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91" -dependencies = [ - "encoding_index_tests", -] - -[[package]] -name = "encoding-index-korean" -version = "1.20141219.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81" -dependencies = [ - "encoding_index_tests", -] - -[[package]] -name = "encoding-index-simpchinese" -version = "1.20141219.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d87a7194909b9118fc707194baa434a4e3b0fb6a5a757c73c3adb07aa25031f7" -dependencies = [ - "encoding_index_tests", -] - -[[package]] -name = "encoding-index-singlebyte" -version = "1.20141219.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a" -dependencies = [ - "encoding_index_tests", -] - -[[package]] -name = "encoding-index-tradchinese" -version = "1.20141219.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18" -dependencies = [ - "encoding_index_tests", -] - -[[package]] -name = "encoding_index_tests" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569" - [[package]] name = "encoding_rs" version = "0.8.35" @@ -2727,15 +2654,6 @@ dependencies = [ "cfg-if", ] -[[package]] -name = "encoding_rs_io" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cc3c5651fb62ab8aa3103998dade57efdd028544bd300516baa31840c252a83" -dependencies = [ - "encoding_rs", -] - [[package]] name = "enum-iterator" version = "2.3.0" @@ -2763,7 +2681,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1bf3c259d255ca70051b30e2e95b5446cdb8949ac4cd22c0d7fd634d89f568e2" dependencies = [ "log", - "regex", ] [[package]] @@ -2775,7 +2692,6 @@ dependencies = [ "anstream", "anstyle", "env_filter", - "jiff", "log", ] @@ -2957,18 +2873,6 @@ version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "28dea519a9695b9977216879a3ebfddf92f1c08c05d984f8996aecd6ecdc811d" -[[package]] -name = "filetime" -version = "0.2.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc0505cd1b6fa6580283f6bdf70a73fcf4aba1184038c90902b92b3dd0df63ed" -dependencies = [ - "cfg-if", - "libc", - "libredox", - "windows-sys 0.60.2", -] - [[package]] name = "find-msvc-tools" version = "0.1.6" @@ -4503,219 +4407,6 @@ dependencies = [ "vcpkg", ] -[[package]] -name = "lindera-cc-cedict" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7595a377b9723e837711366721b02662dac64d734af3dac1c01941e779e95a6b" -dependencies = [ - "bincode", - "byteorder", - "encoding", - "flate2", - "lindera-cc-cedict-builder", - "lindera-core", - "once_cell", - "tar", - "ureq", -] - -[[package]] -name = "lindera-cc-cedict-builder" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c6fbd76a65b5df73574898e871d7cff3e34bf89f544f6e1a1087cba82e25cce" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "csv", - "encoding", - "env_logger", - "glob", - "lindera-core", - "lindera-decompress", - "log", - "yada", -] - -[[package]] -name = "lindera-core" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85af015d15c25cb3b7af82ba181908f4afbec6a2636f0fdfcca6d173c1b2c7fe" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "encoding_rs", - "log", - "once_cell", - "serde", - "thiserror 1.0.69", - "yada", -] - -[[package]] -name = "lindera-decompress" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3dfc054b2f3f3eb21a24ce062a3d5f969339ddf50652038ea33993b1b97d4ba" -dependencies = [ - "anyhow", - "flate2", - "serde", -] - -[[package]] -name = "lindera-dictionary" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6b1a5d8f4cba37dcca18dc0e827233ff46695a6d878d716f16f755d264d588a" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "lindera-cc-cedict", - "lindera-cc-cedict-builder", - "lindera-core", - "lindera-ipadic", - "lindera-ipadic-builder", - "lindera-ipadic-neologd-builder", - "lindera-ko-dic", - "lindera-ko-dic-builder", - "lindera-unidic-builder", - "serde", -] - -[[package]] -name = "lindera-ipadic" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e5f1d26aba22d8a9193dcd2d087205d89e0ffb19490bc305b341e25c037f353" -dependencies = [ - "bincode", - "byteorder", - "encoding", - "flate2", - "lindera-core", - "lindera-ipadic-builder", - "once_cell", - "tar", - "ureq", -] - -[[package]] -name = "lindera-ipadic-builder" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "184a9769b05ae857bd55f5e8a94b2ae2ba8816c5c6b78c73f161b4d7490c0461" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "csv", - "encoding_rs", - "encoding_rs_io", - "env_logger", - "glob", - "lindera-core", - "lindera-decompress", - "log", - "serde", - "yada", -] - -[[package]] -name = "lindera-ipadic-neologd-builder" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b8cd28b5402425184d0f719d5bd81af87a7e36e2032b5bcceddf55011b1b22c" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "csv", - "encoding_rs", - "encoding_rs_io", - "env_logger", - "glob", - "lindera-core", - "lindera-decompress", - "log", - "serde", - "yada", -] - -[[package]] -name = "lindera-ko-dic" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a6d718720a28ac5d93b449661d8844f7858b2b71595e3198bc90e437f01e5ce" -dependencies = [ - "bincode", - "byteorder", - "encoding", - "flate2", - "lindera-core", - "lindera-ko-dic-builder", - "once_cell", - "tar", - "ureq", -] - -[[package]] -name = "lindera-ko-dic-builder" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f22de1fcdc33de258037145ae86686125214206b98d04c6dfe01f36c136c0022" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "csv", - "encoding", - "env_logger", - "glob", - "lindera-core", - "lindera-decompress", - "log", - "yada", -] - -[[package]] -name = "lindera-tokenizer" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cca45cbc1af512ce2aa9dea9a1d694430480a53bb53e37165ba143e27e81f7dd" -dependencies = [ - "bincode", - "lindera-core", - "lindera-dictionary", - "once_cell", - "serde", - "serde_json", -] - -[[package]] -name = "lindera-unidic-builder" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "359425c8dff54164ff1b068122d26df358ce18533e4771eb5c5ce68888d988f2" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "csv", - "encoding", - "env_logger", - "glob", - "lindera-core", - "lindera-decompress", - "log", - "yada", -] - [[package]] name = "linked-hash-map" version = "0.5.6" @@ -6835,7 +6526,6 @@ dependencies = [ "quickwit-cluster", "quickwit-common", "quickwit-config", - "quickwit-doc-mapper", "quickwit-index-management", "quickwit-indexing", "quickwit-ingest", @@ -7444,9 +7134,6 @@ dependencies = [ "bitpacking", "criterion", "hex", - "lindera-core", - "lindera-dictionary", - "lindera-tokenizer", "once_cell", "proptest", "quickwit-common", @@ -7462,7 +7149,6 @@ dependencies = [ "thiserror 2.0.17", "time", "tracing", - "whichlang", ] [[package]] @@ -8312,16 +7998,6 @@ dependencies = [ "walkdir", ] -[[package]] -name = "rust-stemmers" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e46a2036019fdb888131db7a4c847a1063a7493f971ed94ea82c67eada63ca54" -dependencies = [ - "serde", - "serde_derive", -] - [[package]] name = "rust_decimal" version = "1.39.0" @@ -9638,7 +9314,6 @@ dependencies = [ "oneshot", "rayon", "regex", - "rust-stemmers", "rustc-hash", "serde", "serde_json", @@ -9749,17 +9424,6 @@ dependencies = [ "serde", ] -[[package]] -name = "tar" -version = "0.4.44" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d863878d212c87a19c1a610eb53bb01fe12951c0501cf5a0d65f724914a667a" -dependencies = [ - "filetime", - "libc", - "xattr", -] - [[package]] name = "tempfile" version = "3.24.0" @@ -10624,21 +10288,6 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" -[[package]] -name = "ureq" -version = "2.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02d1a66277ed75f640d608235660df48c8e3c19f3b4edb6a263315626cc3c01d" -dependencies = [ - "base64 0.22.1", - "log", - "once_cell", - "rustls 0.23.36", - "rustls-pki-types", - "url", - "webpki-roots 0.26.11", -] - [[package]] name = "url" version = "2.5.8" @@ -11112,12 +10761,6 @@ dependencies = [ "rustls-pki-types", ] -[[package]] -name = "whichlang" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b9aa3ad29c3d08283ac6b769e3ec15ad1ddb88af7d2e9bc402c574973b937e7" - [[package]] name = "whoami" version = "1.6.1" @@ -11613,16 +11256,6 @@ version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" -[[package]] -name = "xattr" -version = "1.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32e45ad4206f6d2479085147f02bc2ef834ac85886624a23575ae137c8aa8156" -dependencies = [ - "libc", - "rustix 1.1.3", -] - [[package]] name = "xmlparser" version = "0.13.6" @@ -11635,12 +11268,6 @@ version = "0.8.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3" -[[package]] -name = "yada" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aed111bd9e48a802518765906cbdadf0b45afb72b9c81ab049a3b86252adffdd" - [[package]] name = "yansi" version = "1.0.1" diff --git a/quickwit/Cargo.toml b/quickwit/Cargo.toml index eefbac9159f..c2e8ec62dc3 100644 --- a/quickwit/Cargo.toml +++ b/quickwit/Cargo.toml @@ -144,17 +144,6 @@ indicatif = "0.18" itertools = "0.14" json_comments = "0.2" libz-sys = "1.1" -# Lindera tokenizer 0.30+ versions (tested up to 0.32.3) are currently broken due to upstream build failures. -# The dictionary crates attempt to download artifacts from S3 URLs that return 404 Not Found. -# Version 0.29.0 is the latest version that builds correctly. It also explicitly depends on lindera-core 0.29 -# and lindera-dictionary 0.29. -lindera-core = "0.29" -lindera-dictionary = "0.29" -lindera-tokenizer = { version = "0.29", features = [ - "cc-cedict", - "ipadic", - "ko-dic", -] } lru = "0.16" matches = "0.1" md5 = "0.8" @@ -307,7 +296,6 @@ vrl = { version = "0.29", default-features = false, features = [ "value", ] } warp = { version = "0.4", features = ["server", "test"] } -whichlang = "0.1" wiremock = "0.6" zstd = { version = "0.13", default-features = false } diff --git a/quickwit/quickwit-cli/Cargo.toml b/quickwit/quickwit-cli/Cargo.toml index c595cb7e90a..8819d92ec97 100644 --- a/quickwit/quickwit-cli/Cargo.toml +++ b/quickwit/quickwit-cli/Cargo.toml @@ -59,7 +59,6 @@ quickwit-actors = { workspace = true } quickwit-cluster = { workspace = true } quickwit-common = { workspace = true } quickwit-config = { workspace = true } -quickwit-doc-mapper = { workspace = true } quickwit-index-management = { workspace = true } quickwit-indexing = { workspace = true } quickwit-ingest = { workspace = true } @@ -105,7 +104,6 @@ release-feature-set = [ "quickwit-storage/azure", "quickwit-storage/gcs", "quickwit-metastore/postgres", - "quickwit-doc-mapper/multilang", ] release-feature-vendored-set = [ "jemalloc", @@ -119,7 +117,6 @@ release-feature-vendored-set = [ "quickwit-storage/azure", "quickwit-storage/gcs", "quickwit-metastore/postgres", - "quickwit-doc-mapper/multilang", ] release-macos-feature-vendored-set = [ "jemalloc", @@ -132,13 +129,8 @@ release-macos-feature-vendored-set = [ "quickwit-storage/azure", "quickwit-storage/gcs", "quickwit-metastore/postgres", - "quickwit-doc-mapper/multilang", ] release-jemalloc-profiled = [ "release-feature-set", "jemalloc-profiled", ] - -[package.metadata.cargo-machete] -# used to enable the `multilang` feature -ignored = ["quickwit-doc-mapper"] diff --git a/quickwit/quickwit-doc-mapper/Cargo.toml b/quickwit/quickwit-doc-mapper/Cargo.toml index ae0239e53c5..92c977fe4da 100644 --- a/quickwit/quickwit-doc-mapper/Cargo.toml +++ b/quickwit/quickwit-doc-mapper/Cargo.toml @@ -42,10 +42,9 @@ serde_yaml = { workspace = true } time = { workspace = true } quickwit-common = { workspace = true, features = ["testsuite"] } -quickwit-query = { workspace = true, features = ["multilang"] } +quickwit-query = { workspace = true } [features] -multilang = ["quickwit-query/multilang"] testsuite = [] [[bench]] diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/field_mapping_entry.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/field_mapping_entry.rs index ae3388aee32..e69d337a616 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper/field_mapping_entry.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/field_mapping_entry.rs @@ -1152,7 +1152,7 @@ mod tests { "type": "text", "stored": true, "record": "basic", - "tokenizer": "en_stem" + "tokenizer": "lowercase" } "#, )?; @@ -1161,7 +1161,7 @@ mod tests { FieldMappingType::Text(options, _) => { assert_eq!(options.stored, true); let indexing_options = options.indexing_options.unwrap(); - assert_eq!(indexing_options.tokenizer.name(), "en_stem"); + assert_eq!(indexing_options.tokenizer.name(), "lowercase"); assert_eq!(indexing_options.record, IndexRecordOption::Basic); } _ => panic!("wrong property type"), diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs index bed4b18b90f..749dde228a7 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs @@ -28,8 +28,6 @@ use std::ops::Bound; pub use doc_mapper_builder::DocMapperBuilder; pub use doc_mapper_impl::DocMapper; -#[cfg(all(test, feature = "multilang"))] -pub(crate) use field_mapping_entry::TextIndexingOptions; pub use field_mapping_entry::{ BinaryFormat, FastFieldOptions, FieldMappingEntry, QuickwitBytesOptions, QuickwitJsonOptions, QuickwitTextNormalizer, @@ -812,55 +810,4 @@ mod tests { warmup_info.simplify(); assert_eq!(warmup_info, expected); } - - #[test] - #[cfg(feature = "multilang")] - fn test_doc_mapper_query_with_multilang_field() { - use quickwit_query::query_ast::TermQuery; - use tantivy::schema::IndexRecordOption; - - use crate::doc_mapper::{ - QuickwitTextOptions, QuickwitTextTokenizer, TextIndexingOptions, TokenizerType, - }; - use crate::{TokenizerConfig, TokenizerEntry}; - let mut doc_mapper_builder = DocMapperBuilder::default(); - doc_mapper_builder - .doc_mapping - .field_mappings - .push(FieldMappingEntry { - name: "multilang".to_string(), - mapping_type: FieldMappingType::Text( - QuickwitTextOptions { - indexing_options: Some(TextIndexingOptions { - tokenizer: QuickwitTextTokenizer::from_static("multilang"), - record: IndexRecordOption::Basic, - fieldnorms: false, - }), - ..Default::default() - }, - Cardinality::SingleValued, - ), - }); - doc_mapper_builder - .doc_mapping - .tokenizers - .push(TokenizerEntry { - name: "multilang".to_string(), - config: TokenizerConfig { - tokenizer_type: TokenizerType::Multilang, - filters: Vec::new(), - }, - }); - let doc_mapper = doc_mapper_builder.try_build().unwrap(); - let schema = doc_mapper.schema(); - let query_ast = quickwit_query::query_ast::QueryAst::Term(TermQuery { - field: "multilang".to_string(), - value: "JPN:す".to_string(), - }); - let (query, _) = doc_mapper.query(schema, query_ast, false, None).unwrap(); - assert_eq!( - format!("{query:?}"), - r#"TermQuery(Term(field=2, type=Str, "JPN:す"))"# - ); - } } diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/tokenizer_entry.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/tokenizer_entry.rs index b9793dc9548..0488d118c9f 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper/tokenizer_entry.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/tokenizer_entry.rs @@ -44,10 +44,6 @@ impl TokenizerConfig { pub fn text_analyzer(&self) -> anyhow::Result { let mut text_analyzer_builder = match &self.tokenizer_type { TokenizerType::Simple => TextAnalyzer::builder(SimpleTokenizer::default()).dynamic(), - #[cfg(any(test, feature = "multilang"))] - TokenizerType::Multilang => { - TextAnalyzer::builder(quickwit_query::MultiLangTokenizer::default()).dynamic() - } TokenizerType::SourceCode => TextAnalyzer::builder(CodeTokenizer::default()).dynamic(), TokenizerType::Ngram(options) => { let tokenizer = @@ -120,8 +116,6 @@ impl TokenFilterType { #[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize, utoipa::ToSchema)] #[serde(tag = "type", rename_all = "snake_case")] pub enum TokenizerType { - #[cfg(any(test, feature = "multilang"))] - Multilang, Ngram(NgramTokenizerOption), Regex(RegexTokenizerOption), Simple, diff --git a/quickwit/quickwit-query/Cargo.toml b/quickwit/quickwit-query/Cargo.toml index 066c00c0ff7..f24d8662715 100644 --- a/quickwit/quickwit-query/Cargo.toml +++ b/quickwit/quickwit-query/Cargo.toml @@ -15,9 +15,6 @@ anyhow = { workspace = true } base64 = { workspace = true } bitpacking = { workspace = true } hex = { workspace = true } -lindera-core = { workspace = true, optional = true } -lindera-dictionary = { workspace = true, optional = true } -lindera-tokenizer = { workspace = true, optional = true } once_cell = { workspace = true } regex = { workspace = true } serde = { workspace = true } @@ -29,7 +26,6 @@ tracing = { workspace = true } time = { workspace = true } thiserror = { workspace = true } rustc-hash = { workspace = true } -whichlang = { workspace = true, optional = true } quickwit-common = { workspace = true } quickwit-datetime = { workspace = true } @@ -42,19 +38,6 @@ time = { workspace = true } quickwit-common = { workspace = true, features = ["testsuite"] } -[features] -multilang = [ - "lindera-core", - "lindera-dictionary", - "lindera-tokenizer", - "whichlang", - "tantivy/stemmer", -] - [[bench]] name = "tokenizers_bench" harness = false - -[[bench]] -name = "multilang_tokenizers_bench" -harness = false diff --git a/quickwit/quickwit-query/benches/multilang_tokenizers_bench.rs b/quickwit/quickwit-query/benches/multilang_tokenizers_bench.rs deleted file mode 100644 index 61755dea556..00000000000 --- a/quickwit/quickwit-query/benches/multilang_tokenizers_bench.rs +++ /dev/null @@ -1,167 +0,0 @@ -// Copyright 2021-Present Datadog, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use criterion::{Criterion, Throughput, black_box, criterion_group, criterion_main}; -use quickwit_query::create_default_quickwit_tokenizer_manager; -use tantivy::tokenizer::{TextAnalyzer, Token, TokenStream}; - -// A random ascii string of length 100 chars. -const ASCII_SHORT: &str = "It is a long established fact"; -static ASCII_LONG: &str = r#"It is a long established fact that a reader will be distracted by the readable content of a - page when looking at its layout. The point of using Lorem Ipsum is that it has a - more-or-less normal distribution of letters, as opposed to using 'Content here, content - here', making it look like readable English. Many desktop publishing packages and web page - editors now use Lorem Ipsum as their default model text, and a search for 'lorem ipsum' will - uncover many web sites still in their infancy. Various versions have evolved over the years, - sometimes by accident, sometimes on purpose (injected humour and the like)."#; -const JPN_SHORT: &str = "日本ごです。 とても素敵な言葉ですね"; -const JPN_LONG: &str = r#"日本ごです。 和名の由来は、 - 太陽の動きにつれてその方向を追うように花が回るといわれたことから。 - ただしこの動きは生長に伴うものであるため、 - 実際に太陽を追って動くのは生長が盛んな若い時期だけである。 - 若いヒマワリの茎の上部の葉は太陽に正対になるように動き、 - 朝には東を向いていたのが夕方には西を向く。日没後はまもなく起きあがり、 - 夜明け前にはふたたび東に向く。この運動はつぼみを付ける頃まで続くが、 - つぼみが大きくなり花が開く素敵な言葉ですね."#; -const CMN_SHORT: &str = "滚滚长江东逝水,浪花淘尽英雄。"; -const CMN_LONG: &str = r#"滚滚长江东逝水,浪花淘尽英雄。是非成败转头空,青山依旧在,几度夕阳红。 - 白发渔樵江渚上,惯看秋月春风。一壶浊酒喜相逢,古今多少事,都付笑谈中。 - 是非成败转头空,青山依旧在,惯看秋月春风。一壶浊酒喜相逢,古今多少事, - 滚滚长江东逝水,浪花淘尽英雄。 几度夕阳红。白发渔樵江渚上,都付笑谈中。"#; -const KOR_SHORT: &str = "안녕하세요. 반갑습니다."; -const KOR_LONG: &str = r#" -포근히 내려오는 눈밭속에서는 -낯이 붉은 處女아이들도 깃들이어 오는 소리… -울고 -웃고 -수구리고 -새파라니 얼어서 -運命들이 모두다 안끼어 드는 소리… -큰놈에겐 큰 눈물자국, 작은놈에겐 작은 웃음 흔적 -큰이얘기 작은이얘기들이 오부록이 도란 그리며 안끼어 오는 소리 -끊임없이 내리는 눈발 속에서는 -山도 山도 靑山도 안끼어 드는 소리 -"#; - -fn process_tokens(analyzer: &mut TextAnalyzer, text: &str) -> Vec { - let mut token_stream = analyzer.token_stream(text); - let mut tokens: Vec = Vec::new(); - token_stream.process(&mut |token: &Token| tokens.push(token.clone())); - tokens -} - -pub fn tokenizers_throughput_benchmark(c: &mut Criterion) { - let mut group = c.benchmark_group("multilang"); - let tokenizer_manager = create_default_quickwit_tokenizer_manager(); - let mut default_tokenizer = tokenizer_manager.get_tokenizer("default").unwrap(); - let mut multilang_tokenizer = tokenizer_manager.get_tokenizer("multilang").unwrap(); - let mut chinese_tokenizer = tokenizer_manager - .get_tokenizer("chinese_compatible") - .unwrap(); - - group - .throughput(Throughput::Bytes(ASCII_SHORT.len() as u64)) - .bench_with_input("default-tokenize-short", ASCII_SHORT, |b, text| { - b.iter(|| process_tokens(&mut default_tokenizer, black_box(text))); - }); - group - .throughput(Throughput::Bytes(ASCII_LONG.len() as u64)) - .bench_with_input("default-tokenize-long", ASCII_LONG, |b, text| { - b.iter(|| process_tokens(&mut default_tokenizer, black_box(text))); - }); - group - .throughput(Throughput::Bytes(ASCII_SHORT.len() as u64)) - .bench_with_input("multilang-eng-tokenize-short", ASCII_SHORT, |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }); - group - .throughput(Throughput::Bytes(ASCII_LONG.len() as u64)) - .bench_with_input("multilang-eng-tokenize-long", ASCII_LONG, |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }); - let short_with_prefix = "ENG:".to_string() + ASCII_SHORT; - group - .throughput(Throughput::Bytes(ASCII_SHORT.len() as u64)) - .bench_with_input( - "multilang-tokenize-short-with-prefix", - &short_with_prefix, - |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }, - ); - let long_with_prefix = "ENG:".to_string() + ASCII_LONG; - group - .throughput(Throughput::Bytes(ASCII_LONG.len() as u64)) - .bench_with_input( - "multilang-tokenize-long-with-prefix", - &long_with_prefix, - |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }, - ); - group - .throughput(Throughput::Bytes(JPN_SHORT.len() as u64)) - .bench_with_input("multilang-tokenize-jpn-short", JPN_SHORT, |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }); - group - .throughput(Throughput::Bytes(JPN_LONG.len() as u64)) - .bench_with_input("multilang-tokenize-jpn-long", JPN_LONG, |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }); - group - .throughput(Throughput::Bytes(CMN_SHORT.len() as u64)) - .bench_with_input("multilang-tokenize-cmn-short", CMN_SHORT, |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }); - group - .throughput(Throughput::Bytes(CMN_LONG.len() as u64)) - .bench_with_input("multilang-tokenize-cmn-long", CMN_LONG, |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }); - group - .throughput(Throughput::Bytes(KOR_SHORT.len() as u64)) - .bench_with_input("multilang-tokenize-kor-short", KOR_SHORT, |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }); - group - .throughput(Throughput::Bytes(KOR_LONG.len() as u64)) - .bench_with_input("multilang-tokenize-kor-long", KOR_LONG, |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }); - group - .throughput(Throughput::Bytes(CMN_SHORT.len() as u64)) - .bench_with_input( - "chinese-compatible-tokenize-cmn-short", - CMN_SHORT, - |b, text| { - b.iter(|| process_tokens(&mut chinese_tokenizer, black_box(text))); - }, - ); - group - .throughput(Throughput::Bytes(CMN_LONG.len() as u64)) - .bench_with_input( - "chinese-compatible-tokenize-cmn-long", - CMN_LONG, - |b, text| { - b.iter(|| process_tokens(&mut chinese_tokenizer, black_box(text))); - }, - ); -} - -criterion_group!( - tokenizers_throughput_benches, - tokenizers_throughput_benchmark -); -criterion_main!(tokenizers_throughput_benches); diff --git a/quickwit/quickwit-query/src/lib.rs b/quickwit/quickwit-query/src/lib.rs index b2040f73daa..8f70e155933 100644 --- a/quickwit/quickwit-query/src/lib.rs +++ b/quickwit/quickwit-query/src/lib.rs @@ -38,8 +38,6 @@ pub(crate) use not_nan_f32::NotNaNf32; pub use query_ast::utils::find_field_or_hit_dynamic; use serde::{Deserialize, Serialize}; pub use tantivy::query::Query as TantivyQuery; -#[cfg(feature = "multilang")] -pub use tokenizers::MultiLangTokenizer; pub use tokenizers::{ CodeTokenizer, DEFAULT_REMOVE_TOKEN_LENGTH, create_default_quickwit_tokenizer_manager, get_quickwit_fastfield_normalizer_manager, diff --git a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs index 84176f4a4aa..7b24a66163d 100644 --- a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs +++ b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs @@ -247,7 +247,6 @@ mod tests { "raw_lowercase", "lowercase", "default", - "en_stem", "chinese_compatible", "source_code_default", "source_code_with_hex", @@ -290,7 +289,6 @@ mod tests { "raw_lowercase", "lowercase", "default", - "en_stem", "chinese_compatible", "source_code_default", "source_code_with_hex", @@ -335,7 +333,6 @@ mod tests { "raw_lowercase", "lowercase", "default", - "en_stem", "chinese_compatible", "source_code_default", "source_code_with_hex", @@ -398,7 +395,6 @@ mod tests { "raw_lowercase", "lowercase", "default", - "en_stem", "chinese_compatible", "source_code_default", "source_code_with_hex", diff --git a/quickwit/quickwit-query/src/tokenizers/mod.rs b/quickwit/quickwit-query/src/tokenizers/mod.rs index d086c36a977..5a90715075e 100644 --- a/quickwit/quickwit-query/src/tokenizers/mod.rs +++ b/quickwit/quickwit-query/src/tokenizers/mod.rs @@ -14,8 +14,6 @@ mod chinese_compatible; mod code_tokenizer; -#[cfg(feature = "multilang")] -mod multilang; mod tokenizer_manager; use once_cell::sync::Lazy; @@ -26,8 +24,6 @@ use tantivy::tokenizer::{ use self::chinese_compatible::ChineseTokenizer; pub use self::code_tokenizer::CodeTokenizer; -#[cfg(feature = "multilang")] -pub use self::multilang::MultiLangTokenizer; pub use self::tokenizer_manager::{RAW_TOKENIZER_NAME, TokenizerManager}; pub const DEFAULT_REMOVE_TOKEN_LENGTH: usize = 255; @@ -58,17 +54,6 @@ pub fn create_default_quickwit_tokenizer_manager() -> TokenizerManager { .filter(LowerCaser) .build(); tokenizer_manager.register("default", default_tokenizer, true); - #[cfg(feature = "multilang")] - { - let en_stem_tokenizer = TextAnalyzer::builder(SimpleTokenizer::default()) - .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH)) - .filter(LowerCaser) - .filter(tantivy::tokenizer::Stemmer::new( - tantivy::tokenizer::Language::English, - )) - .build(); - tokenizer_manager.register("en_stem", en_stem_tokenizer, true); - } tokenizer_manager.register("whitespace", WhitespaceTokenizer::default(), false); let chinese_tokenizer = TextAnalyzer::builder(ChineseTokenizer) @@ -94,15 +79,6 @@ pub fn create_default_quickwit_tokenizer_manager() -> TokenizerManager { .build(), true, ); - #[cfg(feature = "multilang")] - tokenizer_manager.register( - "multilang_default", - TextAnalyzer::builder(MultiLangTokenizer::default()) - .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH)) - .filter(LowerCaser) - .build(), - true, - ); tokenizer_manager } diff --git a/quickwit/quickwit-query/src/tokenizers/multilang.rs b/quickwit/quickwit-query/src/tokenizers/multilang.rs deleted file mode 100644 index a62d2ff151c..00000000000 --- a/quickwit/quickwit-query/src/tokenizers/multilang.rs +++ /dev/null @@ -1,334 +0,0 @@ -// Copyright 2021-Present Datadog, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use lindera_core::mode::Mode; -use lindera_dictionary::{DictionaryConfig, DictionaryKind, load_dictionary_from_config}; -use lindera_tokenizer::token::Token as LinderaToken; -use lindera_tokenizer::tokenizer::Tokenizer as LinderaTokenizer; -use once_cell::sync::Lazy; -use tantivy::tokenizer::{SimpleTokenStream, SimpleTokenizer, Token, TokenStream, Tokenizer}; -use whichlang::{Lang, detect_language}; - -// Note(fmassot): we use `lindera_tokenizer::tokenizer::Tokenizer` and not -// `use lindera_tantivy::tokenizer::LinderaTokenizer` to avoid -// costly copy of lindera dictionaries each time we clone the `MultiLangTokenizer`. - -/// Mandarin chinese tokenizer. -static CMN_TOKENIZER: Lazy = Lazy::new(|| { - let cmn_dictionary_config = DictionaryConfig { - kind: Some(DictionaryKind::CcCedict), - path: None, - }; - let cmn_dictionary = load_dictionary_from_config(cmn_dictionary_config) - .expect("Lindera `CcCedict` dictionary must be present"); - LinderaTokenizer::new(cmn_dictionary, None, Mode::Normal) -}); - -/// Japanese tokenizer. -static JPN_TOKENIZER: Lazy = Lazy::new(|| { - let jpn_dictionary_config = DictionaryConfig { - kind: Some(DictionaryKind::IPADIC), - path: None, - }; - let jpn_dictionary = load_dictionary_from_config(jpn_dictionary_config) - .expect("Lindera `IPADIC` dictionary must be present"); - LinderaTokenizer::new(jpn_dictionary, None, Mode::Normal) -}); - -/// Korean tokenizer. -static KOR_TOKENIZER: Lazy = Lazy::new(|| { - let kor_dictionary_config = DictionaryConfig { - kind: Some(DictionaryKind::KoDic), - path: None, - }; - let kor_dictionary = load_dictionary_from_config(kor_dictionary_config) - .expect("Lindera `KoDic` dictionary must be present"); - LinderaTokenizer::new(kor_dictionary, None, Mode::Normal) -}); - -/// Multilanguage tokenizer that uses the `whichlang` to detect the language of the text -/// and uses the appropriate tokenizer for the detected language: -/// - lindera for Chinese, Japanese, and Korean. -/// - Quickwit's default tokenizer for other languages. -/// -/// It is possible to bypass the language detection by prefixing the text with the language code -/// followed by a colon. For example, `KOR:일본입니다` will be tokenized by the korean tokenizer. -/// Current supported prefix are: -/// - `KOR:` for Korean tokenizer -/// - `JPN:` for Japanese tokenizer -/// - `CMN:` for Chinese tokenizer -/// - `ENG:` for Quickwit's default tokenizer -#[derive(Clone, Default)] -pub struct MultiLangTokenizer { - default_tokenizer: SimpleTokenizer, - token: Token, -} - -impl Tokenizer for MultiLangTokenizer { - type TokenStream<'a> = MultiLanguageTokenStream<'a>; - fn token_stream<'a>(&'a mut self, text: &'a str) -> MultiLanguageTokenStream<'a> { - self.token.reset(); - let (language_prefix, text_to_tokenize) = get_language_from_prefix(text); - // If the text is empty, we return an empty token stream. - // `whichlang::detect_language` panicks if the text is empty. - if text.trim().is_empty() { - return MultiLanguageTokenStream::Empty; - } - let language = language_prefix.unwrap_or_else(|| detect_language(text_to_tokenize)); - match language { - Lang::Cmn => { - let lindera_token_stream = LinderaTokenStream { - tokens: CMN_TOKENIZER - .tokenize(text_to_tokenize) - .expect("tokenize method should never fail"), - token: &mut self.token, - }; - MultiLanguageTokenStream::Lindera(lindera_token_stream) - } - Lang::Jpn => { - let lindera_token_stream = LinderaTokenStream { - tokens: JPN_TOKENIZER - .tokenize(text_to_tokenize) - .expect("tokenize method should never fail"), - token: &mut self.token, - }; - MultiLanguageTokenStream::Lindera(lindera_token_stream) - } - Lang::Kor => { - let lindera_token_stream = LinderaTokenStream { - tokens: KOR_TOKENIZER - .tokenize(text_to_tokenize) - .expect("tokenize method should never fail"), - token: &mut self.token, - }; - MultiLanguageTokenStream::Lindera(lindera_token_stream) - } - _ => MultiLanguageTokenStream::Simple( - self.default_tokenizer.token_stream(text_to_tokenize), - ), - } - } -} - -/// Gets the language defined by a prefix `{ID}:text` where ID being the 3-letter language used by -/// whichlang) and returns the language and the text without the prefix. If the prefix is not -/// recognized, the language is `None` and the text is the original. -fn get_language_from_prefix(text: &str) -> (Option, &str) { - let prefix_bytes = &text.as_bytes()[0..std::cmp::min(4, text.len())]; - // TODO: refactor. - let prefix_language = match prefix_bytes { - b"CMN:" => Some(Lang::Cmn), - b"ENG:" => Some(Lang::Eng), - b"JPN:" => Some(Lang::Jpn), - b"KOR:" => Some(Lang::Kor), - _ => None, - }; - let text_without_prefix = if prefix_language.is_some() { - // This is safe as we know that the prefix is made of 4 ascii characters. - &text[4..] - } else { - text - }; - (prefix_language, text_without_prefix) -} -pub enum MultiLanguageTokenStream<'a> { - Empty, - Lindera(LinderaTokenStream<'a>), - Simple(SimpleTokenStream<'a>), -} - -impl TokenStream for MultiLanguageTokenStream<'_> { - fn advance(&mut self) -> bool { - match self { - MultiLanguageTokenStream::Empty => false, - MultiLanguageTokenStream::Lindera(tokenizer) => tokenizer.advance(), - MultiLanguageTokenStream::Simple(tokenizer) => tokenizer.advance(), - } - } - - fn token(&self) -> &Token { - match self { - MultiLanguageTokenStream::Empty => { - panic!("Cannot call token() on an empty token stream.") - } - MultiLanguageTokenStream::Lindera(tokenizer) => tokenizer.token(), - MultiLanguageTokenStream::Simple(tokenizer) => tokenizer.token(), - } - } - - fn token_mut(&mut self) -> &mut Token { - match self { - MultiLanguageTokenStream::Empty => { - panic!("Cannot call token_mut() on an empty token stream.") - } - MultiLanguageTokenStream::Lindera(tokenizer) => tokenizer.token_mut(), - MultiLanguageTokenStream::Simple(tokenizer) => tokenizer.token_mut(), - } - } -} - -pub struct LinderaTokenStream<'a> { - pub tokens: Vec>, - pub token: &'a mut Token, -} - -impl TokenStream for LinderaTokenStream<'_> { - fn advance(&mut self) -> bool { - if self.tokens.is_empty() { - return false; - } - let token = self.tokens.remove(0); - self.token.text = token.text.to_string(); - self.token.offset_from = token.byte_start; - self.token.offset_to = token.byte_end; - self.token.position = token.position; - self.token.position_length = token.position_length; - - true - } - - fn token(&self) -> &Token { - self.token - } - - fn token_mut(&mut self) -> &mut Token { - self.token - } -} - -#[cfg(test)] -mod tests { - use tantivy::tokenizer::{Token, TokenStream, Tokenizer}; - - use super::{MultiLangTokenizer, MultiLanguageTokenStream, get_language_from_prefix}; - - fn test_helper(mut tokenizer: MultiLanguageTokenStream) -> Vec { - let mut tokens: Vec = Vec::new(); - tokenizer.process(&mut |token: &Token| tokens.push(token.clone())); - tokens - } - - #[test] - fn test_multilanguage_tokenizer_cmn() { - let mut tokenizer = MultiLangTokenizer::default(); - let tokens = test_helper( - tokenizer.token_stream("地址1,包含無效的字元 (包括符號與不標準的asci阿爾發字元"), - ); - assert_eq!(tokens.len(), 19); - { - let token = &tokens[0]; - assert_eq!(token.text, "地址"); - assert_eq!(token.offset_from, 0); - assert_eq!(token.offset_to, 6); - assert_eq!(token.position, 0); - assert_eq!(token.position_length, 1); - } - } - - #[test] - fn test_multilanguage_tokenizer_jpn() { - let mut tokenizer = MultiLangTokenizer::default(); - { - let tokens = test_helper(tokenizer.token_stream("すもももももももものうち")); - assert_eq!(tokens.len(), 7); - { - let token = &tokens[0]; - assert_eq!(token.text, "すもも"); - assert_eq!(token.offset_from, 0); - assert_eq!(token.offset_to, 9); - assert_eq!(token.position, 0); - assert_eq!(token.position_length, 1); - } - } - { - // Force usage of JPN tokenizer. - let tokens = test_helper(tokenizer.token_stream("JPN:すもももももももものうち")); - assert_eq!(tokens.len(), 7); - } - { - // Force usage of ENG tokenizer. - // This tokenizer will return only one token. - let tokens = test_helper(tokenizer.token_stream("ENG:すもももももももものうち")); - assert_eq!(tokens.len(), 1); - } - } - - #[test] - fn test_multilanguage_tokenizer_kor() { - let mut tokenizer = MultiLangTokenizer::default(); - { - let tokens = test_helper(tokenizer.token_stream("일본입니다. 매우 멋진 단어입니다.")); - assert_eq!(tokens.len(), 11); - { - let token = &tokens[0]; - assert_eq!(token.text, "일본"); - assert_eq!(token.offset_from, 0); - assert_eq!(token.offset_to, 6); - assert_eq!(token.position, 0); - assert_eq!(token.position_length, 1); - } - } - { - let tokens = - test_helper(tokenizer.token_stream("KOR:일본입니다. 매우 멋진 단어입니다.")); - assert_eq!(tokens.len(), 11); - } - { - let tokens = test_helper(tokenizer.token_stream("ENG:일본입니다")); - assert_eq!(tokens.len(), 1); - } - } - - #[test] - fn test_multilanguage_tokenizer_with_empty_string() { - let mut tokenizer = MultiLangTokenizer::default(); - { - let tokens = test_helper(tokenizer.token_stream("")); - assert_eq!(tokens.len(), 0); - } - { - let tokens = test_helper(tokenizer.token_stream(" ")); - assert_eq!(tokens.len(), 0); - } - } - - #[test] - fn test_multilanguage_process_language_prefix() { - { - let (lang, text) = get_language_from_prefix("JPN:すもももももももものうち"); - assert_eq!(lang, Some(whichlang::Lang::Jpn)); - assert_eq!(text, "すもももももももものうち"); - } - { - let (lang, text) = get_language_from_prefix("CMN:地址1,包含無效的字元"); - assert_eq!(lang, Some(whichlang::Lang::Cmn)); - assert_eq!(text, "地址1,包含無效的字元"); - } - { - let (lang, text) = get_language_from_prefix("ENG:my address"); - assert_eq!(lang, Some(whichlang::Lang::Eng)); - assert_eq!(text, "my address"); - } - { - let (lang, text) = get_language_from_prefix("UNK:my address"); - assert!(lang.is_none()); - assert_eq!(text, "UNK:my address"); - } - { - let (lang, text) = get_language_from_prefix(""); - assert!(lang.is_none()); - assert_eq!(text, ""); - } - } -}