diff --git a/docs/src/format/index/scalar/fts.md b/docs/src/format/index/scalar/fts.md index 792702aca08..c051f8d4bd5 100644 --- a/docs/src/format/index/scalar/fts.md +++ b/docs/src/format/index/scalar/fts.md @@ -56,7 +56,7 @@ The metadata file contains JSON-serialized configuration and partition informati | Field | Type | Default | Description | |---------------------|---------|-----------|----------------------------------------------------------------| -| `base_tokenizer` | String | "simple" | Base tokenizer type (see Tokenizers section) | +| `base_tokenizer` | String | "icu" | Base tokenizer type (see Tokenizers section) | | `language` | String | "English" | Language for stemming and stop words | | `with_position` | Boolean | false | Store term positions for phrase queries (increases index size) | | `max_token_length` | UInt32? | None | Maximum token length (tokens longer than this are removed) | @@ -76,17 +76,17 @@ The full text search index supports multiple tokenizer types for different text | Tokenizer | Description | Use Case | |----------------|---------------------------------------------------------------------------|------------------------| -| **simple** | Splits on whitespace and punctuation, removes non-alphanumeric characters | General text (default) | +| **icu** | ICU dictionary-based Unicode word segmentation | Mixed-language text (default) | +| **simple** | Splits on whitespace and punctuation, removes non-alphanumeric characters | General ASCII-oriented text | | **whitespace** | Splits only on whitespace characters | Preserve punctuation | | **raw** | No tokenization, treats entire text as single token | Exact matching | | **ngram** | Breaks text into overlapping character sequences | Substring/fuzzy search | -| **icu** | ICU dictionary-based Unicode word segmentation | Mixed-language text | | **jieba/*** | Chinese text tokenizer with word segmentation | Chinese text | | **lindera/*** | Japanese text tokenizer with morphological analysis | Japanese text | #### ICU Tokenizer (Mixed-language text) -The ICU tokenizer uses Unicode word boundary rules and dictionary-based segmentation for complex scripts. It is useful for mixed-language text where the default `simple` tokenizer would keep an unspaced CJK span as one large token. +The ICU tokenizer uses Unicode word boundary rules and dictionary-based segmentation for complex scripts. It is the default tokenizer because it handles mixed-language text where the `simple` tokenizer would keep an unspaced CJK span as one large token. - **Models**: Uses compiled ICU4X segmenter data bundled with Lance - **Usage**: Specify as `icu` diff --git a/docs/src/guide/json.md b/docs/src/guide/json.md index 7246c8fe08a..60d63172676 100644 --- a/docs/src/guide/json.md +++ b/docs/src/guide/json.md @@ -292,7 +292,7 @@ on a single path, create an `INVERTED` index on the JSON column. dataset.create_scalar_index( "data", index_type="INVERTED", - base_tokenizer="simple", + base_tokenizer="icu", lower_case=True, stem=True, remove_stop_words=True, diff --git a/docs/src/quickstart/full-text-search.md b/docs/src/quickstart/full-text-search.md index 17327e40bc5..e008bf8f76f 100644 --- a/docs/src/quickstart/full-text-search.md +++ b/docs/src/quickstart/full-text-search.md @@ -90,7 +90,7 @@ ds.create_scalar_index( index_type="INVERTED", name="text_idx", # Optional index name (if omitted, default is "text_idx") with_position=False, # Set True to enable phrase queries (stores token positions) - base_tokenizer="simple", # Tokenizer: "simple" (whitespace+punct), "icu", "whitespace", or "raw" (no tokenization) + base_tokenizer="icu", # Tokenizer: "icu" (default), "simple" (whitespace+punct), "whitespace", or "raw" language="English", # Language used for stemming + stop words (only used if `stem` or `remove_stop_words` is True) max_token_length=40, # Drop tokens longer than this length lower_case=True, # Lowercase text before tokenization @@ -103,13 +103,13 @@ ds.create_scalar_index( ### Tokenizer Options +- **icu**: Unicode word segmentation with built-in ICU dictionaries (default) - **simple**: Splits tokens on whitespace and punctuation - **whitespace**: Splits tokens only on whitespace - **raw**: No tokenization (useful for exact matching) Lance also supports multilingual tokenization: -- **icu**: Unicode word segmentation with built-in ICU dictionaries - **jieba/default**: Chinese text tokenization using Jieba - **lindera/ipadic**: Japanese text tokenization using Lindera with IPAdic dictionary - **lindera/ko-dic**: Korean text tokenization using Lindera with Ko-dic dictionary diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index eeb2dacff6a..0a02dbeb5e4 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -3100,9 +3100,11 @@ def create_scalar_index( ``[1, num_compute_cpus]``. If unset, Lance uses ``num_compute_cpus`` workers unless ``LANCE_FTS_NUM_SHARDS`` is set. This parameter is only used for the current build and is not persisted with the index. - base_tokenizer: str, default "simple" + base_tokenizer: str, default "icu" This is for the ``INVERTED`` index. The base tokenizer to use. The value can be: + * "icu": Unicode word segmentation with dictionary support for CJK and + other scripts. * "simple": splits tokens on whitespace and punctuation. * "whitespace": splits tokens on whitespace. * "raw": no tokenization. diff --git a/python/python/tests/compat/test_scalar_indices.py b/python/python/tests/compat/test_scalar_indices.py index 35022df3b12..a01615abf53 100644 --- a/python/python/tests/compat/test_scalar_indices.py +++ b/python/python/tests/compat/test_scalar_indices.py @@ -320,7 +320,9 @@ def create(self): max_rows_per_file=100, data_storage_version=safe_data_storage_version(self.compat_version), ) - dataset.create_scalar_index("text", "INVERTED", with_position=True) + dataset.create_scalar_index( + "text", "INVERTED", with_position=True, base_tokenizer="simple" + ) def check_read(self): """Verify FTS index can be queried.""" diff --git a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py index 7b4dede319b..73b6680cfff 100644 --- a/python/python/tests/test_scalar_index.py +++ b/python/python/tests/test_scalar_index.py @@ -694,6 +694,27 @@ def test_unindexed_full_text_search_on_empty_index(tmp_path): assert results.num_rows == 1 +def test_default_fts_tokenizer_handles_unspaced_multilingual_text(tmp_path): + data = pa.table( + { + "id": [0, 1], + "text": ["Hello, こんにちは世界!", "Hello, こんにちは!"], + } + ) + ds = lance.write_dataset(data, tmp_path) + ds.create_scalar_index( + "text", + index_type="INVERTED", + stem=False, + remove_stop_words=False, + ascii_folding=False, + ) + + results = ds.to_table(full_text_query="世界") + + assert results["id"].to_pylist() == [0] + + def test_full_text_search_without_index(dataset): row = dataset.take(indices=[0], columns=["doc"]) query_text = row.column(0)[0].as_py() @@ -978,7 +999,7 @@ def test_fts_stats(dataset): params = stats["params"] assert params["with_position"] is False - assert params["base_tokenizer"] == "simple" + assert params["base_tokenizer"] == "icu" assert params["language"] == "English" assert params["max_token_length"] == 40 assert params["lower_case"] is True @@ -1448,7 +1469,7 @@ def test_fts_deleted_rows_with_stable_row_ids(tmp_path): # Regression test: stable-row-id prefiltering must not leak deleted rows. data = pa.table( { - "text": [f"dup_{i}" for i in range(200)], + "text": [f"dup {i}" for i in range(200)], "category": [["A", "B", "C", "D", "E"][i % 5] for i in range(200)], } ) @@ -4670,7 +4691,7 @@ def test_describe_indices(tmp_path, monkeypatch, fts_format_version): details = indices[0].details assert details is not None and len(details) > 0 assert details["lance_tokenizer"] is None - assert details["base_tokenizer"] == "simple" + assert details["base_tokenizer"] == "icu" assert details["language"] == "English" assert not details["with_position"] assert details["max_token_length"] == 40 diff --git a/python/uv.lock b/python/uv.lock index 3c339603d2e..463e0a7bd6c 100644 --- a/python/uv.lock +++ b/python/uv.lock @@ -1083,19 +1083,19 @@ wheels = [ [[package]] name = "lance-namespace" -version = "0.7.6" +version = "0.7.7" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "lance-namespace-urllib3-client" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/2b/da/134670003173881bed44af656badffd91e0b2e0232c083eeacc5923d7335/lance_namespace-0.7.6.tar.gz", hash = "sha256:4e12094005d105ef1b44346c9d7feda4a0f733b127dab90c1a5ffbf7cd433770", size = 10686, upload-time = "2026-05-05T18:26:38.885Z" } +sdist = { url = "https://files.pythonhosted.org/packages/06/5c/9822af615fc1bd3ee1073994696c739aecde377be32435ec3303aed1bc5d/lance_namespace-0.7.7.tar.gz", hash = "sha256:d00b525f2e26993a6c61668e798bca6c808605ab8a79f29f86a1a1af92d91ae2", size = 10754, upload-time = "2026-05-20T17:32:59.45Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/83/88/44463a5f41f7077b2ea641f2afded72eaceb6a6a1b4a55c11b22318fed74/lance_namespace-0.7.6-py3-none-any.whl", hash = "sha256:c94a1b8a6aab127e55a20cbf44d927ae3a9b7d435656d2130dccf84ccf7c9999", size = 12519, upload-time = "2026-05-05T18:26:36.425Z" }, + { url = "https://files.pythonhosted.org/packages/11/43/186acc1156da20c351db196e2b6241b2453b16dc1b4cc8e0a626667ca471/lance_namespace-0.7.7-py3-none-any.whl", hash = "sha256:477a7ca6b5e1f673a2c9ba52f42d6e8e3ff7c27a601392a21eb90fba98d0309b", size = 12581, upload-time = "2026-05-20T17:32:57.389Z" }, ] [[package]] name = "lance-namespace-urllib3-client" -version = "0.7.6" +version = "0.7.7" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "pydantic" }, @@ -1104,9 +1104,9 @@ dependencies = [ { name = "urllib3", version = "1.26.20", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, { name = "urllib3", version = "2.5.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/01/44/024aae184c08b3800482cd9b832d534249e25de145af732d4e4c8dff38a8/lance_namespace_urllib3_client-0.7.6.tar.gz", hash = "sha256:15ae7f0d8d56fa34d837f7f6ec5c80a327a905e89ccfed05f7b409d6fe704cdf", size = 195551, upload-time = "2026-05-05T18:26:37.808Z" } +sdist = { url = "https://files.pythonhosted.org/packages/07/95/38ab81ccc1e09beeecd8ddfc61b8bc73831dc5053db1e3f9021f64a4896b/lance_namespace_urllib3_client-0.7.7.tar.gz", hash = "sha256:4d8c066628c17c6a10cf643b51a7f7ae1bfb8a614d9cc54a5af38a4ba2b4b102", size = 202930, upload-time = "2026-05-20T17:32:58.308Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/00/50/60c983cc8180772c82370dfad2104b7e788aaacc3bf9a84e8b42bb1ae6a7/lance_namespace_urllib3_client-0.7.6-py3-none-any.whl", hash = "sha256:fb884d8afff8af3aae04a3270624694a189d7ea79225dd349e6c555a1a1d6b52", size = 324603, upload-time = "2026-05-05T18:26:39.718Z" }, + { url = "https://files.pythonhosted.org/packages/35/96/5483e48e40433b1d078183c15a92c99e59a156041b0260e7f18ee34e7c08/lance_namespace_urllib3_client-0.7.7-py3-none-any.whl", hash = "sha256:9221c3e00fd89f0c811953d94b32d2ea527765280460a174f5872dc8a74c0ed6", size = 334767, upload-time = "2026-05-20T17:32:55.883Z" }, ] [[package]] @@ -2676,7 +2676,7 @@ requires-dist = [ { name = "duckdb", marker = "extra == 'tests'" }, { name = "geoarrow-rust-core", marker = "extra == 'geo'" }, { name = "geoarrow-rust-io", marker = "extra == 'geo'" }, - { name = "lance-namespace", specifier = ">=0.7.5,<0.8" }, + { name = "lance-namespace", specifier = ">=0.7.7,<0.8" }, { name = "ml-dtypes", marker = "extra == 'tests'" }, { name = "numpy", specifier = ">=1.22" }, { name = "pandas", marker = "extra == 'tests'" }, diff --git a/rust/lance-index/Cargo.toml b/rust/lance-index/Cargo.toml index b587ee57fac..f43a53ca64b 100644 --- a/rust/lance-index/Cargo.toml +++ b/rust/lance-index/Cargo.toml @@ -87,6 +87,7 @@ rstest.workspace = true chrono.workspace = true [features] +default = ["tokenizer-icu"] geo = ["dep:lance-geo", "lance-geo/geo", "dep:geoarrow-array", "dep:geoarrow-schema", "dep:geo-types"] protoc = ["dep:protobuf-src"] jieba-rs = ["tokenizer-jieba"] diff --git a/rust/lance-index/src/scalar/inverted/index.rs b/rust/lance-index/src/scalar/inverted/index.rs index 152afc8bc0c..acd36d95277 100644 --- a/rust/lance-index/src/scalar/inverted/index.rs +++ b/rust/lance-index/src/scalar/inverted/index.rs @@ -1066,6 +1066,7 @@ impl ScalarIndex for InvertedIndex { fn derive_index_params(&self) -> Result { let mut params = self.params.clone(); if params.base_tokenizer.is_empty() { + // Empty tokenizer metadata only appears in legacy simple-tokenizer indexes. params.base_tokenizer = "simple".to_string(); } diff --git a/rust/lance-index/src/scalar/inverted/tokenizer.rs b/rust/lance-index/src/scalar/inverted/tokenizer.rs index ed0fd80638d..967398d7e5f 100644 --- a/rust/lance-index/src/scalar/inverted/tokenizer.rs +++ b/rust/lance-index/src/scalar/inverted/tokenizer.rs @@ -47,7 +47,7 @@ pub struct InvertedIndexParams { /// - `lindera/*`: Lindera tokenizer /// - `jieba/*`: Jieba tokenizer /// - /// `simple` is recommended for most cases and the default value + /// `icu` is recommended for most cases and is the default when the ICU tokenizer feature is enabled pub(crate) base_tokenizer: String, /// language for stemming and stop words @@ -154,7 +154,7 @@ impl TryFrom<&pbold::InvertedIndexDetails> for InvertedIndexParams { .base_tokenizer .as_ref() .cloned() - .unwrap_or(defaults.base_tokenizer), + .unwrap_or_else(|| "simple".to_string()), language: serde_json::from_str(details.language.as_str())?, with_position: details.with_position, max_token_length: details.max_token_length.map(|l| l as usize), @@ -186,7 +186,15 @@ fn default_max_ngram_length() -> u32 { impl Default for InvertedIndexParams { fn default() -> Self { - Self::new("simple".to_owned(), Language::English) + Self::new(default_base_tokenizer().to_owned(), Language::English) + } +} + +fn default_base_tokenizer() -> &'static str { + if cfg!(feature = "tokenizer-icu") { + "icu" + } else { + "simple" } } @@ -194,11 +202,11 @@ impl InvertedIndexParams { /// Create a new `InvertedIndexParams` with the given base tokenizer and language. /// /// The `base_tokenizer` can be one of the following: - /// - `simple`: splits tokens on whitespace and punctuation, default + /// - `icu`: ICU dictionary-based word segmentation, default when enabled + /// - `simple`: splits tokens on whitespace and punctuation /// - `whitespace`: splits tokens on whitespace /// - `raw`: no tokenization /// - `ngram`: N-Gram tokenizer - /// - `icu`: ICU dictionary-based word segmentation /// - `lindera/*`: Lindera tokenizer /// - `jieba/*`: Jieba tokenizer /// @@ -446,6 +454,29 @@ mod tests { #[cfg(feature = "tokenizer-icu")] use lance_tokenizer::TokenStream; + #[cfg(not(feature = "tokenizer-icu"))] + #[test] + fn test_default_uses_simple_without_icu_feature() { + assert_eq!(InvertedIndexParams::default().base_tokenizer, "simple"); + } + + #[cfg(feature = "tokenizer-icu")] + #[test] + fn test_default_uses_icu_tokenizer() { + assert_eq!(InvertedIndexParams::default().base_tokenizer, "icu"); + } + + #[test] + fn test_missing_details_base_tokenizer_uses_legacy_simple_default() { + let mut details = + crate::pbold::InvertedIndexDetails::try_from(&InvertedIndexParams::default()).unwrap(); + details.base_tokenizer = None; + + let params = InvertedIndexParams::try_from(&details).unwrap(); + + assert_eq!(params.base_tokenizer, "simple"); + } + #[test] fn test_build_only_fields_are_not_serialized() { let params = InvertedIndexParams::default() @@ -498,7 +529,6 @@ mod tests { #[test] fn test_build_icu_tokenizer() { let mut tokenizer = InvertedIndexParams::default() - .base_tokenizer("icu".to_string()) .stem(false) .remove_stop_words(false) .build() diff --git a/rust/lance/src/dataset/tests/dataset_index.rs b/rust/lance/src/dataset/tests/dataset_index.rs index e785de7bee4..a45de821f65 100644 --- a/rust/lance/src/dataset/tests/dataset_index.rs +++ b/rust/lance/src/dataset/tests/dataset_index.rs @@ -959,7 +959,7 @@ async fn test_fts_unindexed_data_with_stop_words() { .unwrap(); // Append unindexed rows with a term not in the index - let unindexed: Vec = (0..10).map(|i| format!("hello_{i}")).collect(); + let unindexed: Vec = (0..10).map(|i| format!("hello {i}")).collect(); let text_col = StringArray::from(unindexed); let batch = RecordBatch::try_new( arrow_schema::Schema::new(vec![Field::new("text", DataType::Utf8, false)]).into(), @@ -3176,6 +3176,7 @@ async fn test_sql_contains_tokens() { IndexType::Inverted, None, &InvertedIndexParams::default() + .base_tokenizer("simple".to_string()) .max_token_length(None) .stem(false), true, diff --git a/rust/lance/src/io/exec/fts.rs b/rust/lance/src/io/exec/fts.rs index c55a357fa0f..96129ea09fa 100644 --- a/rust/lance/src/io/exec/fts.rs +++ b/rust/lance/src/io/exec/fts.rs @@ -38,6 +38,7 @@ use super::utils::{IndexMetrics, InstrumentedChildInputStream, build_prefilter}; use crate::index::scalar::inverted::{load_segment_details, load_segments}; use crate::{Dataset, index::DatasetIndexInternalExt}; use lance_index::metrics::MetricsCollector; +use lance_index::scalar::InvertedIndexParams; use lance_index::scalar::inverted::builder::ScoredDoc; use lance_index::scalar::inverted::builder::document_input; use lance_index::scalar::inverted::document_tokenizer::{DocType, JsonTokenizer, LanceTokenizer}; @@ -148,11 +149,16 @@ async fn search_segments( .unzip()) } -/// Fall back to the default simple tokenizer when no on-disk FTS segment exists. +/// Fall back to the default base tokenizer when no on-disk FTS segment exists. fn default_text_tokenizer() -> Box { - Box::new(TextTokenizer::new( - TextAnalyzer::builder(SimpleTokenizer::default()).build(), - )) + InvertedIndexParams::default() + .max_token_length(None) + .lower_case(false) + .stem(false) + .remove_stop_words(false) + .ascii_folding(false) + .build() + .expect("default FTS tokenizer should build") } pub struct FtsIndexMetrics {