lance-format · Xuanwo · May 27, 2026
diff --git a/docs/src/format/index/scalar/fts.md b/docs/src/format/index/scalar/fts.md
@@ -56,7 +56,7 @@ The metadata file contains JSON-serialized configuration and partition informati
 
 | Field               | Type    | Default   | Description                                                    |
 |---------------------|---------|-----------|----------------------------------------------------------------|
-| `base_tokenizer`    | String  | "simple"  | Base tokenizer type (see Tokenizers section)                   |
+| `base_tokenizer`    | String  | "icu"     | Base tokenizer type (see Tokenizers section)                   |
 | `language`          | String  | "English" | Language for stemming and stop words                           |
 | `with_position`     | Boolean | false     | Store term positions for phrase queries (increases index size) |
 | `max_token_length`  | UInt32? | None      | Maximum token length (tokens longer than this are removed)     |
@@ -76,17 +76,17 @@ The full text search index supports multiple tokenizer types for different text
 
 | Tokenizer      | Description                                                               | Use Case               |
 |----------------|---------------------------------------------------------------------------|------------------------|
-| **simple**     | Splits on whitespace and punctuation, removes non-alphanumeric characters | General text (default) |
+| **icu**        | ICU dictionary-based Unicode word segmentation                            | Mixed-language text (default) |
+| **simple**     | Splits on whitespace and punctuation, removes non-alphanumeric characters | General ASCII-oriented text |
 | **whitespace** | Splits only on whitespace characters                                      | Preserve punctuation   |
 | **raw**        | No tokenization, treats entire text as single token                       | Exact matching         |
 | **ngram**      | Breaks text into overlapping character sequences                          | Substring/fuzzy search |
-| **icu**        | ICU dictionary-based Unicode word segmentation                            | Mixed-language text    |
 | **jieba/***    | Chinese text tokenizer with word segmentation                             | Chinese text           |
 | **lindera/***  | Japanese text tokenizer with morphological analysis                       | Japanese text          |
 
 #### ICU Tokenizer (Mixed-language text)
 
-The ICU tokenizer uses Unicode word boundary rules and dictionary-based segmentation for complex scripts. It is useful for mixed-language text where the default `simple` tokenizer would keep an unspaced CJK span as one large token.
+The ICU tokenizer uses Unicode word boundary rules and dictionary-based segmentation for complex scripts. It is the default tokenizer because it handles mixed-language text where the `simple` tokenizer would keep an unspaced CJK span as one large token.
 
 - **Models**: Uses compiled ICU4X segmenter data bundled with Lance
 - **Usage**: Specify as `icu`

diff --git a/docs/src/guide/json.md b/docs/src/guide/json.md
@@ -292,7 +292,7 @@ on a single path, create an `INVERTED` index on the JSON column.
 dataset.create_scalar_index(
     "data",
     index_type="INVERTED",
-    base_tokenizer="simple",
+    base_tokenizer="icu",
     lower_case=True,
     stem=True,
     remove_stop_words=True,

diff --git a/docs/src/quickstart/full-text-search.md b/docs/src/quickstart/full-text-search.md
@@ -90,7 +90,7 @@ ds.create_scalar_index(
     index_type="INVERTED",
     name="text_idx",              # Optional index name (if omitted, default is "text_idx")
     with_position=False,          # Set True to enable phrase queries (stores token positions)
-    base_tokenizer="simple",      # Tokenizer: "simple" (whitespace+punct), "icu", "whitespace", or "raw" (no tokenization)
+    base_tokenizer="icu",         # Tokenizer: "icu" (default), "simple" (whitespace+punct), "whitespace", or "raw" (no tokenization)
     language="English",           # Language used for stemming + stop words (only used if `stem` or `remove_stop_words` is True)
     max_token_length=40,          # Drop tokens longer than this length
     lower_case=True,              # Lowercase text before tokenization
@@ -103,13 +103,13 @@ ds.create_scalar_index(
 
 ### Tokenizer Options
 
+- **icu**: Unicode word segmentation with built-in ICU dictionaries (default)
 - **simple**: Splits tokens on whitespace and punctuation
 - **whitespace**: Splits tokens only on whitespace
 - **raw**: No tokenization (useful for exact matching)
 
 Lance also supports multilingual tokenization:
 
-- **icu**: Unicode word segmentation with built-in ICU dictionaries
 - **jieba/default**: Chinese text tokenization using Jieba
 - **lindera/ipadic**: Japanese text tokenization using Lindera with IPAdic dictionary
 - **lindera/ko-dic**: Korean text tokenization using Lindera with Ko-dic dictionary

diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py
@@ -3100,9 +3100,11 @@ def create_scalar_index(
             ``[1, num_compute_cpus]``. If unset, Lance uses ``num_compute_cpus``
             workers unless ``LANCE_FTS_NUM_SHARDS`` is set. This parameter is
             only used for the current build and is not persisted with the index.
-        base_tokenizer: str, default "simple"
+        base_tokenizer: str, default "icu"
             This is for the ``INVERTED`` index. The base tokenizer to use. The
             value can be:
+            * "icu": Unicode word segmentation with dictionary support for CJK and
+              other scripts.
             * "simple": splits tokens on whitespace and punctuation.
             * "whitespace": splits tokens on whitespace.
             * "raw": no tokenization.

diff --git a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py
@@ -694,6 +694,27 @@ def test_unindexed_full_text_search_on_empty_index(tmp_path):
     assert results.num_rows == 1
 
 
+def test_default_fts_tokenizer_handles_unspaced_multilingual_text(tmp_path):
+    data = pa.table(
+        {
+            "id": [0, 1],
+            "text": ["Hello, こんにちは世界!", "Hello, こんにちは!"],
+        }
+    )
+    ds = lance.write_dataset(data, tmp_path)
+    ds.create_scalar_index(
+        "text",
+        index_type="INVERTED",
+        stem=False,
+        remove_stop_words=False,
+        ascii_folding=False,
+    )
+    # Index built with the default tokenizer; only row 0 contains "世界".
+    results = ds.to_table(full_text_query="世界")
+
+    assert results["id"].to_pylist() == [0]
+
+
 def test_full_text_search_without_index(dataset):
     row = dataset.take(indices=[0], columns=["doc"])
     query_text = row.column(0)[0].as_py()
@@ -978,7 +999,7 @@ def test_fts_stats(dataset):
     params = stats["params"]
 
     assert params["with_position"] is False
-    assert params["base_tokenizer"] == "simple"
+    assert params["base_tokenizer"] == "icu"
     assert params["language"] == "English"
     assert params["max_token_length"] == 40
     assert params["lower_case"] is True
@@ -4670,7 +4691,7 @@ def test_describe_indices(tmp_path, monkeypatch, fts_format_version):
     details = indices[0].details
     assert details is not None and len(details) > 0
     assert details["lance_tokenizer"] is None
-    assert details["base_tokenizer"] == "simple"
+    assert details["base_tokenizer"] == "icu"
     assert details["language"] == "English"
     assert not details["with_position"]
     assert details["max_token_length"] == 40

diff --git a/rust/lance-index/Cargo.toml b/rust/lance-index/Cargo.toml
@@ -87,6 +87,7 @@ rstest.workspace = true
 chrono.workspace = true
 
 [features]
+default = ["tokenizer-icu"]
 geo = ["dep:lance-geo", "lance-geo/geo", "dep:geoarrow-array", "dep:geoarrow-schema", "dep:geo-types"]
 protoc = ["dep:protobuf-src"]
 jieba-rs = ["tokenizer-jieba"]

diff --git a/rust/lance-index/src/scalar/inverted/index.rs b/rust/lance-index/src/scalar/inverted/index.rs
@@ -1066,6 +1066,7 @@ impl ScalarIndex for InvertedIndex {
     fn derive_index_params(&self) -> Result<ScalarIndexParams> {
         let mut params = self.params.clone();
         if params.base_tokenizer.is_empty() {
+            // Empty tokenizer metadata only appears in legacy simple-tokenizer indexes.
             params.base_tokenizer = "simple".to_string();
         }
 

diff --git a/rust/lance-index/src/scalar/inverted/tokenizer.rs b/rust/lance-index/src/scalar/inverted/tokenizer.rs
@@ -47,7 +47,7 @@ pub struct InvertedIndexParams {
     /// - `lindera/*`: Lindera tokenizer
     /// - `jieba/*`: Jieba tokenizer
     ///
-    /// `simple` is recommended for most cases and the default value
+    /// `icu` is recommended for most cases and is the default with the `tokenizer-icu` feature (else `simple`)
     pub(crate) base_tokenizer: String,
 
     /// language for stemming and stop words
@@ -154,7 +154,7 @@ impl TryFrom<&pbold::InvertedIndexDetails> for InvertedIndexParams {
                 .base_tokenizer
                 .as_ref()
                 .cloned()
-                .unwrap_or(defaults.base_tokenizer),
+                .unwrap_or_else(|| "simple".to_string()),
             language: serde_json::from_str(details.language.as_str())?,
             with_position: details.with_position,
             max_token_length: details.max_token_length.map(|l| l as usize),
@@ -186,19 +186,27 @@ fn default_max_ngram_length() -> u32 {
 
 impl Default for InvertedIndexParams {
     fn default() -> Self {
-        Self::new("simple".to_owned(), Language::English)
+        Self::new(default_base_tokenizer().to_owned(), Language::English)
+    }
+}
+
+fn default_base_tokenizer() -> &'static str {
+    if cfg!(feature = "tokenizer-icu") {
+        "icu"
+    } else {
+        "simple"
     }
 }
 
 impl InvertedIndexParams {
     /// Create a new `InvertedIndexParams` with the given base tokenizer and language.
     ///
     /// The `base_tokenizer` can be one of the following:
-    /// - `simple`: splits tokens on whitespace and punctuation, default
+    /// - `icu`: ICU dictionary-based word segmentation, default with the `tokenizer-icu` feature
+    /// - `simple`: splits tokens on whitespace and punctuation, default otherwise
     /// - `whitespace`: splits tokens on whitespace
     /// - `raw`: no tokenization
     /// - `ngram`: N-Gram tokenizer
-    /// - `icu`: ICU dictionary-based word segmentation
     /// - `lindera/*`: Lindera tokenizer
     /// - `jieba/*`: Jieba tokenizer
     ///
@@ -446,6 +454,29 @@ mod tests {
     #[cfg(feature = "tokenizer-icu")]
     use lance_tokenizer::TokenStream;
 
+    #[cfg(not(feature = "tokenizer-icu"))]
+    #[test]
+    fn test_default_uses_simple_without_icu_feature() {
+        assert_eq!(InvertedIndexParams::default().base_tokenizer, "simple");
+    }
+
+    #[cfg(feature = "tokenizer-icu")]
+    #[test]
+    fn test_default_uses_icu_tokenizer() {
+        assert_eq!(InvertedIndexParams::default().base_tokenizer, "icu");
+    }
+
+    #[test]
+    fn test_missing_details_base_tokenizer_uses_legacy_simple_default() {
+        let mut details =
+            crate::pbold::InvertedIndexDetails::try_from(&InvertedIndexParams::default()).unwrap();
+        details.base_tokenizer = None;
+        // With no tokenizer recorded, decoding must fall back to "simple", not the build's default.
+        let params = InvertedIndexParams::try_from(&details).unwrap();
+
+        assert_eq!(params.base_tokenizer, "simple");
+    }
+
     #[test]
     fn test_build_only_fields_are_not_serialized() {
         let params = InvertedIndexParams::default()
@@ -498,7 +529,6 @@ mod tests {
     #[test]
     fn test_build_icu_tokenizer() {
         let mut tokenizer = InvertedIndexParams::default()
-            .base_tokenizer("icu".to_string())
             .stem(false)
             .remove_stop_words(false)
             .build()

diff --git a/rust/lance/src/io/exec/fts.rs b/rust/lance/src/io/exec/fts.rs
@@ -38,6 +38,7 @@ use super::utils::{IndexMetrics, InstrumentedChildInputStream, build_prefilter};
 use crate::index::scalar::inverted::{load_segment_details, load_segments};
 use crate::{Dataset, index::DatasetIndexInternalExt};
 use lance_index::metrics::MetricsCollector;
+use lance_index::scalar::InvertedIndexParams;
 use lance_index::scalar::inverted::builder::ScoredDoc;
 use lance_index::scalar::inverted::builder::document_input;
 use lance_index::scalar::inverted::document_tokenizer::{DocType, JsonTokenizer, LanceTokenizer};
@@ -148,11 +149,16 @@ async fn search_segments(
         .unzip())
 }
 
-/// Fall back to the default simple tokenizer when no on-disk FTS segment exists.
+/// Fall back to the default base tokenizer, with the filters below off, when no on-disk FTS segment exists.
 fn default_text_tokenizer() -> Box<dyn LanceTokenizer> {
-    Box::new(TextTokenizer::new(
-        TextAnalyzer::builder(SimpleTokenizer::default()).build(),
-    ))
+    InvertedIndexParams::default()
+        .max_token_length(None)
+        .lower_case(false)
+        .stem(false)
+        .remove_stop_words(false)
+        .ascii_folding(false)
+        .build()
+        .expect("default FTS tokenizer should build")
 }
 
 pub struct FtsIndexMetrics {