diff --git a/docs/src/format/index/scalar/fts.md b/docs/src/format/index/scalar/fts.md
index 792702aca08..c051f8d4bd5 100644
--- a/docs/src/format/index/scalar/fts.md
+++ b/docs/src/format/index/scalar/fts.md
@@ -56,7 +56,7 @@ The metadata file contains JSON-serialized configuration and partition informati
 
 | Field               | Type    | Default   | Description                                                    |
 |---------------------|---------|-----------|----------------------------------------------------------------|
-| `base_tokenizer`    | String  | "simple"  | Base tokenizer type (see Tokenizers section)                   |
+| `base_tokenizer`    | String  | "icu"     | Base tokenizer type (see Tokenizers section)                   |
 | `language`          | String  | "English" | Language for stemming and stop words                           |
 | `with_position`     | Boolean | false     | Store term positions for phrase queries (increases index size) |
 | `max_token_length`  | UInt32? | None      | Maximum token length (tokens longer than this are removed)     |
@@ -76,17 +76,17 @@ The full text search index supports multiple tokenizer types for different text
 
 | Tokenizer      | Description                                                               | Use Case               |
 |----------------|---------------------------------------------------------------------------|------------------------|
-| **simple**     | Splits on whitespace and punctuation, removes non-alphanumeric characters | General text (default) |
+| **icu**        | ICU dictionary-based Unicode word segmentation                            | Mixed-language text (default) |
+| **simple**     | Splits on whitespace and punctuation, removes non-alphanumeric characters | General ASCII-oriented text |
 | **whitespace** | Splits only on whitespace characters                                      | Preserve punctuation   |
 | **raw**        | No tokenization, treats entire text as single token                       | Exact matching         |
 | **ngram**      | Breaks text into overlapping character sequences                          | Substring/fuzzy search |
-| **icu**        | ICU dictionary-based Unicode word segmentation                            | Mixed-language text    |
 | **jieba/***    | Chinese text tokenizer with word segmentation                             | Chinese text           |
 | **lindera/***  | Japanese text tokenizer with morphological analysis                       | Japanese text          |
 
 #### ICU Tokenizer (Mixed-language text)
 
-The ICU tokenizer uses Unicode word boundary rules and dictionary-based segmentation for complex scripts. It is useful for mixed-language text where the default `simple` tokenizer would keep an unspaced CJK span as one large token.
+The ICU tokenizer uses Unicode word boundary rules and dictionary-based segmentation for complex scripts. It is the default tokenizer because it segments mixed-language text into words, whereas the `simple` tokenizer would keep an unspaced CJK span as one large token.
 
 - **Models**: Uses compiled ICU4X segmenter data bundled with Lance
 - **Usage**: Specify as `icu`
diff --git a/docs/src/guide/json.md b/docs/src/guide/json.md
index 7246c8fe08a..60d63172676 100644
--- a/docs/src/guide/json.md
+++ b/docs/src/guide/json.md
@@ -292,7 +292,7 @@ on a single path, create an `INVERTED` index on the JSON column.
 dataset.create_scalar_index(
     "data",
     index_type="INVERTED",
-    base_tokenizer="simple",
+    base_tokenizer="icu",
     lower_case=True,
     stem=True,
     remove_stop_words=True,
diff --git a/docs/src/quickstart/full-text-search.md b/docs/src/quickstart/full-text-search.md
index 17327e40bc5..e008bf8f76f 100644
--- a/docs/src/quickstart/full-text-search.md
+++ b/docs/src/quickstart/full-text-search.md
@@ -90,7 +90,7 @@ ds.create_scalar_index(
     index_type="INVERTED",
     name="text_idx",              # Optional index name (if omitted, default is "text_idx")
     with_position=False,          # Set True to enable phrase queries (stores token positions)
-    base_tokenizer="simple",      # Tokenizer: "simple" (whitespace+punct), "icu", "whitespace", or "raw" (no tokenization)
+    base_tokenizer="icu",         # Tokenizer: "icu" (default), "simple" (whitespace+punct), "whitespace", or "raw"
     language="English",           # Language used for stemming + stop words (only used if `stem` or `remove_stop_words` is True)
     max_token_length=40,          # Drop tokens longer than this length
     lower_case=True,              # Lowercase text before tokenization
@@ -103,13 +103,13 @@ ds.create_scalar_index(
 
 ### Tokenizer Options
 
+- **icu**: Unicode word segmentation with built-in ICU dictionaries (default)
 - **simple**: Splits tokens on whitespace and punctuation
 - **whitespace**: Splits tokens only on whitespace
 - **raw**: No tokenization (useful for exact matching)
 
 Lance also supports multilingual tokenization:
 
-- **icu**: Unicode word segmentation with built-in ICU dictionaries
 - **jieba/default**: Chinese text tokenization using Jieba
 - **lindera/ipadic**: Japanese text tokenization using Lindera with IPAdic dictionary
 - **lindera/ko-dic**: Korean text tokenization using Lindera with Ko-dic dictionary
diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py
index eeb2dacff6a..0a02dbeb5e4 100644
--- a/python/python/lance/dataset.py
+++ b/python/python/lance/dataset.py
@@ -3100,9 +3100,11 @@ def create_scalar_index(
             ``[1, num_compute_cpus]``. If unset, Lance uses ``num_compute_cpus``
             workers unless ``LANCE_FTS_NUM_SHARDS`` is set. This parameter is
             only used for the current build and is not persisted with the index.
-        base_tokenizer: str, default "simple"
+        base_tokenizer: str, default "icu"
             This is for the ``INVERTED`` index. The base tokenizer to use. The
             value can be:
+            * "icu": Unicode word segmentation with dictionary support for CJK and
+              other scripts.
             * "simple": splits tokens on whitespace and punctuation.
             * "whitespace": splits tokens on whitespace.
             * "raw": no tokenization.
diff --git a/python/python/tests/compat/test_scalar_indices.py b/python/python/tests/compat/test_scalar_indices.py
index 35022df3b12..a01615abf53 100644
--- a/python/python/tests/compat/test_scalar_indices.py
+++ b/python/python/tests/compat/test_scalar_indices.py
@@ -320,7 +320,9 @@ def create(self):
             max_rows_per_file=100,
             data_storage_version=safe_data_storage_version(self.compat_version),
         )
-        dataset.create_scalar_index("text", "INVERTED", with_position=True)
+        dataset.create_scalar_index(
+            "text", "INVERTED", with_position=True, base_tokenizer="simple"
+        )
 
     def check_read(self):
         """Verify FTS index can be queried."""
diff --git a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py
index 7b4dede319b..73b6680cfff 100644
--- a/python/python/tests/test_scalar_index.py
+++ b/python/python/tests/test_scalar_index.py
@@ -694,6 +694,27 @@ def test_unindexed_full_text_search_on_empty_index(tmp_path):
     assert results.num_rows == 1
 
 
+def test_default_fts_tokenizer_handles_unspaced_multilingual_text(tmp_path):
+    data = pa.table(
+        {
+            "id": [0, 1],
+            "text": ["Hello, こんにちは世界!", "Hello, こんにちは!"],
+        }
+    )
+    ds = lance.write_dataset(data, tmp_path)
+    ds.create_scalar_index(
+        "text",
+        index_type="INVERTED",
+        stem=False,
+        remove_stop_words=False,
+        ascii_folding=False,
+    )
+
+    results = ds.to_table(full_text_query="世界")
+
+    assert results["id"].to_pylist() == [0]
+
+
 def test_full_text_search_without_index(dataset):
     row = dataset.take(indices=[0], columns=["doc"])
     query_text = row.column(0)[0].as_py()
@@ -978,7 +999,7 @@ def test_fts_stats(dataset):
     params = stats["params"]
 
     assert params["with_position"] is False
-    assert params["base_tokenizer"] == "simple"
+    assert params["base_tokenizer"] == "icu"
     assert params["language"] == "English"
     assert params["max_token_length"] == 40
     assert params["lower_case"] is True
@@ -1448,7 +1469,7 @@ def test_fts_deleted_rows_with_stable_row_ids(tmp_path):
     # Regression test: stable-row-id prefiltering must not leak deleted rows.
     data = pa.table(
         {
-            "text": [f"dup_{i}" for i in range(200)],
+            "text": [f"dup {i}" for i in range(200)],
             "category": [["A", "B", "C", "D", "E"][i % 5] for i in range(200)],
         }
     )
@@ -4670,7 +4691,7 @@ def test_describe_indices(tmp_path, monkeypatch, fts_format_version):
     details = indices[0].details
     assert details is not None and len(details) > 0
     assert details["lance_tokenizer"] is None
-    assert details["base_tokenizer"] == "simple"
+    assert details["base_tokenizer"] == "icu"
     assert details["language"] == "English"
     assert not details["with_position"]
     assert details["max_token_length"] == 40
diff --git a/python/uv.lock b/python/uv.lock
index 3c339603d2e..463e0a7bd6c 100644
--- a/python/uv.lock
+++ b/python/uv.lock
@@ -1083,19 +1083,19 @@ wheels = [
 
 [[package]]
 name = "lance-namespace"
-version = "0.7.6"
+version = "0.7.7"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "lance-namespace-urllib3-client" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/2b/da/134670003173881bed44af656badffd91e0b2e0232c083eeacc5923d7335/lance_namespace-0.7.6.tar.gz", hash = "sha256:4e12094005d105ef1b44346c9d7feda4a0f733b127dab90c1a5ffbf7cd433770", size = 10686, upload-time = "2026-05-05T18:26:38.885Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/06/5c/9822af615fc1bd3ee1073994696c739aecde377be32435ec3303aed1bc5d/lance_namespace-0.7.7.tar.gz", hash = "sha256:d00b525f2e26993a6c61668e798bca6c808605ab8a79f29f86a1a1af92d91ae2", size = 10754, upload-time = "2026-05-20T17:32:59.45Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/83/88/44463a5f41f7077b2ea641f2afded72eaceb6a6a1b4a55c11b22318fed74/lance_namespace-0.7.6-py3-none-any.whl", hash = "sha256:c94a1b8a6aab127e55a20cbf44d927ae3a9b7d435656d2130dccf84ccf7c9999", size = 12519, upload-time = "2026-05-05T18:26:36.425Z" },
+    { url = "https://files.pythonhosted.org/packages/11/43/186acc1156da20c351db196e2b6241b2453b16dc1b4cc8e0a626667ca471/lance_namespace-0.7.7-py3-none-any.whl", hash = "sha256:477a7ca6b5e1f673a2c9ba52f42d6e8e3ff7c27a601392a21eb90fba98d0309b", size = 12581, upload-time = "2026-05-20T17:32:57.389Z" },
 ]
 
 [[package]]
 name = "lance-namespace-urllib3-client"
-version = "0.7.6"
+version = "0.7.7"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "pydantic" },
@@ -1104,9 +1104,9 @@ dependencies = [
     { name = "urllib3", version = "1.26.20", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
     { name = "urllib3", version = "2.5.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/01/44/024aae184c08b3800482cd9b832d534249e25de145af732d4e4c8dff38a8/lance_namespace_urllib3_client-0.7.6.tar.gz", hash = "sha256:15ae7f0d8d56fa34d837f7f6ec5c80a327a905e89ccfed05f7b409d6fe704cdf", size = 195551, upload-time = "2026-05-05T18:26:37.808Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/07/95/38ab81ccc1e09beeecd8ddfc61b8bc73831dc5053db1e3f9021f64a4896b/lance_namespace_urllib3_client-0.7.7.tar.gz", hash = "sha256:4d8c066628c17c6a10cf643b51a7f7ae1bfb8a614d9cc54a5af38a4ba2b4b102", size = 202930, upload-time = "2026-05-20T17:32:58.308Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/00/50/60c983cc8180772c82370dfad2104b7e788aaacc3bf9a84e8b42bb1ae6a7/lance_namespace_urllib3_client-0.7.6-py3-none-any.whl", hash = "sha256:fb884d8afff8af3aae04a3270624694a189d7ea79225dd349e6c555a1a1d6b52", size = 324603, upload-time = "2026-05-05T18:26:39.718Z" },
+    { url = "https://files.pythonhosted.org/packages/35/96/5483e48e40433b1d078183c15a92c99e59a156041b0260e7f18ee34e7c08/lance_namespace_urllib3_client-0.7.7-py3-none-any.whl", hash = "sha256:9221c3e00fd89f0c811953d94b32d2ea527765280460a174f5872dc8a74c0ed6", size = 334767, upload-time = "2026-05-20T17:32:55.883Z" },
 ]
 
 [[package]]
@@ -2676,7 +2676,7 @@ requires-dist = [
     { name = "duckdb", marker = "extra == 'tests'" },
     { name = "geoarrow-rust-core", marker = "extra == 'geo'" },
     { name = "geoarrow-rust-io", marker = "extra == 'geo'" },
-    { name = "lance-namespace", specifier = ">=0.7.5,<0.8" },
+    { name = "lance-namespace", specifier = ">=0.7.7,<0.8" },
     { name = "ml-dtypes", marker = "extra == 'tests'" },
     { name = "numpy", specifier = ">=1.22" },
     { name = "pandas", marker = "extra == 'tests'" },
diff --git a/rust/lance-index/Cargo.toml b/rust/lance-index/Cargo.toml
index b587ee57fac..f43a53ca64b 100644
--- a/rust/lance-index/Cargo.toml
+++ b/rust/lance-index/Cargo.toml
@@ -87,6 +87,7 @@ rstest.workspace = true
 chrono.workspace = true
 
 [features]
+default = ["tokenizer-icu"]
 geo = ["dep:lance-geo", "lance-geo/geo", "dep:geoarrow-array", "dep:geoarrow-schema", "dep:geo-types"]
 protoc = ["dep:protobuf-src"]
 jieba-rs = ["tokenizer-jieba"]
diff --git a/rust/lance-index/src/scalar/inverted/index.rs b/rust/lance-index/src/scalar/inverted/index.rs
index 152afc8bc0c..acd36d95277 100644
--- a/rust/lance-index/src/scalar/inverted/index.rs
+++ b/rust/lance-index/src/scalar/inverted/index.rs
@@ -1066,6 +1066,7 @@ impl ScalarIndex for InvertedIndex {
     fn derive_index_params(&self) -> Result<ScalarIndexParams> {
         let mut params = self.params.clone();
         if params.base_tokenizer.is_empty() {
+            // Empty tokenizer metadata only appears in legacy simple-tokenizer indexes.
             params.base_tokenizer = "simple".to_string();
         }
 
diff --git a/rust/lance-index/src/scalar/inverted/tokenizer.rs b/rust/lance-index/src/scalar/inverted/tokenizer.rs
index ed0fd80638d..967398d7e5f 100644
--- a/rust/lance-index/src/scalar/inverted/tokenizer.rs
+++ b/rust/lance-index/src/scalar/inverted/tokenizer.rs
@@ -47,7 +47,7 @@ pub struct InvertedIndexParams {
     /// - `lindera/*`: Lindera tokenizer
     /// - `jieba/*`: Jieba tokenizer
     ///
-    /// `simple` is recommended for most cases and the default value
+    /// `icu` is recommended for most cases and is the default when the `tokenizer-icu` feature is enabled (`simple` otherwise)
     pub(crate) base_tokenizer: String,
 
     /// language for stemming and stop words
@@ -154,7 +154,7 @@ impl TryFrom<&pbold::InvertedIndexDetails> for InvertedIndexParams {
                 .base_tokenizer
                 .as_ref()
                 .cloned()
-                .unwrap_or(defaults.base_tokenizer),
+                .unwrap_or_else(|| "simple".to_string()),
             language: serde_json::from_str(details.language.as_str())?,
             with_position: details.with_position,
             max_token_length: details.max_token_length.map(|l| l as usize),
@@ -186,7 +186,15 @@ fn default_max_ngram_length() -> u32 {
 
 impl Default for InvertedIndexParams {
     fn default() -> Self {
-        Self::new("simple".to_owned(), Language::English)
+        Self::new(default_base_tokenizer().to_owned(), Language::English)
+    }
+}
+
+fn default_base_tokenizer() -> &'static str {
+    if cfg!(feature = "tokenizer-icu") {
+        "icu"
+    } else {
+        "simple"
     }
 }
 
@@ -194,11 +202,11 @@ impl InvertedIndexParams {
     /// Create a new `InvertedIndexParams` with the given base tokenizer and language.
     ///
     /// The `base_tokenizer` can be one of the following:
-    /// - `simple`: splits tokens on whitespace and punctuation, default
+    /// - `icu`: ICU dictionary-based word segmentation, default when the `tokenizer-icu` feature is enabled
+    /// - `simple`: splits tokens on whitespace and punctuation, default otherwise
     /// - `whitespace`: splits tokens on whitespace
     /// - `raw`: no tokenization
     /// - `ngram`: N-Gram tokenizer
-    /// - `icu`: ICU dictionary-based word segmentation
     /// - `lindera/*`: Lindera tokenizer
     /// - `jieba/*`: Jieba tokenizer
     ///
@@ -446,6 +454,29 @@ mod tests {
     #[cfg(feature = "tokenizer-icu")]
     use lance_tokenizer::TokenStream;
 
+    #[cfg(not(feature = "tokenizer-icu"))]
+    #[test]
+    fn test_default_uses_simple_without_icu_feature() {
+        assert_eq!(InvertedIndexParams::default().base_tokenizer, "simple");
+    }
+
+    #[cfg(feature = "tokenizer-icu")]
+    #[test]
+    fn test_default_uses_icu_tokenizer() {
+        assert_eq!(InvertedIndexParams::default().base_tokenizer, "icu");
+    }
+
+    #[test]
+    fn test_missing_details_base_tokenizer_uses_legacy_simple_default() {
+        let mut details =
+            crate::pbold::InvertedIndexDetails::try_from(&InvertedIndexParams::default()).unwrap();
+        details.base_tokenizer = None;
+
+        let params = InvertedIndexParams::try_from(&details).unwrap();
+
+        assert_eq!(params.base_tokenizer, "simple");
+    }
+
     #[test]
     fn test_build_only_fields_are_not_serialized() {
         let params = InvertedIndexParams::default()
@@ -498,7 +529,6 @@ mod tests {
     #[test]
     fn test_build_icu_tokenizer() {
         let mut tokenizer = InvertedIndexParams::default()
-            .base_tokenizer("icu".to_string())
             .stem(false)
             .remove_stop_words(false)
             .build()
diff --git a/rust/lance/src/dataset/tests/dataset_index.rs b/rust/lance/src/dataset/tests/dataset_index.rs
index e785de7bee4..a45de821f65 100644
--- a/rust/lance/src/dataset/tests/dataset_index.rs
+++ b/rust/lance/src/dataset/tests/dataset_index.rs
@@ -959,7 +959,7 @@ async fn test_fts_unindexed_data_with_stop_words() {
         .unwrap();
 
     // Append unindexed rows with a term not in the index
-    let unindexed: Vec<String> = (0..10).map(|i| format!("hello_{i}")).collect();
+    let unindexed: Vec<String> = (0..10).map(|i| format!("hello {i}")).collect();
     let text_col = StringArray::from(unindexed);
     let batch = RecordBatch::try_new(
         arrow_schema::Schema::new(vec![Field::new("text", DataType::Utf8, false)]).into(),
@@ -3176,6 +3176,7 @@ async fn test_sql_contains_tokens() {
             IndexType::Inverted,
             None,
             &InvertedIndexParams::default()
+                .base_tokenizer("simple".to_string())
                 .max_token_length(None)
                 .stem(false),
             true,
diff --git a/rust/lance/src/io/exec/fts.rs b/rust/lance/src/io/exec/fts.rs
index c55a357fa0f..96129ea09fa 100644
--- a/rust/lance/src/io/exec/fts.rs
+++ b/rust/lance/src/io/exec/fts.rs
@@ -38,6 +38,7 @@ use super::utils::{IndexMetrics, InstrumentedChildInputStream, build_prefilter};
 use crate::index::scalar::inverted::{load_segment_details, load_segments};
 use crate::{Dataset, index::DatasetIndexInternalExt};
 use lance_index::metrics::MetricsCollector;
+use lance_index::scalar::InvertedIndexParams;
 use lance_index::scalar::inverted::builder::ScoredDoc;
 use lance_index::scalar::inverted::builder::document_input;
 use lance_index::scalar::inverted::document_tokenizer::{DocType, JsonTokenizer, LanceTokenizer};
@@ -148,11 +149,16 @@ async fn search_segments(
         .unzip())
 }
 
-/// Fall back to the default simple tokenizer when no on-disk FTS segment exists.
+/// Fall back to the bare default base tokenizer (all filters configured below turned off) when no on-disk FTS segment exists.
 fn default_text_tokenizer() -> Box<dyn LanceTokenizer> {
-    Box::new(TextTokenizer::new(
-        TextAnalyzer::builder(SimpleTokenizer::default()).build(),
-    ))
+    InvertedIndexParams::default()
+        .max_token_length(None)
+        .lower_case(false)
+        .stem(false)
+        .remove_stop_words(false)
+        .ascii_folding(false)
+        .build()
+        .expect("default FTS tokenizer should build")
 }
 
 pub struct FtsIndexMetrics {