apache · nishant94 · Jun 22, 2026 · Jun 24, 2026
diff --git a/cspell.json b/cspell.json
@@ -6,6 +6,8 @@
     "words": [
         "Cascader",
         "colocation",
+        "IPADIC",
+        "kuromoji",
         "lakehouse",
         "Lakehouse",
         "Linkedin",

diff --git a/docs/admin-manual/config/be-config.md b/docs/admin-manual/config/be-config.md
@@ -979,6 +979,12 @@ BaseCompaction:546859:
 
 ### Storage
 
+#### `enable_kuromoji_analyzer`
+
+* Type: bool
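+* Description: Whether to enable the kuromoji (Japanese) inverted-index analyzer. When `false`, creating or querying an index with `"parser" = "kuromoji"` fails with an error that asks you to enable this option.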
+* Default value: false
+
 #### `default_num_rows_per_column_file_block`
 
 * Type: int32

diff --git a/docs/key-features/full-text-search.mdx b/docs/key-features/full-text-search.mdx
@@ -131,6 +131,7 @@ The analyzer is the most consequential choice. It decides which queries can matc
 - **`none`**: no tokenization. The whole value is one term. Good for IDs, tags, status enums, and exact-match filters.
 - **`english`**: ASCII word-break with light normalization. Suitable for English prose.
 - **`chinese`**: Chinese word segmentation with `parser_mode` of `fine_grained` or `coarse_grained`. Suitable for CJK content.
+- **`kuromoji`**: Japanese morphological analysis. Japanese has no spaces between words, so the text is segmented into dictionary-based morphemes. `parser_mode` selects `search` (default, also breaks long compound nouns apart for better recall), `normal`, or `extended`. Suitable for Japanese content. Disabled by default — set `enable_kuromoji_analyzer = true` in `be.conf` to enable it.
 - **`unicode`**: language-agnostic word break for CJK and punctuation. A reasonable default for mixed content.
 - **`standard`**: Unicode text-segmentation tokenizer; works for most languages.
 - **`icu`**: ICU-based tokenizer for languages that need full Unicode segmentation rules (Thai, Khmer, mixed scripts).

diff --git a/docs/key-features/inverted-index.mdx b/docs/key-features/inverted-index.mdx
@@ -44,7 +44,7 @@ An inverted index is declared in DDL with `INDEX <name>(<col>) USING INVERTED [P
 **Key terms**
 
 - **`Posting list`**: the row-ID set associated with a value or term. Predicate evaluation becomes a posting-list lookup plus a set operation (AND/OR/NOT).
-- **`Parser`**: the tokenizer used for text columns. Built-in choices: `none` (whole-string, default), `english`, `chinese`, `unicode`, `standard`, plus custom analyzers since 3.1.
+- **`Parser`**: the tokenizer used for text columns. Built-in choices: `none` (whole-string, default), `english`, `chinese`, `kuromoji` (Japanese), `unicode`, `standard`, plus custom analyzers since 3.1.
 - **`MATCH operators`**: `MATCH_ANY` (any term), `MATCH_ALL` (all terms), `MATCH_PHRASE` (adjacent terms), `MATCH_PHRASE_PREFIX`, `MATCH_REGEXP`. They require an inverted index with a parser; ordinary `=` and `LIKE` do not.
 - **`.idx file`**: the on-disk index payload. One `.idx` file lives next to each segment file under the tablet directory, so the index is segment-bound and follows the rowset through compaction.
 

diff --git a/docs/sql-manual/sql-functions/scalar-functions/string-functions/tokenize.md b/docs/sql-manual/sql-functions/scalar-functions/string-functions/tokenize.md
@@ -27,9 +27,9 @@ The `properties` parameter supports the following key-value pairs (format: `"key
 
 | Property | Description | Example Values |
 |----------|-------------|----------------|
-| `built_in_analyzer` | Built-in analyzer type | `"english"`, `"chinese"`, `"unicode"`, `"icu"`, `"basic"`, `"ik"`, `"standard"`, `"none"` |
+| `built_in_analyzer` | Built-in analyzer type | `"english"`, `"chinese"`, `"kuromoji"`, `"unicode"`, `"icu"`, `"basic"`, `"ik"`, `"standard"`, `"none"` |
 | `analyzer` | Custom analyzer name (created via `CREATE INVERTED INDEX ANALYZER`) | `"my_custom_analyzer"` |
-| `parser_mode` | Parser mode (for chinese analyzers) | `"fine_grained"`, `"coarse_grained"` |
+| `parser_mode` | Parser mode. For `chinese`, controls segmentation granularity; for `kuromoji`, controls the Japanese segmentation mode | chinese: `"fine_grained"`, `"coarse_grained"`; kuromoji: `"search"` (default), `"normal"`, `"extended"` |
 | `support_phrase` | Enable phrase support (stores position information) | `"true"`, `"false"` |
 | `lower_case` | Convert tokens to lowercase | `"true"`, `"false"` |
 | `char_filter_type` | Character filter type | Varies by filter |
@@ -102,6 +102,15 @@ SELECT TOKENIZE("中华人民共和国国歌", '"built_in_analyzer"="ik"');
 [{ "token": "中华人民共和国" }, { "token": "国歌" }]
 ```
 
+```sql
+-- Using the kuromoji analyzer for Japanese text
+-- In the default search mode, the compound noun is split into its component parts
+SELECT TOKENIZE("関西国際空港", '"built_in_analyzer"="kuromoji"');
+```
+```
+[{ "token": "関西" }, { "token": "国際" }, { "token": "空港" }]
+```
+
 ### Example 2: Using custom analyzers
 
 First, create a custom analyzer:
@@ -140,6 +149,7 @@ SELECT TOKENIZE("Hello World", '"built_in_analyzer"="standard", "support_phrase"
    - `standard`: Standard analyzer for general text
    - `english`: English language analyzer with stemming
    - `chinese`: Chinese text analyzer
+   - `kuromoji`: Japanese morphological analyzer (`parser_mode`: `search`, `normal`, `extended`). Disabled by default — set `enable_kuromoji_analyzer = true` in `be.conf` to use it.
    - `unicode`: Unicode-based analyzer for multilingual text
    - `icu`: ICU-based analyzer for advanced Unicode processing
    - `basic`: Basic tokenization

diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/admin-manual/config/be-config.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/admin-manual/config/be-config.md
@@ -970,6 +970,12 @@ BaseCompaction:546859:
 
 ### 存储
 
+#### `enable_kuromoji_analyzer`
+
+* 类型：bool
+* 描述：是否启用 kuromoji（日文）倒排索引分词器。当为 `false` 时，创建或查询带有 `"parser" = "kuromoji"` 的索引会报错，并提示需要开启此配置。
+* 默认值：false
+
 #### `default_num_rows_per_column_file_block`
 
 * 类型：int32

diff --git a/.../current/sql-manual/sql-functions/scalar-functions/string-functions/tokenize.md b/.../current/sql-manual/sql-functions/scalar-functions/string-functions/tokenize.md
@@ -27,9 +27,9 @@ VARCHAR TOKENIZE(VARCHAR str, VARCHAR properties)
 
 | 属性 | 描述 | 示例值 |
 |------|------|--------|
-| `built_in_analyzer` | 内置分词器类型 | `"english"`, `"chinese"`, `"unicode"`, `"icu"`, `"basic"`, `"ik"`, `"standard"`, `"none"` |
+| `built_in_analyzer` | 内置分词器类型 | `"english"`, `"chinese"`, `"kuromoji"`, `"unicode"`, `"icu"`, `"basic"`, `"ik"`, `"standard"`, `"none"` |
 | `analyzer` | 自定义分词器名称(通过 `CREATE INVERTED INDEX ANALYZER` 创建) | `"my_custom_analyzer"` |
-| `parser_mode` | 分词器模式(用于中文分词器) | `"fine_grained"`, `"coarse_grained"` |
+| `parser_mode` | 分词器模式。`chinese` 用于控制分词粒度;`kuromoji` 用于控制日文分词模式 | chinese: `"fine_grained"`, `"coarse_grained"`;kuromoji: `"search"`(默认), `"normal"`, `"extended"` |
 | `support_phrase` | 启用短语支持(存储位置信息) | `"true"`, `"false"` |
 | `lower_case` | 将词条转换为小写 | `"true"`, `"false"` |
 | `char_filter_type` | 字符过滤器类型 | 根据过滤器而异 |
@@ -102,6 +102,15 @@ SELECT TOKENIZE("中华人民共和国国歌", '"built_in_analyzer"="ik"');
 [{ "token": "中华人民共和国" }, { "token": "国歌" }]
 ```
 
+```sql
+-- 使用 kuromoji 分词器处理日文文本
+-- 在默认的 search 模式下,复合词会被拆分为各个组成部分
+SELECT TOKENIZE("関西国際空港", '"built_in_analyzer"="kuromoji"');
+```
+```
+[{ "token": "関西" }, { "token": "国際" }, { "token": "空港" }]
+```
+
 ### 示例 2: 使用自定义分词器
 
 首先创建一个自定义分词器:
@@ -140,6 +149,7 @@ SELECT TOKENIZE("Hello World", '"built_in_analyzer"="standard", "support_phrase"
    - `standard`: 标准分词器,用于通用文本
    - `english`: 带词干提取的英语分词器
    - `chinese`: 中文文本分词器
+   - `kuromoji`: 日文形态素分词器(`parser_mode`: `search`、`normal`、`extended`)。默认关闭 —— 需在 `be.conf` 中设置 `enable_kuromoji_analyzer = true` 才能使用。
    - `unicode`: 基于Unicode的多语言文本分词器
    - `icu`: 基于ICU的高级Unicode处理分词器
    - `basic`: 基础分词