ansj -> ik

tangwang
1 parent ff9efda0
Showing 11 changed files with 45 additions and 31 deletions Show diff stats
docs/ES/ES_8.18/1_ES配置和使用.md
docs/ES/ES_8.18/3_ansj分词插件安装.md
docs/ES/ES_8.18/4_索引和查询测试.md
docs/ES/ES_8.18/README__ES查询相关.md
docs/ES/ES_8.18/README__分词相关.md
docs/常用查询 - ES.md
docs/搜索API对接指南.md
docs/系统设计文档v1.md
docs/索引字段说明v2-plan.md
indexer/ANCHORS_AND_SEMANTIC_ATTRIBUTES.md
suggestion/mapping.py
@@ -108,10 +108,10 @@ labelId_by_skuId_essa_* essa商品标签，区域ID标识
 #### 分词相关
 ```bash
 # 索引分词
-GET /_cat/ansj?text=14寸第4代真眼珠实身冰雪公仔带手动大推车,搪胶雪宝宝&type=index_ansj
+GET /_cat/ansj?text=14寸第4代真眼珠实身冰雪公仔带手动大推车,搪胶雪宝宝&type=index_ik
  
 # 查询分词
-GET /_cat/ansj?text=14寸第4代真眼珠实身冰雪公仔带手动大推车,搪胶雪宝宝&type=query_ansj
+GET /_cat/ansj?text=14寸第4代真眼珠实身冰雪公仔带手动大推车,搪胶雪宝宝&type=query_ik
  
 # 查看配置
 GET /_cat/ansj/config
@@ -128,7 +128,7 @@ GET /goods/_search
 #### 1. 查看字段分词结果
 ```bash
 # 查看中文名称分词结果
-GET /_cat/ansj?text=14寸第4代真眼珠实身冰雪公仔带手动大推车&type=index_ansj
+GET /_cat/ansj?text=14寸第4代真眼珠实身冰雪公仔带手动大推车&type=index_ik
  
 # 查看英文名称分词结果
 GET /_cat/ansj?text=14 inch 4th generation real eye snow doll with manual cart&type=standard
@@ -20,14 +20,14 @@ mvn package
 ./bin/elasticsearch-plugin install https://github.com/NLPchina/elasticsearch-analysis-ansj/releases/download/v8.7.0/elasticsearch-analysis-ansj-8.7.0.0-release.zip
  
 测试：
-kibana中 ： GET /_cat/ansj?text=中国&type=index_ansj
+kibana中 ： GET /_cat/ansj?text=中国&type=index_ik
 或者：
-curl -X GET "http://localhost:9200/_cat/ansj?text=中国&type=index_ansj" 
+curl -X GET "http://localhost:9200/_cat/ansj?text=中国&type=index_ik" 
  
  
 curl -X GET "http://localhost:9200/_cat/ansj?pretty" -H 'Content-Type: application/json' -d'  
 {  
-  "type": "index_ansj",  
+  "type": "index_ik",  
   "text": "中国"  
 }'
  
@@ -54,7 +54,7 @@ curl -X GET "http://localhost:9200/goods/_termvectors/[DOC_ID]?fields=*&pretty"
 curl -X GET "http://localhost:9200/goods/_analyze?pretty" -H 'Content-Type: application/json' -d'  
 {  
   "text": "玩具",  
-  "analyzer": "index_ansj"  
+  "analyzer": "index_ik"  
 }'
  
  
@@ -247,22 +247,22 @@ POST spu/_analyze
 }
 ```
  
-### query_ansj分词器
+### query_ik分词器
  
 ```json
 POST spu/_analyze
 {
-  "analyzer": "query_ansj",
+  "analyzer": "query_ik",
   "text": "14寸第4代-眼珠实身冰雪公仔带手动大推车,搪胶雪宝宝"
 }
 ```
  
-### index_ansj分词器
+### index_ik分词器
  
 ```json
 POST spu_test/_analyze
 {
-  "analyzer": "index_ansj",
+  "analyzer": "index_ik",
   "text": "14寸第4代-眼珠实身冰雪公仔带手动大推车,搪胶雪宝宝"
 }
 ```
@@ -3,7 +3,7 @@
 工厂搜索 依赖于关键词召回、没有做语义召回和精排，加上语义召回和精排则成本过高，所以对分词效果仍然很依赖。badcase： q=锦鸿 无法召回 锦鸿达
 1. 虽然商品搜索已经不强依赖于分词效果，但是工厂搜索仍然对分词效果有很大依赖，所以进行了优化。
 2. 优化内容：
-  1) index_ansj + 自定义词典（需要配置自定义词典。 不配置自定义词典的时候，index_ansj分词结果和query_ansj完全一致，配置自定义词典时，index_ansj 将 ）
+  1) index_ik + 自定义词典（需要配置自定义词典。 不配置自定义词典的时候，index_ik分词结果和query_ik完全一致；配置自定义词典时，index_ik 将在 query_ik 结果的基础上额外输出更细粒度的子词，例如“锦鸿达”会切分为 锦, 鸿达, 鸿, 达）
   2) ansj自定义词典 补充属性词
  
  
@@ -20,7 +20,7 @@
  
  
  
-| Query   | query_ansj 分词结果                       | index_ansj 分词结果                                |
+| Query   | query_ik 分词结果                       | index_ik 分词结果                                |
 |---------|------------------------------------------|--------------------------------------------------|
 | 锦鸿达  | 锦, 鸿达                                 | 锦, 鸿达, 鸿, 达                                  |
 | 锦鸿    | 锦鸿                                     | 锦鸿, 锦, 鸿                                      |
@@ -29,7 +29,7 @@
  
  
 ansj 插件自定义词典补充属性词后：
-| Query   | query_ansj 分词结果                       | index_ansj 分词结果                                |
+| Query   | query_ik 分词结果                       | index_ik 分词结果                                |
 |---------|------------------------------------------|--------------------------------------------------|
 | 锦鸿达  | 锦, 鸿达                                 | 锦, 鸿达, 鸿, 达                                  |
 | 锦鸿    | 锦鸿                                     | 锦鸿, 锦, 鸿                                      |
@@ -65,8 +65,8 @@ POST spu/_analyze
  
 ## ansj分词器的问题：
  
-### 1. index_ansj不符合预期
-index模式（type=index_ansj）不符合预期 ：type=index_ansj  type=query_ansj 结果永远都是一样的。
+### 1. index_ik不符合预期
+index模式（type=index_ik）不符合预期 ：type=index_ik  type=query_ik 结果永远都是一样的。
 https://github.com/NLPchina/elasticsearch-analysis-ansj/issues/235
  
 已解决：需要配置自定义词典
@@ -75,7 +75,7 @@ https://github.com/NLPchina/elasticsearch-analysis-ansj/issues/235
 特殊符号（减号）会跟后面的词粘连到一起，导致搜索减号后面的词，无法匹配：
  
 ```bash
-GET /_cat/ansj?text=狗狗系列-柴犬积木&type=index_ansj
+GET /_cat/ansj?text=狗狗系列-柴犬积木&type=index_ik
 {
 "name": "-柴",
 "nature": "nrf",
@@ -84,7 +84,7 @@ GET /_cat/ansj?text=狗狗系列-柴犬积木&type=index_ansj
 "synonyms": null
 },
  
-GET /_cat/ansj?text=狗狗系列-哈士奇犬&type=index_ansj
+GET /_cat/ansj?text=狗狗系列-哈士奇犬&type=index_ik
 {
 "name": "-哈士奇",
 "nature": "nrf",
@@ -99,15 +99,15 @@ GET /_cat/ansj?text=狗狗系列-哈士奇犬&type=index_ansj
 暂时解决办法是：
 ```json
         "analyzer": {
-          "index_ansj": {
+          "index_ik": {
             "type": "custom",
             "char_filter": ["hyphen_to_space"],
-            "tokenizer": "index_ansj"
+            "tokenizer": "index_ik"
           },
-          "query_ansj": {
+          "query_ik": {
             "type": "custom",
             "char_filter": ["hyphen_to_space"],
-            "tokenizer": "query_ansj"
+            "tokenizer": "query_ik"
           }
         }
 ```
@@ -87,12 +87,12 @@ curl -u 'saas:4hOaLaf41y2VuI8y'   -X GET 'http://localhost:9200/search_products_
 }'
  
 curl -u 'saas:4hOaLaf41y2VuI8y'   -X GET 'http://localhost:9200/search_products_tenant_170/_analyze'   -H 'Content-Type: application/json'   -d '{
-  "analyzer": "index_ansj",
+  "analyzer": "index_ik",
   "text": "14寸第4代-眼珠实身冰雪公仔带手动大推车,搪胶雪宝宝"
 }'
  
 curl -u 'saas:4hOaLaf41y2VuI8y'   -X GET 'http://localhost:9200/search_products_tenant_170/_analyze'   -H 'Content-Type: application/json'   -d '{
-  "analyzer": "query_ansj",
+  "analyzer": "query_ik",
   "text": "14寸第4代-眼珠实身冰雪公仔带手动大推车,搪胶雪宝宝"
 }'
  
@@ -2075,8 +2075,8 @@ curl "http://localhost:6006/health"
  
 | 分析器 | 语言 | 描述 |
 |--------|------|------|
-| `index_ansj` | 中文 | 中文索引分析器（用于中文字段） |
-| `query_ansj` | 中文 | 中文查询分析器（用于中文字段） |
+| `index_ik` | 中文 | 中文索引分析器（用于中文字段） |
+| `query_ik` | 中文 | 中文查询分析器（用于中文字段） |
 | `hanlp_index` ⚠️ TODO（暂不支持） | 中文 | 中文索引分析器（用于中文字段） |
 | `hanlp_standard` ⚠️ TODO（暂不支持） | 中文 | 中文查询分析器（用于中文字段） |
 | `english` | 英文 | 标准英文分析器（用于英文字段） |
@@ -89,7 +89,7 @@
 - **BOOLEAN**：布尔类型
  
 #### 分析器支持
-- **chinese_ecommerce**：中文电商分词器（index_ansj/query_ansj）
+- **chinese_ecommerce**：中文电商分词器（index_ik/query_ik）
 - **english**：英文分析器
 - **russian**：俄文分析器
 - **arabic**：阿拉伯文分析器
@@ -11,7 +11,7 @@
 #### 1.1 多语言文本字段
  
 - 为文本字段添加中英文双字段支持（title.zh/title.en, brief.zh/brief.en, description.zh/description.en, vendor.zh/vendor.en）
-- 中文字段使用 `index_ansj`/`query_ansj` 分析器（对应文档中的hanlp_index/hanlp_standard）
+- 中文字段使用 `index_ik`/`query_ik` 分析器（对应文档中的hanlp_index/hanlp_standard）
 - 英文字段使用 `english` 分析器
 - **暂时只填充中文字段，英文字段设为空**（不需要语言检测，每个tenant的语言预先知道）
  
@@ -22,7 +22,7 @@
 "qanchors": {
   "type": "object",
   "properties": {
-    "zh": { "type": "text", "analyzer": "index_ansj", "search_analyzer": "query_ansj" },
+    "zh": { "type": "text", "analyzer": "index_ik", "search_analyzer": "query_ik" },
     "en": { "type": "text", "analyzer": "english" },
     "de": { "type": "text", "analyzer": "german" },
     "ru": { "type": "text", "analyzer": "russian" },
@@ -6,7 +6,7 @@ from typing import Dict, Any, List
  
  
 ANALYZER_BY_LANG: Dict[str, str] = {
-    "zh": "index_ansj",
+    "zh": "index_ik",
     "en": "english",
     "ar": "arabic",
     "hy": "armenian",
@@ -45,7 +45,7 @@ def _completion_field(lang: str) -> Dict[str, Any]:
         return {
             "type": "completion",
             "analyzer": analyzer,
-            "search_analyzer": "query_ansj",
+            "search_analyzer": "query_ik",
         }
     return {"type": "completion", "analyzer": analyzer}
  
@@ -72,6 +72,20 @@ def build_suggestion_mapping(index_languages: List[str]) -> Dict[str, Any]:
             "number_of_shards": 1,
             "number_of_replicas": 0,
             "refresh_interval": "30s",
+            "analysis": {
+                "analyzer": {
+                    "index_ik": {
+                        "type": "custom",
+                        "tokenizer": "ik_max_word",
+                        "filter": ["lowercase", "asciifolding"],
+                    },
+                    "query_ik": {
+                        "type": "custom",
+                        "tokenizer": "ik_smart",
+                        "filter": ["lowercase", "asciifolding"],
+                    },
+                }
+            },
         },
         "mappings": {
             "properties": {