分词优化

tangwang
1 parent 418b6a4a
Showing 1 changed file with 7 additions and 1 deletion Show diff stats
query/keyword_extractor.py
@@ -18,6 +18,12 @@ import hanlp  # type: ignore
 # Aligns with ``rewritten_query`` / ES ``base_query`` (not a language code).
 KEYWORDS_QUERY_BASE_KEY = "base"
+# | 场景         | 推荐模型                                         |
+# | :--------- | :------------------------------------------- |
+# | 纯中文 + 最高精度 | `CTB9_TOK_ELECTRA_BASE_CRF` 或 `MSR_TOK_ELECTRA_BASE_CRF` |
+# | 纯中文 + 速度优先 | `FINE_ELECTRA_SMALL_ZH`（细粒度）或 `COARSE_ELECTRA_SMALL_ZH`（粗粒度） |
+# | **中英文混合**  | `UD_TOK_MMINILMV2L6` 或 `UD_TOK_MMINILMV2L12`（两者的 Transformer 编码器层数不同：6 层 / 12 层） |
+
 class KeywordExtractor:
     """基于 HanLP 的名词关键词提取器（与分词位置对齐，非连续名词间插入空格）。"""
@@ -31,7 +37,7 @@ class KeywordExtractor:
         if tokenizer is not None:
             self.tok = tokenizer
         else:
-            self.tok = hanlp.load(hanlp.pretrained.tok.CTB9_TOK_ELECTRA_BASE_CRF)
+            self.tok = hanlp.load(hanlp.pretrained.tok.UD_TOK_MMINILMV2L6)
             self.tok.config.output_spans = True
         self.pos_tag = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ELECTRA_SMALL)
         self.ignore_keywords = frozenset(ignore_keywords or ["玩具"])