diff --git a/query/keyword_extractor.py b/query/keyword_extractor.py index 082c8a6..b435aec 100644 --- a/query/keyword_extractor.py +++ b/query/keyword_extractor.py @@ -18,6 +18,12 @@ import hanlp # type: ignore # Aligns with ``rewritten_query`` / ES ``base_query`` (not a language code). KEYWORDS_QUERY_BASE_KEY = "base" +# | 场景 | 推荐模型 | +# | :--------- | :------------------------------------------- | +# | 纯中文 + 最高精度 | CTB9_TOK_ELECTRA_BASE_CRF 或 MSR_TOK_ELECTRA_BASE_CRF | +# | 纯中文 + 速度优先 | FINE_ELECTRA_SMALL_ZH(细粒度)或 COARSE_ELECTRA_SMALL_ZH(粗粒度) | +# | **中英文混合** | `UD_TOK_MMINILMV2L6` 或 `UD_TOK_MMINILMV2L12`(二者仅 Transformer 编码器层数不同:6 层 / 12 层) | + class KeywordExtractor: """基于 HanLP 的名词关键词提取器(与分词位置对齐,非连续名词间插入空格)。""" @@ -31,7 +37,7 @@ class KeywordExtractor: if tokenizer is not None: self.tok = tokenizer else: - self.tok = hanlp.load(hanlp.pretrained.tok.CTB9_TOK_ELECTRA_BASE_CRF) + self.tok = hanlp.load(hanlp.pretrained.tok.UD_TOK_MMINILMV2L6) self.tok.config.output_spans = True self.pos_tag = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ELECTRA_SMALL) self.ignore_keywords = frozenset(ignore_keywords or ["玩具"]) -- libgit2 0.21.2