Commit 926e1e96779762c1e00cb32c265efc4457a3c22a

Authored by tangwang
1 parent 418b6a4a

分词优化

Showing 1 changed file with 7 additions and 1 deletion   Show diff stats
query/keyword_extractor.py
@@ -18,6 +18,12 @@ import hanlp # type: ignore @@ -18,6 +18,12 @@ import hanlp # type: ignore
18 # Aligns with ``rewritten_query`` / ES ``base_query`` (not a language code). 18 # Aligns with ``rewritten_query`` / ES ``base_query`` (not a language code).
19 KEYWORDS_QUERY_BASE_KEY = "base" 19 KEYWORDS_QUERY_BASE_KEY = "base"
20 20
  21 +# | 场景 | 推荐模型 |
  22 +# | :--------- | :------------------------------------------- |
  23 +# | 纯中文 + 最高精度 | CTB9_TOK_ELECTRA_BASE_CRF 或 MSR_TOK_ELECTRA_BASE_CRF |
  24 +# | 纯中文 + 速度优先 | FINE_ELECTRA_SMALL_ZH(细粒度)或 COARSE_ELECTRA_SMALL_ZH(粗粒度) |
  25 +# | **中英文混合** | `UD_TOK_MMINILMV2L6` 或 `UD_TOK_MMINILMV2L12`(二者仅 Transformer 编码器层数不同:6 层 / 12 层) |
  26 +
21 27
22 class KeywordExtractor: 28 class KeywordExtractor:
23 """基于 HanLP 的名词关键词提取器(与分词位置对齐,非连续名词间插入空格)。""" 29 """基于 HanLP 的名词关键词提取器(与分词位置对齐,非连续名词间插入空格)。"""
@@ -31,7 +37,7 @@ class KeywordExtractor: @@ -31,7 +37,7 @@ class KeywordExtractor:
31 if tokenizer is not None: 37 if tokenizer is not None:
32 self.tok = tokenizer 38 self.tok = tokenizer
33 else: 39 else:
34 - self.tok = hanlp.load(hanlp.pretrained.tok.CTB9_TOK_ELECTRA_BASE_CRF) 40 + self.tok = hanlp.load(hanlp.pretrained.tok.UD_TOK_MMINILMV2L6)
35 self.tok.config.output_spans = True 41 self.tok.config.output_spans = True
36 self.pos_tag = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ELECTRA_SMALL) 42 self.pos_tag = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ELECTRA_SMALL)
37 self.ignore_keywords = frozenset(ignore_keywords or ["玩具"]) 43 self.ignore_keywords = frozenset(ignore_keywords or ["玩具"])