Commit 926e1e96779762c1e00cb32c265efc4457a3c22a
1 parent
418b6a4a
分词优化
Showing
1 changed file
with
7 additions
and
1 deletion
Show diff stats
query/keyword_extractor.py
| ... | ... | @@ -18,6 +18,12 @@ import hanlp # type: ignore |
| 18 | 18 | # Aligns with ``rewritten_query`` / ES ``base_query`` (not a language code). |
| 19 | 19 | KEYWORDS_QUERY_BASE_KEY = "base" |
| 20 | 20 | |
| 21 | +# | 场景 | 推荐模型 | | |
| 22 | +# | :--------- | :------------------------------------------- | | |
| 23 | +# | 纯中文 + 最高精度 | CTB9_TOK_ELECTRA_BASE_CRF 或 MSR_TOK_ELECTRA_BASE_CRF | | |
| 24 | +# | 纯中文 + 速度优先 | FINE_ELECTRA_SMALL_ZH(细粒度)或 COARSE_ELECTRA_SMALL_ZH(粗粒度) | | |
| 25 | +# | **中英文混合** | `UD_TOK_MMINILMV2L6` 或 `UD_TOK_MMINILMV2L12`(两者仅 Transformer 编码器层数不同:6 层 / 12 层)| |
| 26 | + | |
| 21 | 27 | |
| 22 | 28 | class KeywordExtractor: |
| 23 | 29 | """基于 HanLP 的名词关键词提取器(与分词位置对齐,非连续名词间插入空格)。""" |
| ... | ... | @@ -31,7 +37,7 @@ class KeywordExtractor: |
| 31 | 37 | if tokenizer is not None: |
| 32 | 38 | self.tok = tokenizer |
| 33 | 39 | else: |
| 34 | - self.tok = hanlp.load(hanlp.pretrained.tok.CTB9_TOK_ELECTRA_BASE_CRF) | |
| 40 | + self.tok = hanlp.load(hanlp.pretrained.tok.UD_TOK_MMINILMV2L6) | |
| 35 | 41 | self.tok.config.output_spans = True |
| 36 | 42 | self.pos_tag = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ELECTRA_SMALL) |
| 37 | 43 | self.ignore_keywords = frozenset(ignore_keywords or ["玩具"]) | ... | ... |