Commit 926e1e96779762c1e00cb32c265efc4457a3c22a
1 parent
418b6a4a
分词优化
Showing
1 changed file
with
7 additions
and
1 deletions
Show diff stats
query/keyword_extractor.py
| @@ -18,6 +18,12 @@ import hanlp # type: ignore | @@ -18,6 +18,12 @@ import hanlp # type: ignore | ||
| 18 | # Aligns with ``rewritten_query`` / ES ``base_query`` (not a language code). | 18 | # Aligns with ``rewritten_query`` / ES ``base_query`` (not a language code). |
| 19 | KEYWORDS_QUERY_BASE_KEY = "base" | 19 | KEYWORDS_QUERY_BASE_KEY = "base" |
| 20 | 20 | ||
| 21 | +# | 场景 | 推荐模型 | | ||
| 22 | +# | :--------- | :------------------------------------------- | | ||
| 23 | +# | 纯中文 + 最高精度 | CTB9_TOK_ELECTRA_BASE_CRF 或 MSR_TOK_ELECTRA_BASE_CRF | | ||
| 24 | +# | 纯中文 + 速度优先 | FINE_ELECTRA_SMALL_ZH(细粒度)或 COARSE_ELECTRA_SMALL_ZH(粗粒度) | | ||
| 25 | +# | **中英文混合** | `UD_TOK_MMINILMV2L6` 或 `UD_TOK_MMINILMV2L12`(二者仅 Transformer 编码器层数不同:6 层 / 12 层) | | ||
| 26 | + | ||
| 21 | 27 | ||
| 22 | class KeywordExtractor: | 28 | class KeywordExtractor: |
| 23 | """基于 HanLP 的名词关键词提取器(与分词位置对齐,非连续名词间插入空格)。""" | 29 | """基于 HanLP 的名词关键词提取器(与分词位置对齐,非连续名词间插入空格)。""" |
| @@ -31,7 +37,7 @@ class KeywordExtractor: | @@ -31,7 +37,7 @@ class KeywordExtractor: | ||
| 31 | if tokenizer is not None: | 37 | if tokenizer is not None: |
| 32 | self.tok = tokenizer | 38 | self.tok = tokenizer |
| 33 | else: | 39 | else: |
| 34 | - self.tok = hanlp.load(hanlp.pretrained.tok.CTB9_TOK_ELECTRA_BASE_CRF) | 40 | + self.tok = hanlp.load(hanlp.pretrained.tok.UD_TOK_MMINILMV2L6) |
| 35 | self.tok.config.output_spans = True | 41 | self.tok.config.output_spans = True |
| 36 | self.pos_tag = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ELECTRA_SMALL) | 42 | self.pos_tag = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ELECTRA_SMALL) |
| 37 | self.ignore_keywords = frozenset(ignore_keywords or ["玩具"]) | 43 | self.ignore_keywords = frozenset(ignore_keywords or ["玩具"]) |