ceaf6d03
tangwang
召回限定:must条件补充主干词命...
|
1
2
3
4
5
6
7
8
9
10
11
12
13
|
"""
HanLP-based noun keyword string for lexical constraints (token POS starts with N, length >= 2).
``ParsedQuery.keywords_queries`` uses the same key layout as text variants:
``KEYWORDS_QUERY_BASE_KEY`` for the rewritten source query, and ISO-like language
codes for each ``ParsedQuery.translations`` entry (non-empty extractions only).
"""
from __future__ import annotations
import logging
from typing import Any, Dict, List, Optional
|
45b39796
tangwang
qp性能优化
|
14
15
16
|
from .english_keyword_extractor import EnglishKeywordExtractor
from .tokenization import QueryTextAnalysisCache
|
ceaf6d03
tangwang
召回限定:must条件补充主干词命...
|
17
18
19
20
21
22
23
|
logger = logging.getLogger(__name__)
import hanlp # type: ignore
# Aligns with ``rewritten_query`` / ES ``base_query`` (not a language code).
KEYWORDS_QUERY_BASE_KEY = "base"
|
926e1e96
tangwang
分词优化
|
24
25
26
|
# | 场景 | 推荐模型 |
# | :--------- | :------------------------------------------- |
# | 纯中文 + 最高精度 | CTB9_TOK_ELECTRA_BASE_CRF 或 MSR_TOK_ELECTRA_BASE_CRF |
|
45b39796
tangwang
qp性能优化
|
27
|
# | 纯中文 + 速度优先 | FINE_ELECTRA_SMALL_ZH (细粒度)或 COARSE_ELECTRA_SMALL_ZH (粗粒度) |
|
926e1e96
tangwang
分词优化
|
28
29
|
# | **中英文混合** | `UD_TOK_MMINILMV2L6` 或 `UD_TOK_MMINILMV2L12` ( Transformer 编码器的层数不同)|
|
ceaf6d03
tangwang
召回限定:must条件补充主干词命...
|
30
31
32
33
34
35
36
37
38
|
class KeywordExtractor:
"""基于 HanLP 的名词关键词提取器(与分词位置对齐,非连续名词间插入空格)。"""
def __init__(
self,
tokenizer: Optional[Any] = None,
*,
ignore_keywords: Optional[List[str]] = None,
|
45b39796
tangwang
qp性能优化
|
39
|
english_extractor: Optional[EnglishKeywordExtractor] = None,
|
ceaf6d03
tangwang
召回限定:must条件补充主干词命...
|
40
41
42
43
|
):
if tokenizer is not None:
self.tok = tokenizer
else:
|
45b39796
tangwang
qp性能优化
|
44
|
self.tok = hanlp.load(hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH)
|
ceaf6d03
tangwang
召回限定:must条件补充主干词命...
|
45
46
47
|
self.tok.config.output_spans = True
self.pos_tag = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ELECTRA_SMALL)
self.ignore_keywords = frozenset(ignore_keywords or ["玩具"])
|
45b39796
tangwang
qp性能优化
|
48
|
self.english_extractor = english_extractor or EnglishKeywordExtractor()
|
ceaf6d03
tangwang
召回限定:must条件补充主干词命...
|
49
|
|
45b39796
tangwang
qp性能优化
|
50
51
52
53
54
55
56
|
def extract_keywords(
self,
query: str,
*,
language_hint: Optional[str] = None,
tokenizer_result: Optional[Any] = None,
) -> str:
|
ceaf6d03
tangwang
召回限定:must条件补充主干词命...
|
57
58
59
60
61
62
|
"""
从查询中提取关键词(名词,长度 ≥ 2),以空格分隔非连续片段。
"""
query = (query or "").strip()
if not query:
return ""
|
45b39796
tangwang
qp性能优化
|
63
64
65
66
67
68
69
70
|
normalized_language = str(language_hint or "").strip().lower()
if normalized_language == "en":
return self.english_extractor.extract_keywords(query)
if normalized_language and normalized_language != "zh":
return ""
tok_result_with_position = (
tokenizer_result if tokenizer_result is not None else self.tok(query)
)
|
ceaf6d03
tangwang
召回限定:must条件补充主干词命...
|
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
|
tok_result = [x[0] for x in tok_result_with_position]
if not tok_result:
return ""
pos_tags = self.pos_tag(tok_result)
pos_tag_result = list(zip(tok_result, pos_tags))
keywords: List[str] = []
last_end_pos = 0
for (word, postag), (_, start_pos, end_pos) in zip(pos_tag_result, tok_result_with_position):
if len(word) >= 2 and str(postag).startswith("N"):
if word in self.ignore_keywords:
continue
if start_pos != last_end_pos and keywords:
keywords.append(" ")
keywords.append(word)
last_end_pos = end_pos
return "".join(keywords).strip()
def collect_keywords_queries(
extractor: KeywordExtractor,
rewritten_query: str,
translations: Dict[str, str],
|
45b39796
tangwang
qp性能优化
|
93
94
95
96
|
*,
source_language: Optional[str] = None,
text_analysis_cache: Optional[QueryTextAnalysisCache] = None,
base_keywords_query: Optional[str] = None,
|
ceaf6d03
tangwang
召回限定:must条件补充主干词命...
|
97
98
99
100
101
102
103
|
) -> Dict[str, str]:
"""
Build the keyword map for all lexical variants (base + translations).
Omits entries when extraction yields an empty string.
"""
out: Dict[str, str] = {}
|
45b39796
tangwang
qp性能优化
|
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
|
base_kw = base_keywords_query
if base_kw is None:
base_kw = extractor.extract_keywords(
rewritten_query,
language_hint=source_language or (
text_analysis_cache.get_language_hint(rewritten_query)
if text_analysis_cache is not None
else None
),
tokenizer_result=(
text_analysis_cache.get_tokenizer_result(rewritten_query)
if text_analysis_cache is not None
else None
),
)
|
ceaf6d03
tangwang
召回限定:must条件补充主干词命...
|
119
120
121
122
123
124
|
if base_kw:
out[KEYWORDS_QUERY_BASE_KEY] = base_kw
for lang, text in translations.items():
lang_key = str(lang or "").strip().lower()
if not lang_key or not (text or "").strip():
continue
|
45b39796
tangwang
qp性能优化
|
125
126
127
128
129
130
131
132
133
134
135
136
137
|
kw = extractor.extract_keywords(
text,
language_hint=lang_key or (
text_analysis_cache.get_language_hint(text)
if text_analysis_cache is not None
else None
),
tokenizer_result=(
text_analysis_cache.get_tokenizer_result(text)
if text_analysis_cache is not None
else None
),
)
|
ceaf6d03
tangwang
召回限定:must条件补充主干词命...
|
138
139
140
|
if kw:
out[lang_key] = kw
return out
|