Commit 45b397964fb80661b13dbc49c8fd03990123ea41
1 parent
926e1e96
qp性能优化
Showing
11 changed files
with
760 additions
and
76 deletions
Show diff stats
config/config.yaml
| ... | ... | @@ -116,8 +116,8 @@ query_config: |
| 116 | 116 | |
| 117 | 117 | # 查询解析阶段:翻译与 query 向量并发执行,共用同一等待预算(毫秒)。 |
| 118 | 118 | # 检测语言已在租户 index_languages 内:较短;不在索引语言内:较长(翻译对召回更关键)。 |
| 119 | - translation_embedding_wait_budget_ms_source_in_index: 500 # 80 | |
| 120 | - translation_embedding_wait_budget_ms_source_not_in_index: 700 #200 | |
| 119 | + translation_embedding_wait_budget_ms_source_in_index: 200 # 80 | |
| 120 | + translation_embedding_wait_budget_ms_source_not_in_index: 300 #200 | |
| 121 | 121 | |
| 122 | 122 | style_intent: |
| 123 | 123 | enabled: true | ... | ... |
| ... | ... | @@ -0,0 +1,256 @@ |
| 1 | +""" | |
| 2 | +Lightweight English core-term extraction for lexical keyword constraints. | |
| 3 | +""" | |
| 4 | + | |
| 5 | +from __future__ import annotations | |
| 6 | + | |
| 7 | +import logging | |
| 8 | +from typing import List, Optional, Sequence, Set | |
| 9 | + | |
| 10 | +from .tokenization import normalize_query_text, simple_tokenize_query | |
| 11 | + | |
| 12 | +logger = logging.getLogger(__name__) | |
| 13 | + | |
# Generic evaluative adjectives; merged with spaCy's stop words when core terms are filtered.
_WEAK_BOOST_ADJS = frozenset(
    {
        "best",
        "good",
        "great",
        "new",
        "free",
        "cheap",
        "top",
        "fine",
        "real",
    }
)

# Dependency labels of function words; a token carrying one of these is never emitted as a core term.
_FUNCTIONAL_DEP = frozenset(
    {
        "det",
        "aux",
        "auxpass",
        "prep",
        "mark",
        "expl",
        "cc",
        "punct",
        "case",
    }
)

# Audience words; they are dropped from the extracted terms and from projected query tokens.
_DEMOGRAPHIC_NOUNS = frozenset(
    {
        "women",
        "woman",
        "men",
        "man",
        "kids",
        "kid",
        "boys",
        "boy",
        "girls",
        "girl",
        "baby",
        "babies",
        "toddler",
        "adult",
        "adults",
    }
)

# Preposition lemmas whose noun objects are demoted (not picked as core terms), e.g. "under", "between".
_PRICE_PREP_LEMMAS = frozenset({"under", "over", "below", "above", "within", "between", "near"})
# Measurement nouns; when one of these is the parse root, its noun subject is taken instead of the root.
_DIMENSION_ROOTS = frozenset({"size", "width", "length", "height", "weight"})
| 64 | + | |
| 65 | + | |
def _dedupe_preserve(seq: Sequence[str]) -> List[str]:
    """Normalize every item and drop blanks and repeats, keeping first-seen order.

    Args:
        seq: Raw strings; each one is passed through ``normalize_query_text``.

    Returns:
        The normalized strings, each appearing once, in order of first occurrence.
    """
    unique: List[str] = []
    already_kept: Set[str] = set()
    for raw in seq:
        key = normalize_query_text(raw)
        if key and key not in already_kept:
            already_kept.add(key)
            unique.append(key)
    return unique
| 76 | + | |
| 77 | + | |
| 78 | +def _lemma_lower(token) -> str: | |
| 79 | + return ((token.lemma_ or token.text) or "").lower().strip() | |
| 80 | + | |
| 81 | + | |
| 82 | +def _surface_lower(token) -> str: | |
| 83 | + return (token.text or "").lower().strip() | |
| 84 | + | |
| 85 | + | |
def _project_terms_to_query_tokens(query: str, terms: Sequence[str]) -> List[str]:
    """Map extracted terms back onto the query's own simple tokens.

    Each term is normalized; terms shorter than two characters or listed in
    ``_DEMOGRAPHIC_NOUNS`` are dropped. A surviving term is replaced by the first
    query token that contains it as a substring (terms of three or more characters
    only, demographic tokens excluded); an exact token match, or no match at all,
    keeps the normalized term itself.

    Args:
        query: The query text, tokenized with ``simple_tokenize_query``.
        terms: Candidate core terms.

    Returns:
        The projected terms, de-duplicated in order.
    """
    query_tokens = _dedupe_preserve(simple_tokenize_query(query))
    result: List[str] = []
    for term in terms:
        needle = normalize_query_text(term)
        if len(needle) < 2 or needle in _DEMOGRAPHIC_NOUNS:
            continue
        if needle in query_tokens:
            # An exact token match is, by definition, the normalized term itself.
            result.append(needle)
            continue
        containing = None
        if len(needle) >= 3:
            for candidate in query_tokens:
                if needle in candidate and candidate not in _DEMOGRAPHIC_NOUNS:
                    containing = candidate
                    break
        result.append(needle if containing is None else containing)
    return _dedupe_preserve(result)
| 110 | + | |
| 111 | + | |
class EnglishKeywordExtractor:
    """Extracts a small set of English core product terms with spaCy.

    When no spaCy pipeline is available, or spaCy processing raises, extraction falls
    back to a purely token-based heuristic (``_fallback_keywords``).
    """

    def __init__(self, nlp: Optional[object] = None) -> None:
        """Use the supplied pipeline, or try to load the default spaCy model.

        Args:
            nlp: A ready pipeline object; when ``None`` the model is loaded via ``_load_nlp``.
        """
        self._nlp = nlp if nlp is not None else self._load_nlp()

    @staticmethod
    def _load_nlp() -> Optional[object]:
        """Load ``en_core_web_sm`` with the ``ner`` and ``textcat`` components disabled.

        Returns:
            The loaded pipeline, or ``None`` (after logging a warning) when importing
            spaCy or loading the model fails.
        """
        try:
            import spacy

            return spacy.load("en_core_web_sm", disable=["ner", "textcat"])
        except Exception as exc:
            # Best-effort: a load failure leaves ``_nlp`` unset so callers use the fallback path.
            logger.warning("English keyword extractor disabled; failed to load spaCy model: %s", exc)
            return None

    def extract_keywords(self, query: str) -> str:
        """Return the space-joined core terms of ``query``.

        Args:
            query: Raw query text; ``None`` and blank input yield "".

        Returns:
            The spaCy-based result when a pipeline is available and succeeds,
            otherwise the result of ``_fallback_keywords``.
        """
        text = str(query or "").strip()
        if not text:
            return ""
        if self._nlp is None:
            return self._fallback_keywords(text)
        try:
            return self._extract_keywords_with_spacy(text)
        except Exception as exc:
            # Any spaCy failure degrades to the token heuristic instead of propagating.
            logger.warning("spaCy English keyword extraction failed; using fallback: %s", exc)
            return self._fallback_keywords(text)

    def _extract_keywords_with_spacy(self, query: str) -> str:
        """Select core terms from the dependency parse of ``query``.

        Several passes over the parsed tokens collect candidate surface forms into one
        set; the candidates are then filtered, projected onto the query's simple tokens
        and truncated to at most three terms. When nothing survives, the token-based
        fallback is used.
        """
        doc = self._nlp(query)
        # Lower-cased surface forms selected by any of the passes below.
        intersection: Set[str] = set()
        stops = self._nlp.Defaults.stop_words | _WEAK_BOOST_ADJS
        # Token indices of noun objects of "for" / price-style prepositions; these are
        # skipped by the direct-object and root passes.
        pobj_heads_to_demote: Set[int] = set()

        # Noun objects of the preposition "for".
        for token in doc:
            if token.dep_ == "prep" and token.text.lower() == "for":
                for child in token.children:
                    if child.dep_ == "pobj" and child.pos_ in ("NOUN", "PROPN"):
                        pobj_heads_to_demote.add(child.i)

        # Noun objects of prepositions whose lemma is in _PRICE_PREP_LEMMAS.
        for token in doc:
            if token.dep_ != "prep" or _lemma_lower(token) not in _PRICE_PREP_LEMMAS:
                continue
            for child in token.children:
                if child.dep_ == "pobj" and child.pos_ in ("NOUN", "PROPN"):
                    pobj_heads_to_demote.add(child.i)

        # Direct objects that are nouns and were not demoted.
        for token in doc:
            if token.dep_ == "dobj" and token.pos_ in ("NOUN", "PROPN") and token.i not in pobj_heads_to_demote:
                intersection.add(_surface_lower(token))

        # Noun subjects whose head is an auxiliary acting as the parse root.
        for token in doc:
            if token.dep_ == "nsubj" and token.pos_ in ("NOUN", "PROPN"):
                head = token.head
                if head.pos_ == "AUX" and head.dep_ == "ROOT":
                    intersection.add(_surface_lower(token))

        # Roots tagged INTJ/PROPN, plus every proper noun except one that is a compound
        # modifier of a demographic noun.
        for token in doc:
            if token.dep_ == "ROOT" and token.pos_ in ("INTJ", "PROPN"):
                intersection.add(_surface_lower(token))
            if token.pos_ == "PROPN":
                if token.dep_ == "compound" and _lemma_lower(token.head) in _DEMOGRAPHIC_NOUNS:
                    continue
                intersection.add(_surface_lower(token))

        # Noun roots: dimension and demographic roots contribute a child instead of
        # themselves; demoted roots are skipped; any other noun root is taken as is.
        for token in doc:
            if token.dep_ == "ROOT" and token.pos_ in ("NOUN", "PROPN"):
                if _lemma_lower(token) in _DIMENSION_ROOTS:
                    for child in token.children:
                        if child.dep_ == "nsubj" and child.pos_ in ("NOUN", "PROPN"):
                            intersection.add(_surface_lower(child))
                    continue
                if _lemma_lower(token) in _DEMOGRAPHIC_NOUNS:
                    for child in token.children:
                        if child.dep_ == "compound" and child.pos_ == "NOUN":
                            intersection.add(_surface_lower(child))
                    continue
                if token.i in pobj_heads_to_demote:
                    continue
                intersection.add(_surface_lower(token))

        # Noun pobj children attached directly to the root. Only INTJ roots contribute:
        # the first pobj is always taken, later ones unless they are demographic nouns.
        for token in doc:
            if token.dep_ != "ROOT" or token.pos_ not in ("INTJ", "VERB", "NOUN"):
                continue
            pobjs = sorted(
                [child for child in token.children if child.dep_ == "pobj" and child.pos_ in ("NOUN", "PROPN")],
                key=lambda item: item.i,
            )
            if len(pobjs) >= 2 and token.pos_ == "INTJ":
                intersection.add(_surface_lower(pobjs[0]))
                for extra in pobjs[1:]:
                    if _lemma_lower(extra) not in _DEMOGRAPHIC_NOUNS:
                        intersection.add(_surface_lower(extra))
            elif len(pobjs) == 1 and token.pos_ == "INTJ":
                intersection.add(_surface_lower(pobjs[0]))

        # Nothing selected so far: fall back to noun-chunk heads (and proper nouns inside
        # the chunk), skipping chunks governed by "for" or a price-style preposition.
        if not intersection:
            for chunk in doc.noun_chunks:
                head = chunk.root
                if head.pos_ not in ("NOUN", "PROPN"):
                    continue
                if head.dep_ == "pobj" and head.head.dep_ == "prep":
                    prep = head.head
                    if _lemma_lower(prep) in _PRICE_PREP_LEMMAS or prep.text.lower() == "for":
                        continue
                head_text = _surface_lower(head)
                if head_text:
                    intersection.add(head_text)
                for token in chunk:
                    if token == head or token.pos_ != "PROPN":
                        continue
                    intersection.add(_surface_lower(token))

        # Emit the selected tokens in document order, dropping stop words, demographic
        # nouns, function-word dependencies and single-character tokens.
        core_terms = _dedupe_preserve(
            token.text.lower()
            for token in doc
            if _surface_lower(token) in intersection
            and _surface_lower(token) not in stops
            and _surface_lower(token) not in _DEMOGRAPHIC_NOUNS
            and token.dep_ not in _FUNCTIONAL_DEP
            and len(_surface_lower(token)) >= 2
        )
        projected_terms = _project_terms_to_query_tokens(query, core_terms)
        if projected_terms:
            # At most three terms are returned.
            return " ".join(projected_terms[:3])
        return self._fallback_keywords(query)

    def _fallback_keywords(self, query: str) -> str:
        """Return the last one or two normalized tokens of ``query`` without a parser.

        Demographic nouns are removed first, unless that would leave nothing, in which
        case all tokens are kept. Returns "" when the query yields no tokens.
        """
        tokens = [
            normalize_query_text(token)
            for token in simple_tokenize_query(query)
            if normalize_query_text(token)
        ]
        if not tokens:
            return ""

        filtered = [token for token in tokens if token not in _DEMOGRAPHIC_NOUNS]
        if not filtered:
            filtered = tokens

        # Keep the right-most likely product head plus one close modifier.
        head = filtered[-1]
        if len(filtered) >= 2:
            return " ".join(filtered[-2:])
        return head
query/keyword_extractor.py
| ... | ... | @@ -11,6 +11,9 @@ from __future__ import annotations |
| 11 | 11 | import logging |
| 12 | 12 | from typing import Any, Dict, List, Optional |
| 13 | 13 | |
| 14 | +from .english_keyword_extractor import EnglishKeywordExtractor | |
| 15 | +from .tokenization import QueryTextAnalysisCache | |
| 16 | + | |
| 14 | 17 | logger = logging.getLogger(__name__) |
| 15 | 18 | |
| 16 | 19 | import hanlp # type: ignore |
| ... | ... | @@ -21,7 +24,7 @@ KEYWORDS_QUERY_BASE_KEY = "base" |
| 21 | 24 | # | 场景 | 推荐模型 | |
| 22 | 25 | # | :--------- | :------------------------------------------- | |
| 23 | 26 | # | 纯中文 + 最高精度 | CTB9_TOK_ELECTRA_BASE_CRF 或 MSR_TOK_ELECTRA_BASE_CRF | |
| 24 | -# | 纯中文 + 速度优先 | FINE_ELECTRA_SMALL_ZH(细粒度)或 COARSE_ELECTRA_SMALL_ZH(粗粒度) | | |
| 27 | +# | 纯中文 + 速度优先 | FINE_ELECTRA_SMALL_ZH (细粒度)或 COARSE_ELECTRA_SMALL_ZH (粗粒度) | | |
| 25 | 28 | # | **中英文混合** | `UD_TOK_MMINILMV2L6` 或 `UD_TOK_MMINILMV2L12` ( Transformer 编码器的层数不同)| |
| 26 | 29 | |
| 27 | 30 | |
| ... | ... | @@ -33,23 +36,38 @@ class KeywordExtractor: |
| 33 | 36 | tokenizer: Optional[Any] = None, |
| 34 | 37 | *, |
| 35 | 38 | ignore_keywords: Optional[List[str]] = None, |
| 39 | + english_extractor: Optional[EnglishKeywordExtractor] = None, | |
| 36 | 40 | ): |
| 37 | 41 | if tokenizer is not None: |
| 38 | 42 | self.tok = tokenizer |
| 39 | 43 | else: |
| 40 | - self.tok = hanlp.load(hanlp.pretrained.tok.UD_TOK_MMINILMV2L6) | |
| 44 | + self.tok = hanlp.load(hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH) | |
| 41 | 45 | self.tok.config.output_spans = True |
| 42 | 46 | self.pos_tag = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ELECTRA_SMALL) |
| 43 | 47 | self.ignore_keywords = frozenset(ignore_keywords or ["玩具"]) |
| 48 | + self.english_extractor = english_extractor or EnglishKeywordExtractor() | |
| 44 | 49 | |
| 45 | - def extract_keywords(self, query: str) -> str: | |
| 50 | + def extract_keywords( | |
| 51 | + self, | |
| 52 | + query: str, | |
| 53 | + *, | |
| 54 | + language_hint: Optional[str] = None, | |
| 55 | + tokenizer_result: Optional[Any] = None, | |
| 56 | + ) -> str: | |
| 46 | 57 | """ |
| 47 | 58 | 从查询中提取关键词(名词,长度 ≥ 2),以空格分隔非连续片段。 |
| 48 | 59 | """ |
| 49 | 60 | query = (query or "").strip() |
| 50 | 61 | if not query: |
| 51 | 62 | return "" |
| 52 | - tok_result_with_position = self.tok(query) | |
| 63 | + normalized_language = str(language_hint or "").strip().lower() | |
| 64 | + if normalized_language == "en": | |
| 65 | + return self.english_extractor.extract_keywords(query) | |
| 66 | + if normalized_language and normalized_language != "zh": | |
| 67 | + return "" | |
| 68 | + tok_result_with_position = ( | |
| 69 | + tokenizer_result if tokenizer_result is not None else self.tok(query) | |
| 70 | + ) | |
| 53 | 71 | tok_result = [x[0] for x in tok_result_with_position] |
| 54 | 72 | if not tok_result: |
| 55 | 73 | return "" |
| ... | ... | @@ -72,6 +90,10 @@ def collect_keywords_queries( |
| 72 | 90 | extractor: KeywordExtractor, |
| 73 | 91 | rewritten_query: str, |
| 74 | 92 | translations: Dict[str, str], |
| 93 | + *, | |
| 94 | + source_language: Optional[str] = None, | |
| 95 | + text_analysis_cache: Optional[QueryTextAnalysisCache] = None, | |
| 96 | + base_keywords_query: Optional[str] = None, | |
| 75 | 97 | ) -> Dict[str, str]: |
| 76 | 98 | """ |
| 77 | 99 | Build the keyword map for all lexical variants (base + translations). |
| ... | ... | @@ -79,14 +101,40 @@ def collect_keywords_queries( |
| 79 | 101 | Omits entries when extraction yields an empty string. |
| 80 | 102 | """ |
| 81 | 103 | out: Dict[str, str] = {} |
| 82 | - base_kw = extractor.extract_keywords(rewritten_query) | |
| 104 | + base_kw = base_keywords_query | |
| 105 | + if base_kw is None: | |
| 106 | + base_kw = extractor.extract_keywords( | |
| 107 | + rewritten_query, | |
| 108 | + language_hint=source_language or ( | |
| 109 | + text_analysis_cache.get_language_hint(rewritten_query) | |
| 110 | + if text_analysis_cache is not None | |
| 111 | + else None | |
| 112 | + ), | |
| 113 | + tokenizer_result=( | |
| 114 | + text_analysis_cache.get_tokenizer_result(rewritten_query) | |
| 115 | + if text_analysis_cache is not None | |
| 116 | + else None | |
| 117 | + ), | |
| 118 | + ) | |
| 83 | 119 | if base_kw: |
| 84 | 120 | out[KEYWORDS_QUERY_BASE_KEY] = base_kw |
| 85 | 121 | for lang, text in translations.items(): |
| 86 | 122 | lang_key = str(lang or "").strip().lower() |
| 87 | 123 | if not lang_key or not (text or "").strip(): |
| 88 | 124 | continue |
| 89 | - kw = extractor.extract_keywords(text) | |
| 125 | + kw = extractor.extract_keywords( | |
| 126 | + text, | |
| 127 | + language_hint=lang_key or ( | |
| 128 | + text_analysis_cache.get_language_hint(text) | |
| 129 | + if text_analysis_cache is not None | |
| 130 | + else None | |
| 131 | + ), | |
| 132 | + tokenizer_result=( | |
| 133 | + text_analysis_cache.get_tokenizer_result(text) | |
| 134 | + if text_analysis_cache is not None | |
| 135 | + else None | |
| 136 | + ), | |
| 137 | + ) | |
| 90 | 138 | if kw: |
| 91 | 139 | out[lang_key] = kw |
| 92 | 140 | return out | ... | ... |
query/product_title_exclusion.py
| ... | ... | @@ -7,7 +7,7 @@ from __future__ import annotations |
| 7 | 7 | from dataclasses import dataclass, field |
| 8 | 8 | from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Set, Tuple |
| 9 | 9 | |
| 10 | -from .tokenization import TokenizedText, normalize_query_text, tokenize_text | |
| 10 | +from .tokenization import QueryTextAnalysisCache, TokenizedText, normalize_query_text, tokenize_text | |
| 11 | 11 | |
| 12 | 12 | |
| 13 | 13 | def _dedupe_terms(terms: Iterable[str]) -> List[str]: |
| ... | ... | @@ -158,9 +158,27 @@ class ProductTitleExclusionDetector: |
| 158 | 158 | self.registry = registry |
| 159 | 159 | self.tokenizer = tokenizer |
| 160 | 160 | |
| 161 | + def _tokenize_text( | |
| 162 | + self, | |
| 163 | + text: str, | |
| 164 | + *, | |
| 165 | + analysis_cache: Optional[QueryTextAnalysisCache] = None, | |
| 166 | + ) -> TokenizedText: | |
| 167 | + if analysis_cache is not None: | |
| 168 | + return analysis_cache.get_tokenized_text( | |
| 169 | + text, | |
| 170 | + max_ngram=self.registry.max_term_ngram, | |
| 171 | + ) | |
| 172 | + return tokenize_text( | |
| 173 | + text, | |
| 174 | + tokenizer=self.tokenizer, | |
| 175 | + max_ngram=self.registry.max_term_ngram, | |
| 176 | + ) | |
| 177 | + | |
| 161 | 178 | def _build_query_variants(self, parsed_query: Any) -> Tuple[TokenizedText, ...]: |
| 162 | 179 | seen = set() |
| 163 | 180 | variants: List[TokenizedText] = [] |
| 181 | + analysis_cache = getattr(parsed_query, "_text_analysis_cache", None) | |
| 164 | 182 | texts = [ |
| 165 | 183 | getattr(parsed_query, "original_query", None), |
| 166 | 184 | getattr(parsed_query, "query_normalized", None), |
| ... | ... | @@ -180,10 +198,9 @@ class ProductTitleExclusionDetector: |
| 180 | 198 | continue |
| 181 | 199 | seen.add(normalized) |
| 182 | 200 | variants.append( |
| 183 | - tokenize_text( | |
| 201 | + self._tokenize_text( | |
| 184 | 202 | text, |
| 185 | - tokenizer=self.tokenizer, | |
| 186 | - max_ngram=self.registry.max_term_ngram, | |
| 203 | + analysis_cache=analysis_cache, | |
| 187 | 204 | ) |
| 188 | 205 | ) |
| 189 | 206 | ... | ... |
query/query_parser.py
| ... | ... | @@ -27,7 +27,7 @@ from .product_title_exclusion import ( |
| 27 | 27 | ) |
| 28 | 28 | from .query_rewriter import QueryRewriter, QueryNormalizer |
| 29 | 29 | from .style_intent import StyleIntentDetector, StyleIntentProfile, StyleIntentRegistry |
| 30 | -from .tokenization import extract_token_strings, simple_tokenize_query | |
| 30 | +from .tokenization import QueryTextAnalysisCache, contains_han_text, extract_token_strings | |
| 31 | 31 | from .keyword_extractor import KeywordExtractor, collect_keywords_queries |
| 32 | 32 | |
| 33 | 33 | logger = logging.getLogger(__name__) |
| ... | ... | @@ -119,6 +119,7 @@ class ParsedQuery: |
| 119 | 119 | keywords_queries: Dict[str, str] = field(default_factory=dict) |
| 120 | 120 | style_intent_profile: Optional[StyleIntentProfile] = None |
| 121 | 121 | product_title_exclusion_profile: Optional[ProductTitleExclusionProfile] = None |
| 122 | + _text_analysis_cache: Optional[QueryTextAnalysisCache] = field(default=None, repr=False) | |
| 122 | 123 | |
| 123 | 124 | def text_for_rerank(self) -> str: |
| 124 | 125 | """See :func:`rerank_query_text`.""" |
| ... | ... | @@ -238,7 +239,7 @@ class QueryParser: |
| 238 | 239 | if hanlp is None: |
| 239 | 240 | raise RuntimeError("HanLP is required for QueryParser tokenization") |
| 240 | 241 | logger.info("Initializing HanLP tokenizer...") |
| 241 | - tokenizer = hanlp.load(hanlp.pretrained.tok.CTB9_TOK_ELECTRA_BASE_CRF) | |
| 242 | + tokenizer = hanlp.load(hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH) | |
| 242 | 243 | tokenizer.config.output_spans = True |
| 243 | 244 | logger.info("HanLP tokenizer initialized") |
| 244 | 245 | return tokenizer |
| ... | ... | @@ -288,6 +289,33 @@ class QueryParser: |
| 288 | 289 | def _get_query_tokens(self, query: str) -> List[str]: |
| 289 | 290 | return self._extract_tokens(self._tokenizer(query)) |
| 290 | 291 | |
| 292 | + @staticmethod | |
| 293 | + def _is_ascii_latin_query(text: str) -> bool: | |
| 294 | + candidate = str(text or "").strip() | |
| 295 | + if not candidate or contains_han_text(candidate): | |
| 296 | + return False | |
| 297 | + try: | |
| 298 | + candidate.encode("ascii") | |
| 299 | + except UnicodeEncodeError: | |
| 300 | + return False | |
| 301 | + return any(ch.isalpha() for ch in candidate) | |
| 302 | + | |
| 303 | + def _detect_query_language( | |
| 304 | + self, | |
| 305 | + query_text: str, | |
| 306 | + *, | |
| 307 | + target_languages: Optional[List[str]] = None, | |
| 308 | + ) -> str: | |
| 309 | + normalized_targets = self._normalize_language_codes(target_languages) | |
| 310 | + supported_languages = self._normalize_language_codes( | |
| 311 | + getattr(self.config.query_config, "supported_languages", None) | |
| 312 | + ) | |
| 313 | + active_languages = normalized_targets or supported_languages | |
| 314 | + if active_languages and set(active_languages).issubset({"en", "zh"}): | |
| 315 | + if self._is_ascii_latin_query(query_text): | |
| 316 | + return "en" | |
| 317 | + return self.language_detector.detect(query_text) | |
| 318 | + | |
| 291 | 319 | def parse( |
| 292 | 320 | self, |
| 293 | 321 | query: str, |
| ... | ... | @@ -332,12 +360,15 @@ class QueryParser: |
| 332 | 360 | active_logger.debug(msg) |
| 333 | 361 | |
| 334 | 362 | # Stage 1: Normalize |
| 363 | + normalize_t0 = time.perf_counter() | |
| 335 | 364 | normalized = self.normalizer.normalize(query) |
| 365 | + normalize_ms = (time.perf_counter() - normalize_t0) * 1000.0 | |
| 336 | 366 | log_debug(f"Normalization completed | '{query}' -> '{normalized}'") |
| 337 | 367 | if context: |
| 338 | 368 | context.store_intermediate_result('query_normalized', normalized) |
| 339 | 369 | |
| 340 | 370 | # Stage 2: Query rewriting |
| 371 | + rewrite_t0 = time.perf_counter() | |
| 341 | 372 | query_text = normalized |
| 342 | 373 | rewritten = normalized |
| 343 | 374 | if self.config.query_config.rewrite_dictionary: # Enable rewrite if dictionary exists |
| ... | ... | @@ -348,21 +379,26 @@ class QueryParser: |
| 348 | 379 | if context: |
| 349 | 380 | context.store_intermediate_result('rewritten_query', rewritten) |
| 350 | 381 | context.add_warning(f"Query was rewritten: {query_text}") |
| 382 | + rewrite_ms = (time.perf_counter() - rewrite_t0) * 1000.0 | |
| 383 | + | |
| 384 | + normalized_targets = self._normalize_language_codes(target_languages) | |
| 351 | 385 | |
| 352 | 386 | # Stage 3: Language detection |
| 353 | - detected_lang = self.language_detector.detect(query_text) | |
| 387 | + language_detect_t0 = time.perf_counter() | |
| 388 | + detected_lang = self._detect_query_language( | |
| 389 | + query_text, | |
| 390 | + target_languages=normalized_targets, | |
| 391 | + ) | |
| 354 | 392 | # Use default language if detection failed (None or "unknown") |
| 355 | 393 | if not detected_lang or detected_lang == "unknown": |
| 356 | 394 | detected_lang = self.config.query_config.default_language |
| 395 | + language_detect_ms = (time.perf_counter() - language_detect_t0) * 1000.0 | |
| 357 | 396 | log_info(f"Language detection | Detected language: {detected_lang}") |
| 358 | 397 | if context: |
| 359 | 398 | context.store_intermediate_result('detected_language', detected_lang) |
| 360 | - # Stage 4: Query analysis (tokenization) | |
| 361 | - query_tokens = self._get_query_tokens(query_text) | |
| 362 | - | |
| 363 | - log_debug(f"Query analysis | Query tokens: {query_tokens}") | |
| 364 | - if context: | |
| 365 | - context.store_intermediate_result('query_tokens', query_tokens) | |
| 399 | + text_analysis_cache = QueryTextAnalysisCache(tokenizer=self._tokenizer) | |
| 400 | + for text_variant in (query, normalized, query_text): | |
| 401 | + text_analysis_cache.set_language_hint(text_variant, detected_lang) | |
| 366 | 402 | |
| 367 | 403 | # Stage 5: Translation + embedding. Parser only coordinates async enrichment work; the |
| 368 | 404 | # caller decides translation targets and later search-field planning. |
| ... | ... | @@ -371,7 +407,6 @@ class QueryParser: |
| 371 | 407 | future_submit_at: Dict[Any, float] = {} |
| 372 | 408 | async_executor: Optional[ThreadPoolExecutor] = None |
| 373 | 409 | detected_norm = str(detected_lang or "").strip().lower() |
| 374 | - normalized_targets = self._normalize_language_codes(target_languages) | |
| 375 | 410 | translation_targets = [lang for lang in normalized_targets if lang != detected_norm] |
| 376 | 411 | source_language_in_index = bool(normalized_targets) and detected_norm in normalized_targets |
| 377 | 412 | |
| ... | ... | @@ -398,7 +433,9 @@ class QueryParser: |
| 398 | 433 | thread_name_prefix="query-enrichment", |
| 399 | 434 | ) |
| 400 | 435 | |
| 436 | + async_submit_ms = 0.0 | |
| 401 | 437 | try: |
| 438 | + async_submit_t0 = time.perf_counter() | |
| 402 | 439 | if async_executor is not None: |
| 403 | 440 | for lang in translation_targets: |
| 404 | 441 | model_name = self._pick_query_translation_model( |
| ... | ... | @@ -466,6 +503,7 @@ class QueryParser: |
| 466 | 503 | future = async_executor.submit(_encode_image_query_vector) |
| 467 | 504 | future_to_task[future] = ("image_embedding", None) |
| 468 | 505 | future_submit_at[future] = time.perf_counter() |
| 506 | + async_submit_ms = (time.perf_counter() - async_submit_t0) * 1000.0 | |
| 469 | 507 | except Exception as e: |
| 470 | 508 | error_msg = f"Async query enrichment submission failed | Error: {str(e)}" |
| 471 | 509 | log_info(error_msg) |
| ... | ... | @@ -477,6 +515,33 @@ class QueryParser: |
| 477 | 515 | future_to_task.clear() |
| 478 | 516 | future_submit_at.clear() |
| 479 | 517 | |
| 518 | + # Stage 4: Query analysis (tokenization) now overlaps with async enrichment work. | |
| 519 | + query_analysis_t0 = time.perf_counter() | |
| 520 | + query_tokenizer_t0 = time.perf_counter() | |
| 521 | + query_tokenizer_result = text_analysis_cache.get_tokenizer_result(query_text) | |
| 522 | + query_tokenizer_ms = (time.perf_counter() - query_tokenizer_t0) * 1000.0 | |
| 523 | + query_token_extract_t0 = time.perf_counter() | |
| 524 | + query_tokens = self._extract_tokens(query_tokenizer_result) | |
| 525 | + query_token_extract_ms = (time.perf_counter() - query_token_extract_t0) * 1000.0 | |
| 526 | + query_analysis_ms = (time.perf_counter() - query_analysis_t0) * 1000.0 | |
| 527 | + | |
| 528 | + log_debug(f"Query analysis | Query tokens: {query_tokens}") | |
| 529 | + if context: | |
| 530 | + context.store_intermediate_result('query_tokens', query_tokens) | |
| 531 | + | |
| 532 | + keywords_base_query = "" | |
| 533 | + keywords_base_ms = 0.0 | |
| 534 | + try: | |
| 535 | + keywords_base_t0 = time.perf_counter() | |
| 536 | + keywords_base_query = self._keyword_extractor.extract_keywords( | |
| 537 | + query_text, | |
| 538 | + language_hint=detected_lang, | |
| 539 | + tokenizer_result=text_analysis_cache.get_tokenizer_result(query_text), | |
| 540 | + ) | |
| 541 | + keywords_base_ms = (time.perf_counter() - keywords_base_t0) * 1000.0 | |
| 542 | + except Exception as e: | |
| 543 | + log_info(f"Base keyword extraction failed | Error: {e}") | |
| 544 | + | |
| 480 | 545 | # Wait for translation + embedding concurrently; shared budget depends on whether |
| 481 | 546 | # the detected language belongs to caller-provided target_languages. |
| 482 | 547 | qc = self.config.query_config |
| ... | ... | @@ -501,7 +566,10 @@ class QueryParser: |
| 501 | 566 | f"source_in_target_languages={source_in_target_languages}" |
| 502 | 567 | ) |
| 503 | 568 | |
| 569 | + async_wait_t0 = time.perf_counter() | |
| 504 | 570 | done, not_done = wait(list(future_to_task.keys()), timeout=budget_sec) |
| 571 | + async_wait_ms = (time.perf_counter() - async_wait_t0) * 1000.0 | |
| 572 | + async_collect_t0 = time.perf_counter() | |
| 505 | 573 | for future in done: |
| 506 | 574 | task_type, lang = future_to_task[future] |
| 507 | 575 | t0 = future_submit_at.pop(future, None) |
| ... | ... | @@ -511,6 +579,7 @@ class QueryParser: |
| 511 | 579 | if task_type == "translation": |
| 512 | 580 | if result: |
| 513 | 581 | translations[lang] = result |
| 582 | + text_analysis_cache.set_language_hint(result, lang) | |
| 514 | 583 | if context: |
| 515 | 584 | context.store_intermediate_result(f"translation_{lang}", result) |
| 516 | 585 | elif task_type == "embedding": |
| ... | ... | @@ -561,20 +630,31 @@ class QueryParser: |
| 561 | 630 | log_info(timeout_msg) |
| 562 | 631 | if context: |
| 563 | 632 | context.add_warning(timeout_msg) |
| 633 | + async_collect_ms = (time.perf_counter() - async_collect_t0) * 1000.0 | |
| 564 | 634 | |
| 565 | 635 | if async_executor: |
| 566 | 636 | async_executor.shutdown(wait=False) |
| 567 | 637 | |
| 568 | 638 | if translations and context: |
| 569 | 639 | context.store_intermediate_result("translations", translations) |
| 640 | + else: | |
| 641 | + async_wait_ms = 0.0 | |
| 642 | + async_collect_ms = 0.0 | |
| 570 | 643 | |
| 644 | + tail_sync_t0 = time.perf_counter() | |
| 571 | 645 | keywords_queries: Dict[str, str] = {} |
| 646 | + keyword_tail_ms = 0.0 | |
| 572 | 647 | try: |
| 648 | + keywords_t0 = time.perf_counter() | |
| 573 | 649 | keywords_queries = collect_keywords_queries( |
| 574 | 650 | self._keyword_extractor, |
| 575 | 651 | query_text, |
| 576 | 652 | translations, |
| 653 | + source_language=detected_lang, | |
| 654 | + text_analysis_cache=text_analysis_cache, | |
| 655 | + base_keywords_query=keywords_base_query, | |
| 577 | 656 | ) |
| 657 | + keyword_tail_ms = (time.perf_counter() - keywords_t0) * 1000.0 | |
| 578 | 658 | except Exception as e: |
| 579 | 659 | log_info(f"Keyword extraction failed | Error: {e}") |
| 580 | 660 | |
| ... | ... | @@ -589,9 +669,43 @@ class QueryParser: |
| 589 | 669 | image_query_vector=image_query_vector, |
| 590 | 670 | query_tokens=query_tokens, |
| 591 | 671 | keywords_queries=keywords_queries, |
| 672 | + _text_analysis_cache=text_analysis_cache, | |
| 592 | 673 | ) |
| 674 | + style_intent_t0 = time.perf_counter() | |
| 593 | 675 | style_intent_profile = self.style_intent_detector.detect(base_result) |
| 676 | + style_intent_ms = (time.perf_counter() - style_intent_t0) * 1000.0 | |
| 677 | + product_title_exclusion_t0 = time.perf_counter() | |
| 594 | 678 | product_title_exclusion_profile = self.product_title_exclusion_detector.detect(base_result) |
| 679 | + product_title_exclusion_ms = ( | |
| 680 | + (time.perf_counter() - product_title_exclusion_t0) * 1000.0 | |
| 681 | + ) | |
| 682 | + tail_sync_ms = (time.perf_counter() - tail_sync_t0) * 1000.0 | |
| 683 | + before_wait_ms = ( | |
| 684 | + normalize_ms | |
| 685 | + + rewrite_ms | |
| 686 | + + language_detect_ms | |
| 687 | + + async_submit_ms | |
| 688 | + + query_analysis_ms | |
| 689 | + + keywords_base_ms | |
| 690 | + ) | |
| 691 | + log_info( | |
| 692 | + "Query parse stage timings | " | |
| 693 | + f"normalize_ms={normalize_ms:.1f} | " | |
| 694 | + f"rewrite_ms={rewrite_ms:.1f} | " | |
| 695 | + f"language_detect_ms={language_detect_ms:.1f} | " | |
| 696 | + f"query_tokenizer_ms={query_tokenizer_ms:.1f} | " | |
| 697 | + f"query_token_extract_ms={query_token_extract_ms:.1f} | " | |
| 698 | + f"query_analysis_ms={query_analysis_ms:.1f} | " | |
| 699 | + f"async_submit_ms={async_submit_ms:.1f} | " | |
| 700 | + f"before_wait_ms={before_wait_ms:.1f} | " | |
| 701 | + f"async_wait_ms={async_wait_ms:.1f} | " | |
| 702 | + f"async_collect_ms={async_collect_ms:.1f} | " | |
| 703 | + f"base_keywords_ms={keywords_base_ms:.1f} | " | |
| 704 | + f"keyword_tail_ms={keyword_tail_ms:.1f} | " | |
| 705 | + f"style_intent_ms={style_intent_ms:.1f} | " | |
| 706 | + f"product_title_exclusion_ms={product_title_exclusion_ms:.1f} | " | |
| 707 | + f"tail_sync_ms={tail_sync_ms:.1f}" | |
| 708 | + ) | |
| 595 | 709 | if context: |
| 596 | 710 | context.store_intermediate_result( |
| 597 | 711 | "style_intent_profile", |
| ... | ... | @@ -614,6 +728,7 @@ class QueryParser: |
| 614 | 728 | keywords_queries=keywords_queries, |
| 615 | 729 | style_intent_profile=style_intent_profile, |
| 616 | 730 | product_title_exclusion_profile=product_title_exclusion_profile, |
| 731 | + _text_analysis_cache=text_analysis_cache, | |
| 617 | 732 | ) |
| 618 | 733 | |
| 619 | 734 | parse_total_ms = (time.perf_counter() - parse_t0) * 1000.0 | ... | ... |
query/style_intent.py
| ... | ... | @@ -7,7 +7,7 @@ from __future__ import annotations |
| 7 | 7 | from dataclasses import dataclass, field |
| 8 | 8 | from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Set, Tuple |
| 9 | 9 | |
| 10 | -from .tokenization import TokenizedText, normalize_query_text, tokenize_text | |
| 10 | +from .tokenization import QueryTextAnalysisCache, TokenizedText, normalize_query_text, tokenize_text | |
| 11 | 11 | |
| 12 | 12 | |
| 13 | 13 | @dataclass(frozen=True) |
| ... | ... | @@ -233,32 +233,63 @@ class StyleIntentDetector: |
| 233 | 233 | self.registry = registry |
| 234 | 234 | self.tokenizer = tokenizer |
| 235 | 235 | |
| 236 | - def _build_query_variants(self, parsed_query: Any) -> Tuple[TokenizedText, ...]: | |
| 237 | - seen = set() | |
| 238 | - variants: List[TokenizedText] = [] | |
| 239 | - texts = [ | |
| 240 | - self._get_language_query_text(parsed_query, "zh"), | |
| 241 | - self._get_language_query_text(parsed_query, "en"), | |
| 242 | - ] | |
| 236 | + def _max_term_ngram(self) -> int: | |
| 237 | + return max( | |
| 238 | + (definition.max_term_ngram for definition in self.registry.definitions.values()), | |
| 239 | + default=3, | |
| 240 | + ) | |
| 241 | + | |
| 242 | + def _tokenize_text( | |
| 243 | + self, | |
| 244 | + text: str, | |
| 245 | + *, | |
| 246 | + analysis_cache: Optional[QueryTextAnalysisCache] = None, | |
| 247 | + ) -> TokenizedText: | |
| 248 | + max_term_ngram = self._max_term_ngram() | |
| 249 | + if analysis_cache is not None: | |
| 250 | + return analysis_cache.get_tokenized_text(text, max_ngram=max_term_ngram) | |
| 251 | + return tokenize_text( | |
| 252 | + text, | |
| 253 | + tokenizer=self.tokenizer, | |
| 254 | + max_ngram=max_term_ngram, | |
| 255 | + ) | |
| 243 | 256 | |
| 244 | - for raw_text in texts: | |
| 245 | - text = str(raw_text or "").strip() | |
| 257 | + def _build_language_variants( | |
| 258 | + self, | |
| 259 | + parsed_query: Any, | |
| 260 | + *, | |
| 261 | + analysis_cache: Optional[QueryTextAnalysisCache] = None, | |
| 262 | + ) -> Dict[str, TokenizedText]: | |
| 263 | + variants: Dict[str, TokenizedText] = {} | |
| 264 | + for language in ("zh", "en"): | |
| 265 | + text = self._get_language_query_text(parsed_query, language).strip() | |
| 246 | 266 | if not text: |
| 247 | 267 | continue |
| 248 | - normalized = normalize_query_text(text) | |
| 268 | + variants[language] = self._tokenize_text( | |
| 269 | + text, | |
| 270 | + analysis_cache=analysis_cache, | |
| 271 | + ) | |
| 272 | + return variants | |
| 273 | + | |
| 274 | + def _build_query_variants( | |
| 275 | + self, | |
| 276 | + parsed_query: Any, | |
| 277 | + *, | |
| 278 | + language_variants: Optional[Dict[str, TokenizedText]] = None, | |
| 279 | + analysis_cache: Optional[QueryTextAnalysisCache] = None, | |
| 280 | + ) -> Tuple[TokenizedText, ...]: | |
| 281 | + seen = set() | |
| 282 | + variants: List[TokenizedText] = [] | |
| 283 | + | |
| 284 | + for variant in (language_variants or self._build_language_variants( | |
| 285 | + parsed_query, | |
| 286 | + analysis_cache=analysis_cache, | |
| 287 | + )).values(): | |
| 288 | + normalized = variant.normalized_text | |
| 249 | 289 | if not normalized or normalized in seen: |
| 250 | 290 | continue |
| 251 | 291 | seen.add(normalized) |
| 252 | - variants.append( | |
| 253 | - tokenize_text( | |
| 254 | - text, | |
| 255 | - tokenizer=self.tokenizer, | |
| 256 | - max_ngram=max( | |
| 257 | - (definition.max_term_ngram for definition in self.registry.definitions.values()), | |
| 258 | - default=3, | |
| 259 | - ), | |
| 260 | - ) | |
| 261 | - ) | |
| 292 | + variants.append(variant) | |
| 262 | 293 | |
| 263 | 294 | return tuple(variants) |
| 264 | 295 | |
| ... | ... | @@ -271,26 +302,50 @@ class StyleIntentDetector: |
| 271 | 302 | return str(translated) |
| 272 | 303 | return str(getattr(parsed_query, "original_query", "") or "") |
| 273 | 304 | |
| 274 | - def _tokenize_language_query(self, parsed_query: Any, language: str) -> Optional[TokenizedText]: | |
| 305 | + def _tokenize_language_query( | |
| 306 | + self, | |
| 307 | + parsed_query: Any, | |
| 308 | + language: str, | |
| 309 | + *, | |
| 310 | + language_variants: Optional[Dict[str, TokenizedText]] = None, | |
| 311 | + analysis_cache: Optional[QueryTextAnalysisCache] = None, | |
| 312 | + ) -> Optional[TokenizedText]: | |
| 313 | + if language_variants is not None: | |
| 314 | + return language_variants.get(language) | |
| 275 | 315 | text = self._get_language_query_text(parsed_query, language).strip() |
| 276 | 316 | if not text: |
| 277 | 317 | return None |
| 278 | - return tokenize_text( | |
| 318 | + return self._tokenize_text( | |
| 279 | 319 | text, |
| 280 | - tokenizer=self.tokenizer, | |
| 281 | - max_ngram=max( | |
| 282 | - (definition.max_term_ngram for definition in self.registry.definitions.values()), | |
| 283 | - default=3, | |
| 284 | - ), | |
| 320 | + analysis_cache=analysis_cache, | |
| 285 | 321 | ) |
| 286 | 322 | |
| 287 | 323 | def detect(self, parsed_query: Any) -> StyleIntentProfile: |
| 288 | 324 | if not self.registry.enabled or not self.registry.definitions: |
| 289 | 325 | return StyleIntentProfile() |
| 290 | 326 | |
| 291 | - query_variants = self._build_query_variants(parsed_query) | |
| 292 | - zh_variant = self._tokenize_language_query(parsed_query, "zh") | |
| 293 | - en_variant = self._tokenize_language_query(parsed_query, "en") | |
| 327 | + analysis_cache = getattr(parsed_query, "_text_analysis_cache", None) | |
| 328 | + language_variants = self._build_language_variants( | |
| 329 | + parsed_query, | |
| 330 | + analysis_cache=analysis_cache, | |
| 331 | + ) | |
| 332 | + query_variants = self._build_query_variants( | |
| 333 | + parsed_query, | |
| 334 | + language_variants=language_variants, | |
| 335 | + analysis_cache=analysis_cache, | |
| 336 | + ) | |
| 337 | + zh_variant = self._tokenize_language_query( | |
| 338 | + parsed_query, | |
| 339 | + "zh", | |
| 340 | + language_variants=language_variants, | |
| 341 | + analysis_cache=analysis_cache, | |
| 342 | + ) | |
| 343 | + en_variant = self._tokenize_language_query( | |
| 344 | + parsed_query, | |
| 345 | + "en", | |
| 346 | + language_variants=language_variants, | |
| 347 | + analysis_cache=analysis_cache, | |
| 348 | + ) | |
| 294 | 349 | detected: List[DetectedStyleIntent] = [] |
| 295 | 350 | seen_pairs = set() |
| 296 | 351 | ... | ... |
query/tokenization.py
| ... | ... | @@ -6,10 +6,11 @@ from __future__ import annotations |
| 6 | 6 | |
| 7 | 7 | from dataclasses import dataclass |
| 8 | 8 | import re |
| 9 | -from typing import Any, Callable, Iterable, List, Optional, Sequence, Tuple | |
| 9 | +from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple | |
| 10 | 10 | |
| 11 | 11 | |
| 12 | -_TOKEN_PATTERN = re.compile(r"[\u4e00-\u9fff]+|[A-Za-z0-9_]+(?:-[A-Za-z0-9_]+)*") | |
# Matches one character in the range U+4E00..U+9FFF (CJK Unified Ideographs).
_HAN_PATTERN = re.compile(r"[\u4e00-\u9fff]")
# A token is either a run of U+4E00..U+9FFF characters, or a run of word
# characters excluding "_" ([^\W_]) that may be joined internally by "-" or "'".
_TOKEN_PATTERN = re.compile(r"[\u4e00-\u9fff]+|[^\W_]+(?:[-'][^\W_]+)*", re.UNICODE)
| 13 | 14 | |
| 14 | 15 | |
| 15 | 16 | def normalize_query_text(text: Optional[str]) -> str: |
| ... | ... | @@ -30,6 +31,10 @@ def simple_tokenize_query(text: str) -> List[str]: |
| 30 | 31 | return _TOKEN_PATTERN.findall(text) |
| 31 | 32 | |
| 32 | 33 | |
def contains_han_text(text: Optional[str]) -> bool:
    """Return True when ``text`` is truthy and ``_HAN_PATTERN`` matches in it."""
    if not text:
        return False
    return _HAN_PATTERN.search(str(text)) is not None
| 36 | + | |
| 37 | + | |
| 33 | 38 | def extract_token_strings(tokenizer_result: Any) -> List[str]: |
| 34 | 39 | """Normalize tokenizer output into a flat token string list.""" |
| 35 | 40 | if not tokenizer_result: |
| ... | ... | @@ -84,6 +89,13 @@ def _build_phrase_candidates(tokens: Sequence[str], max_ngram: int) -> List[str] |
| 84 | 89 | return phrases |
| 85 | 90 | |
| 86 | 91 | |
def _build_coarse_tokens(text: str, fine_tokens: Sequence[str]) -> List[str]:
    """Return the coarse token list for ``text``.

    When ``text`` contains Han characters and fine tokens are available, the
    de-duplicated fine tokens are reused as the coarse tokens. Otherwise the
    coarse tokens are the de-duplicated output of ``simple_tokenize_query``.

    Args:
        text: The query text being analysed.
        fine_tokens: Tokens already produced for ``text``.

    Returns:
        De-duplicated tokens in first-seen order.
    """
    # Decide the branch first so the regex tokenization is not computed and
    # then thrown away on the Han path; the cheap emptiness check goes first.
    if fine_tokens and contains_han_text(text):
        return list(_dedupe_preserve_order(fine_tokens))
    return list(_dedupe_preserve_order(simple_tokenize_query(text)))
| 97 | + | |
| 98 | + | |
| 87 | 99 | @dataclass(frozen=True) |
| 88 | 100 | class TokenizedText: |
| 89 | 101 | text: str |
| ... | ... | @@ -93,30 +105,88 @@ class TokenizedText: |
| 93 | 105 | candidates: Tuple[str, ...] |
| 94 | 106 | |
| 95 | 107 | |
class QueryTextAnalysisCache:
    """Per-parse cache for tokenizer output and derived token bundles.

    Texts are keyed by their stripped string form. Model-tokenizer results are
    memoised per text, and complete ``TokenizedText`` bundles per
    ``(text, max_ngram)`` pair. A language hint can be recorded for a text and
    read back; hints do not influence which tokenizer is used.
    """

    def __init__(self, *, tokenizer: Optional[Callable[[str], Any]] = None) -> None:
        """Create an empty cache.

        Args:
            tokenizer: Optional callable applied to texts containing Han
                characters. When ``None``, every text goes through
                ``simple_tokenize_query``.
        """
        self.tokenizer = tokenizer
        self._tokenizer_results: Dict[str, Any] = {}
        self._tokenized_texts: Dict[Tuple[str, int], TokenizedText] = {}
        self._language_hints: Dict[str, str] = {}

    @staticmethod
    def _normalize_input(text: Optional[str]) -> str:
        """Return ``text`` as a stripped string; falsy input becomes ``""``."""
        return str(text or "").strip()

    def set_language_hint(self, text: Optional[str], language: Optional[str]) -> None:
        """Record ``language`` as the hint for ``text``.

        Nothing is stored when either the stripped text or the normalized
        language is empty.
        """
        normalized_input = self._normalize_input(text)
        normalized_language = normalize_query_text(language)
        if normalized_input and normalized_language:
            self._language_hints[normalized_input] = normalized_language

    def get_language_hint(self, text: Optional[str]) -> Optional[str]:
        """Return the hint recorded for ``text``, or ``None`` if there is none."""
        normalized_input = self._normalize_input(text)
        if not normalized_input:
            return None
        return self._language_hints.get(normalized_input)

    def _should_use_model_tokenizer(self, text: str) -> bool:
        """Return True when a tokenizer is configured and ``text`` has Han characters."""
        # The previous implementation also looked up the language hint, but
        # both of its branches returned the same Han check, so the lookup was
        # dead code and has been removed.
        return self.tokenizer is not None and contains_han_text(text)

    def get_tokenizer_result(self, text: Optional[str]) -> Any:
        """Return raw tokenizer output for ``text``.

        Blank input yields ``[]``. Texts that do not qualify for the model
        tokenizer are tokenized with ``simple_tokenize_query`` on every call;
        model-tokenizer results are computed once per text and memoised.
        """
        normalized_input = self._normalize_input(text)
        if not normalized_input:
            return []
        if not self._should_use_model_tokenizer(normalized_input):
            return simple_tokenize_query(normalized_input)
        if normalized_input not in self._tokenizer_results:
            self._tokenizer_results[normalized_input] = self.tokenizer(normalized_input)
        return self._tokenizer_results[normalized_input]

    def get_tokenized_text(self, text: Optional[str], *, max_ngram: int = 3) -> TokenizedText:
        """Return the ``TokenizedText`` bundle for ``text``, building it on a miss.

        Args:
            text: Text to analyse; it is stripped before use.
            max_ngram: Upper bound for phrase candidates. It is coerced to an
                ``int`` of at least 1, and that coerced value is used both as
                part of the cache key and to build the phrase candidates.

        Returns:
            The cached or newly built bundle.
        """
        normalized_input = self._normalize_input(text)
        # Use one normalized limit for the key AND the computation, so the
        # bundle stored under a key is always the one that key describes.
        ngram_limit = max(1, int(max_ngram))
        cache_key = (normalized_input, ngram_limit)
        cached = self._tokenized_texts.get(cache_key)
        if cached is not None:
            return cached

        normalized_text = normalize_query_text(normalized_input)
        fine_raw = extract_token_strings(self.get_tokenizer_result(normalized_input))
        fine_tokens = _dedupe_preserve_order(fine_raw)
        coarse_tokens = _build_coarse_tokens(normalized_input, fine_tokens)

        candidates = _dedupe_preserve_order(
            list(fine_tokens)
            + list(coarse_tokens)
            + _build_phrase_candidates(fine_tokens, max_ngram=ngram_limit)
            + _build_phrase_candidates(coarse_tokens, max_ngram=ngram_limit)
            + ([normalized_text] if normalized_text else [])
        )
        bundle = TokenizedText(
            text=normalized_input,
            normalized_text=normalized_text,
            fine_tokens=tuple(fine_tokens),
            coarse_tokens=tuple(coarse_tokens),
            candidates=tuple(candidates),
        )
        self._tokenized_texts[cache_key] = bundle
        return bundle
| 182 | + | |
def tokenize_text(
    text: str,
    *,
    tokenizer: Optional[Callable[[str], Any]] = None,
    max_ngram: int = 3,
) -> TokenizedText:
    """Tokenize ``text`` once using a throwaway ``QueryTextAnalysisCache``.

    Convenience wrapper for callers that have no per-parse cache to share.
    """
    one_shot_cache = QueryTextAnalysisCache(tokenizer=tokenizer)
    return one_shot_cache.get_tokenized_text(text, max_ngram=max_ngram)
suggestion/service.py
| ... | ... | @@ -7,7 +7,7 @@ import time |
| 7 | 7 | from typing import Any, Dict, List, Optional |
| 8 | 8 | |
| 9 | 9 | from config.tenant_config_loader import get_tenant_config_loader |
| 10 | -from query.query_parser import simple_tokenize_query | |
| 10 | +from query.tokenization import simple_tokenize_query | |
| 11 | 11 | from suggestion.builder import get_suggestion_alias_name |
| 12 | 12 | from utils.es_client import ESClient |
| 13 | 13 | ... | ... |
tests/test_query_parser_mixed_language.py
| ... | ... | @@ -77,3 +77,79 @@ def test_parse_waits_for_translation_when_source_in_index_languages(monkeypatch) |
| 77 | 77 | assert result.detected_language == "en" |
| 78 | 78 | assert result.translations.get("zh") == "off shoulder top-zh" |
| 79 | 79 | assert not hasattr(result, "source_in_index_languages") |
| 80 | + | |
| 81 | + | |
def test_parse_reuses_tokenization_across_tail_stages(monkeypatch):
    """Both tail detectors activate while the injected tokenizer is never called."""
    model_tokenizer_inputs = []

    def recording_tokenizer(text):
        model_tokenizer_inputs.append(str(text))
        return str(text).split()

    color_terms = [
        {"en_terms": ["black"], "zh_terms": ["黑色"], "attribute_terms": ["black"]}
    ]
    exclusion_rule = {
        "zh_trigger_terms": ["修身"],
        "en_trigger_terms": ["fitted"],
        "zh_title_exclusions": ["宽松"],
        "en_title_exclusions": ["loose"],
    }
    query_config = QueryConfig(
        enable_text_embedding=False,
        enable_query_rewrite=False,
        supported_languages=["en", "zh"],
        default_language="en",
        style_intent_terms={"color": color_terms},
        style_intent_dimension_aliases={"color": ["color", "颜色"]},
        product_title_exclusion_rules=[exclusion_rule],
    )
    search_config = SearchConfig(
        es_index_name="test_products",
        field_boosts={"title.en": 3.0, "title.zh": 3.0},
        indexes=[IndexConfig(name="default", label="default", fields=["title.en", "title.zh"])],
        query_config=query_config,
        function_score=FunctionScoreConfig(),
        rerank=RerankConfig(),
        spu_config=SPUConfig(enabled=False),
    )
    parser = QueryParser(
        search_config,
        translator=_DummyTranslator(),
        tokenizer=recording_tokenizer,
    )
    monkeypatch.setattr(parser.language_detector, "detect", lambda text: "en")

    parsed = parser.parse(
        "black fitted dress",
        tenant_id="0",
        generate_vector=False,
        target_languages=["en", "zh"],
    )

    assert parsed.translations == {"zh": "black fitted dress-zh"}
    style_profile = parsed.style_intent_profile
    assert style_profile is not None
    assert style_profile.is_active is True
    exclusion_profile = parsed.product_title_exclusion_profile
    assert exclusion_profile is not None
    assert exclusion_profile.is_active is True
    assert model_tokenizer_inputs == []
| 137 | + | |
| 138 | + | |
def test_parse_fast_path_detects_ascii_query_as_english_without_lingua(monkeypatch):
    """An ASCII query is detected as "en" without calling the language detector."""

    def fail_if_called(text):
        raise AssertionError("Lingua path should not be used")

    parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer)
    monkeypatch.setattr(parser.language_detector, "detect", fail_if_called)

    parsed = parser.parse(
        "street t-shirt women",
        tenant_id="0",
        generate_vector=False,
        target_languages=["en", "zh"],
    )

    assert parsed.detected_language == "en"
    assert parsed.query_tokens == ["street", "t-shirt", "women"]
tests/test_style_intent.py
| ... | ... | @@ -58,3 +58,37 @@ def test_style_intent_detector_uses_original_query_when_language_translation_mis |
| 58 | 58 | |
| 59 | 59 | assert profile.get_canonical_values("color") == {"black"} |
| 60 | 60 | assert profile.intents[0].attribute_terms == ("black",) |
| 61 | + | |
| 62 | + | |
def test_style_intent_detector_tokenizes_each_language_once():
    """The injected tokenizer is called exactly once, with the Han query text."""
    seen_by_tokenizer = []

    def recording_tokenizer(text):
        seen_by_tokenizer.append(text)
        return str(text).split()

    style_terms = {
        "color": [{"en_terms": ["black"], "zh_terms": ["黑色"], "attribute_terms": ["black"]}],
        "size": [{"en_terms": ["xl"], "zh_terms": ["加大码"], "attribute_terms": ["xl"]}],
    }
    dimension_aliases = {
        "color": ["color", "颜色"],
        "size": ["size", "尺码"],
    }
    registry = StyleIntentRegistry.from_query_config(
        QueryConfig(
            style_intent_terms=style_terms,
            style_intent_dimension_aliases=dimension_aliases,
        )
    )
    detector = StyleIntentDetector(registry, tokenizer=recording_tokenizer)
    zh_query = "黑色 连衣裙"
    parsed_query = SimpleNamespace(
        original_query=zh_query,
        query_normalized=zh_query,
        rewritten_query=zh_query,
        translations={"en": "black dress xl"},
    )

    profile = detector.detect(parsed_query)

    assert profile.is_active is True
    assert seen_by_tokenizer == ["黑色 连衣裙"]
| ... | ... | @@ -0,0 +1,13 @@ |
| 1 | +from query.tokenization import QueryTextAnalysisCache | |
| 2 | + | |
| 3 | + | |
def test_han_coarse_tokens_follow_model_tokens_instead_of_whole_sentence():
    """For Han text, coarse tokens equal the tokenizer's tokens."""
    sentence = "路上穿着女性的衣服是黑色的"
    model_output = [("路上", 0, 2), ("穿着", 2, 4), ("女性", 4, 6), ("黑色", 10, 12)]
    expected_tokens = ("路上", "穿着", "女性", "黑色")

    cache = QueryTextAnalysisCache(tokenizer=lambda text: model_output)
    cache.set_language_hint(sentence, "zh")

    bundle = cache.get_tokenized_text(sentence)

    assert bundle.fine_tokens == expected_tokens
    assert bundle.coarse_tokens == expected_tokens