Commit 45b397964fb80661b13dbc49c8fd03990123ea41

Authored by tangwang
1 parent 926e1e96

qp性能优化

config/config.yaml
... ... @@ -116,8 +116,8 @@ query_config:
116 116  
117 117 # 查询解析阶段:翻译与 query 向量并发执行,共用同一等待预算(毫秒)。
118 118 # 检测语言已在租户 index_languages 内:较短;不在索引语言内:较长(翻译对召回更关键)。
119   - translation_embedding_wait_budget_ms_source_in_index: 500 # 80
120   - translation_embedding_wait_budget_ms_source_not_in_index: 700 #200
  119 + translation_embedding_wait_budget_ms_source_in_index: 200 # 80
  120 + translation_embedding_wait_budget_ms_source_not_in_index: 300 #200
121 121  
122 122 style_intent:
123 123 enabled: true
... ...
query/english_keyword_extractor.py 0 → 100644
... ... @@ -0,0 +1,256 @@
  1 +"""
  2 +Lightweight English core-term extraction for lexical keyword constraints.
  3 +"""
  4 +
  5 +from __future__ import annotations
  6 +
  7 +import logging
  8 +from typing import List, Optional, Sequence, Set
  9 +
  10 +from .tokenization import normalize_query_text, simple_tokenize_query
  11 +
  12 +logger = logging.getLogger(__name__)
  13 +
  14 +_WEAK_BOOST_ADJS = frozenset(
  15 + {
  16 + "best",
  17 + "good",
  18 + "great",
  19 + "new",
  20 + "free",
  21 + "cheap",
  22 + "top",
  23 + "fine",
  24 + "real",
  25 + }
  26 +)
  27 +
  28 +_FUNCTIONAL_DEP = frozenset(
  29 + {
  30 + "det",
  31 + "aux",
  32 + "auxpass",
  33 + "prep",
  34 + "mark",
  35 + "expl",
  36 + "cc",
  37 + "punct",
  38 + "case",
  39 + }
  40 +)
  41 +
  42 +_DEMOGRAPHIC_NOUNS = frozenset(
  43 + {
  44 + "women",
  45 + "woman",
  46 + "men",
  47 + "man",
  48 + "kids",
  49 + "kid",
  50 + "boys",
  51 + "boy",
  52 + "girls",
  53 + "girl",
  54 + "baby",
  55 + "babies",
  56 + "toddler",
  57 + "adult",
  58 + "adults",
  59 + }
  60 +)
  61 +
  62 +_PRICE_PREP_LEMMAS = frozenset({"under", "over", "below", "above", "within", "between", "near"})
  63 +_DIMENSION_ROOTS = frozenset({"size", "width", "length", "height", "weight"})
  64 +
  65 +
def _dedupe_preserve(seq: Sequence[str]) -> List[str]:
    """Normalize each item and drop empties/duplicates, keeping first-seen order."""
    seen: Set[str] = set()
    result: List[str] = []
    for raw in seq:
        candidate = normalize_query_text(raw)
        if candidate and candidate not in seen:
            seen.add(candidate)
            result.append(candidate)
    return result
  76 +
  77 +
  78 +def _lemma_lower(token) -> str:
  79 + return ((token.lemma_ or token.text) or "").lower().strip()
  80 +
  81 +
  82 +def _surface_lower(token) -> str:
  83 + return (token.text or "").lower().strip()
  84 +
  85 +
def _project_terms_to_query_tokens(query: str, terms: Sequence[str]) -> List[str]:
    """Map extracted *terms* back onto tokens that actually occur in *query*.

    An exact token match wins; otherwise, for terms of length >= 3, the first
    query token containing the term (and not a demographic noun) is used;
    otherwise the normalized term itself is kept.  Terms shorter than two
    characters or that are demographic nouns are dropped entirely.
    """
    query_tokens = _dedupe_preserve(simple_tokenize_query(query))
    mapped: List[str] = []
    for raw_term in terms:
        term = normalize_query_text(raw_term)
        if len(term) < 2 or term in _DEMOGRAPHIC_NOUNS:
            continue
        if term in query_tokens:
            mapped.append(term)
            continue
        substring_hit: Optional[str] = None
        if len(term) >= 3:
            for candidate in query_tokens:
                if term in candidate and candidate not in _DEMOGRAPHIC_NOUNS:
                    substring_hit = candidate
                    break
        mapped.append(substring_hit if substring_hit is not None else term)
    return _dedupe_preserve(mapped)
  110 +
  111 +
class EnglishKeywordExtractor:
    """Extracts a small set of English core product terms with spaCy.

    The extractor runs several dependency-parse passes over the query to pick
    "core" tokens (objects, notable subjects, proper nouns, noun roots) while
    demoting audience/price phrases, then projects the survivors back onto the
    original query tokens.  When spaCy (or its model) is unavailable, a simple
    token-based fallback is used instead.
    """

    def __init__(self, nlp: Optional[object] = None) -> None:
        # Accept an injected pipeline (useful for tests); otherwise lazily load
        # the small English model.  ``self._nlp`` stays None when loading fails,
        # which routes every extraction through the fallback path.
        self._nlp = nlp if nlp is not None else self._load_nlp()

    @staticmethod
    def _load_nlp() -> Optional[object]:
        """Best-effort load of ``en_core_web_sm``; returns None on any failure."""
        try:
            import spacy

            # NER and text classification are not used by the extraction passes
            # below; disabling them keeps pipeline calls cheaper.
            return spacy.load("en_core_web_sm", disable=["ner", "textcat"])
        except Exception as exc:
            # Missing package or missing model disables the extractor rather
            # than raising at construction time.
            logger.warning("English keyword extractor disabled; failed to load spaCy model: %s", exc)
            return None

    def extract_keywords(self, query: str) -> str:
        """Return up to a few space-joined core terms extracted from *query*.

        Empty/whitespace input yields "".  When the spaCy pipeline is absent or
        raises, the lightweight token fallback is used instead of propagating
        the error.
        """
        text = str(query or "").strip()
        if not text:
            return ""
        if self._nlp is None:
            return self._fallback_keywords(text)
        try:
            return self._extract_keywords_with_spacy(text)
        except Exception as exc:
            logger.warning("spaCy English keyword extraction failed; using fallback: %s", exc)
            return self._fallback_keywords(text)

    def _extract_keywords_with_spacy(self, query: str) -> str:
        """Select core terms via sequential dependency-parse passes over *query*."""
        doc = self._nlp(query)
        # Candidate core terms, keyed by lowercased surface form.
        intersection: Set[str] = set()
        # Stop list = model stop words plus weak boosting adjectives ("best", ...).
        stops = self._nlp.Defaults.stop_words | _WEAK_BOOST_ADJS
        # Token indices of prepositional objects that must NOT become core
        # terms (objects of "for" and of price/range prepositions).
        pobj_heads_to_demote: Set[int] = set()

        # Pass 1: demote noun objects of a literal "for" preposition
        # (presumably audience phrases like "for women" — TODO confirm).
        for token in doc:
            if token.dep_ == "prep" and token.text.lower() == "for":
                for child in token.children:
                    if child.dep_ == "pobj" and child.pos_ in ("NOUN", "PROPN"):
                        pobj_heads_to_demote.add(child.i)

        # Pass 2: demote noun objects of range prepositions ("under", "over", ...).
        for token in doc:
            if token.dep_ != "prep" or _lemma_lower(token) not in _PRICE_PREP_LEMMAS:
                continue
            for child in token.children:
                if child.dep_ == "pobj" and child.pos_ in ("NOUN", "PROPN"):
                    pobj_heads_to_demote.add(child.i)

        # Pass 3: keep direct objects that were not demoted above.
        for token in doc:
            if token.dep_ == "dobj" and token.pos_ in ("NOUN", "PROPN") and token.i not in pobj_heads_to_demote:
                intersection.add(_surface_lower(token))

        # Pass 4: keep noun subjects whose head is an auxiliary root
        # (e.g. copula-style queries — NOTE(review): inferred, confirm intent).
        for token in doc:
            if token.dep_ == "nsubj" and token.pos_ in ("NOUN", "PROPN"):
                head = token.head
                if head.pos_ == "AUX" and head.dep_ == "ROOT":
                    intersection.add(_surface_lower(token))

        # Pass 5: keep interjection/proper-noun roots and all proper nouns,
        # except proper nouns that merely compound-modify a demographic noun.
        for token in doc:
            if token.dep_ == "ROOT" and token.pos_ in ("INTJ", "PROPN"):
                intersection.add(_surface_lower(token))
            if token.pos_ == "PROPN":
                if token.dep_ == "compound" and _lemma_lower(token.head) in _DEMOGRAPHIC_NOUNS:
                    continue
                intersection.add(_surface_lower(token))

        # Pass 6: noun roots.  Dimension roots ("size", ...) yield their noun
        # subject instead; demographic roots yield their noun compound child;
        # demoted roots are skipped; otherwise the root itself is kept.
        for token in doc:
            if token.dep_ == "ROOT" and token.pos_ in ("NOUN", "PROPN"):
                if _lemma_lower(token) in _DIMENSION_ROOTS:
                    for child in token.children:
                        if child.dep_ == "nsubj" and child.pos_ in ("NOUN", "PROPN"):
                            intersection.add(_surface_lower(child))
                    continue
                if _lemma_lower(token) in _DEMOGRAPHIC_NOUNS:
                    for child in token.children:
                        if child.dep_ == "compound" and child.pos_ == "NOUN":
                            intersection.add(_surface_lower(child))
                    continue
                if token.i in pobj_heads_to_demote:
                    continue
                intersection.add(_surface_lower(token))

        # Pass 7: roots with direct prepositional-object children, ordered by
        # position.  NOTE(review): the outer filter admits INTJ/VERB/NOUN roots
        # but both branches require INTJ, so VERB/NOUN roots fall through here
        # — confirm whether that is intentional.
        for token in doc:
            if token.dep_ != "ROOT" or token.pos_ not in ("INTJ", "VERB", "NOUN"):
                continue
            pobjs = sorted(
                [child for child in token.children if child.dep_ == "pobj" and child.pos_ in ("NOUN", "PROPN")],
                key=lambda item: item.i,
            )
            if len(pobjs) >= 2 and token.pos_ == "INTJ":
                intersection.add(_surface_lower(pobjs[0]))
                for extra in pobjs[1:]:
                    if _lemma_lower(extra) not in _DEMOGRAPHIC_NOUNS:
                        intersection.add(_surface_lower(extra))
            elif len(pobjs) == 1 and token.pos_ == "INTJ":
                intersection.add(_surface_lower(pobjs[0]))

        # Fallback pass: when nothing was selected, harvest noun-chunk heads
        # (skipping objects of "for"/range prepositions) plus any proper-noun
        # modifiers inside each chunk.
        if not intersection:
            for chunk in doc.noun_chunks:
                head = chunk.root
                if head.pos_ not in ("NOUN", "PROPN"):
                    continue
                if head.dep_ == "pobj" and head.head.dep_ == "prep":
                    prep = head.head
                    if _lemma_lower(prep) in _PRICE_PREP_LEMMAS or prep.text.lower() == "for":
                        continue
                head_text = _surface_lower(head)
                if head_text:
                    intersection.add(head_text)
                for token in chunk:
                    if token == head or token.pos_ != "PROPN":
                        continue
                    intersection.add(_surface_lower(token))

        # Final filter in document order: drop stop words, demographic nouns,
        # functional dependencies, and one-character tokens; dedupe.
        core_terms = _dedupe_preserve(
            token.text.lower()
            for token in doc
            if _surface_lower(token) in intersection
            and _surface_lower(token) not in stops
            and _surface_lower(token) not in _DEMOGRAPHIC_NOUNS
            and token.dep_ not in _FUNCTIONAL_DEP
            and len(_surface_lower(token)) >= 2
        )
        # Project back onto tokens present in the query and cap at three terms.
        projected_terms = _project_terms_to_query_tokens(query, core_terms)
        if projected_terms:
            return " ".join(projected_terms[:3])
        return self._fallback_keywords(query)

    def _fallback_keywords(self, query: str) -> str:
        """Token-based fallback: keep the right-most one or two useful tokens."""
        tokens = [
            normalize_query_text(token)
            for token in simple_tokenize_query(query)
            if normalize_query_text(token)
        ]
        if not tokens:
            return ""

        # Prefer non-demographic tokens, but never return nothing when the
        # query consists solely of demographic nouns.
        filtered = [token for token in tokens if token not in _DEMOGRAPHIC_NOUNS]
        if not filtered:
            filtered = tokens

        # Keep the right-most likely product head plus one close modifier.
        head = filtered[-1]
        if len(filtered) >= 2:
            return " ".join(filtered[-2:])
        return head
... ...
query/keyword_extractor.py
... ... @@ -11,6 +11,9 @@ from __future__ import annotations
11 11 import logging
12 12 from typing import Any, Dict, List, Optional
13 13  
  14 +from .english_keyword_extractor import EnglishKeywordExtractor
  15 +from .tokenization import QueryTextAnalysisCache
  16 +
14 17 logger = logging.getLogger(__name__)
15 18  
16 19 import hanlp # type: ignore
... ... @@ -21,7 +24,7 @@ KEYWORDS_QUERY_BASE_KEY = "base"
21 24 # | 场景 | 推荐模型 |
22 25 # | :--------- | :------------------------------------------- |
23 26 # | 纯中文 + 最高精度 | CTB9_TOK_ELECTRA_BASE_CRF 或 MSR_TOK_ELECTRA_BASE_CRF |
24   -# | 纯中文 + 速度优先 | FINE_ELECTRA_SMALL_ZH(细粒度)或 COARSE_ELECTRA_SMALL_ZH(粗粒度) |
  27 +# | 纯中文 + 速度优先 | FINE_ELECTRA_SMALL_ZH (细粒度)或 COARSE_ELECTRA_SMALL_ZH (粗粒度) |
25 28 # | **中英文混合** | `UD_TOK_MMINILMV2L6` 或 `UD_TOK_MMINILMV2L12` ( Transformer 编码器的层数不同)|
26 29  
27 30  
... ... @@ -33,23 +36,38 @@ class KeywordExtractor:
33 36 tokenizer: Optional[Any] = None,
34 37 *,
35 38 ignore_keywords: Optional[List[str]] = None,
  39 + english_extractor: Optional[EnglishKeywordExtractor] = None,
36 40 ):
37 41 if tokenizer is not None:
38 42 self.tok = tokenizer
39 43 else:
40   - self.tok = hanlp.load(hanlp.pretrained.tok.UD_TOK_MMINILMV2L6)
  44 + self.tok = hanlp.load(hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH)
41 45 self.tok.config.output_spans = True
42 46 self.pos_tag = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ELECTRA_SMALL)
43 47 self.ignore_keywords = frozenset(ignore_keywords or ["玩具"])
  48 + self.english_extractor = english_extractor or EnglishKeywordExtractor()
44 49  
45   - def extract_keywords(self, query: str) -> str:
  50 + def extract_keywords(
  51 + self,
  52 + query: str,
  53 + *,
  54 + language_hint: Optional[str] = None,
  55 + tokenizer_result: Optional[Any] = None,
  56 + ) -> str:
46 57 """
47 58 从查询中提取关键词(名词,长度 ≥ 2),以空格分隔非连续片段。
48 59 """
49 60 query = (query or "").strip()
50 61 if not query:
51 62 return ""
52   - tok_result_with_position = self.tok(query)
  63 + normalized_language = str(language_hint or "").strip().lower()
  64 + if normalized_language == "en":
  65 + return self.english_extractor.extract_keywords(query)
  66 + if normalized_language and normalized_language != "zh":
  67 + return ""
  68 + tok_result_with_position = (
  69 + tokenizer_result if tokenizer_result is not None else self.tok(query)
  70 + )
53 71 tok_result = [x[0] for x in tok_result_with_position]
54 72 if not tok_result:
55 73 return ""
... ... @@ -72,6 +90,10 @@ def collect_keywords_queries(
72 90 extractor: KeywordExtractor,
73 91 rewritten_query: str,
74 92 translations: Dict[str, str],
  93 + *,
  94 + source_language: Optional[str] = None,
  95 + text_analysis_cache: Optional[QueryTextAnalysisCache] = None,
  96 + base_keywords_query: Optional[str] = None,
75 97 ) -> Dict[str, str]:
76 98 """
77 99 Build the keyword map for all lexical variants (base + translations).
... ... @@ -79,14 +101,40 @@ def collect_keywords_queries(
79 101 Omits entries when extraction yields an empty string.
80 102 """
81 103 out: Dict[str, str] = {}
82   - base_kw = extractor.extract_keywords(rewritten_query)
  104 + base_kw = base_keywords_query
  105 + if base_kw is None:
  106 + base_kw = extractor.extract_keywords(
  107 + rewritten_query,
  108 + language_hint=source_language or (
  109 + text_analysis_cache.get_language_hint(rewritten_query)
  110 + if text_analysis_cache is not None
  111 + else None
  112 + ),
  113 + tokenizer_result=(
  114 + text_analysis_cache.get_tokenizer_result(rewritten_query)
  115 + if text_analysis_cache is not None
  116 + else None
  117 + ),
  118 + )
83 119 if base_kw:
84 120 out[KEYWORDS_QUERY_BASE_KEY] = base_kw
85 121 for lang, text in translations.items():
86 122 lang_key = str(lang or "").strip().lower()
87 123 if not lang_key or not (text or "").strip():
88 124 continue
89   - kw = extractor.extract_keywords(text)
  125 + kw = extractor.extract_keywords(
  126 + text,
  127 + language_hint=lang_key or (
  128 + text_analysis_cache.get_language_hint(text)
  129 + if text_analysis_cache is not None
  130 + else None
  131 + ),
  132 + tokenizer_result=(
  133 + text_analysis_cache.get_tokenizer_result(text)
  134 + if text_analysis_cache is not None
  135 + else None
  136 + ),
  137 + )
90 138 if kw:
91 139 out[lang_key] = kw
92 140 return out
... ...
query/product_title_exclusion.py
... ... @@ -7,7 +7,7 @@ from __future__ import annotations
7 7 from dataclasses import dataclass, field
8 8 from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Set, Tuple
9 9  
10   -from .tokenization import TokenizedText, normalize_query_text, tokenize_text
  10 +from .tokenization import QueryTextAnalysisCache, TokenizedText, normalize_query_text, tokenize_text
11 11  
12 12  
13 13 def _dedupe_terms(terms: Iterable[str]) -> List[str]:
... ... @@ -158,9 +158,27 @@ class ProductTitleExclusionDetector:
158 158 self.registry = registry
159 159 self.tokenizer = tokenizer
160 160  
  161 + def _tokenize_text(
  162 + self,
  163 + text: str,
  164 + *,
  165 + analysis_cache: Optional[QueryTextAnalysisCache] = None,
  166 + ) -> TokenizedText:
  167 + if analysis_cache is not None:
  168 + return analysis_cache.get_tokenized_text(
  169 + text,
  170 + max_ngram=self.registry.max_term_ngram,
  171 + )
  172 + return tokenize_text(
  173 + text,
  174 + tokenizer=self.tokenizer,
  175 + max_ngram=self.registry.max_term_ngram,
  176 + )
  177 +
161 178 def _build_query_variants(self, parsed_query: Any) -> Tuple[TokenizedText, ...]:
162 179 seen = set()
163 180 variants: List[TokenizedText] = []
  181 + analysis_cache = getattr(parsed_query, "_text_analysis_cache", None)
164 182 texts = [
165 183 getattr(parsed_query, "original_query", None),
166 184 getattr(parsed_query, "query_normalized", None),
... ... @@ -180,10 +198,9 @@ class ProductTitleExclusionDetector:
180 198 continue
181 199 seen.add(normalized)
182 200 variants.append(
183   - tokenize_text(
  201 + self._tokenize_text(
184 202 text,
185   - tokenizer=self.tokenizer,
186   - max_ngram=self.registry.max_term_ngram,
  203 + analysis_cache=analysis_cache,
187 204 )
188 205 )
189 206  
... ...
query/query_parser.py
... ... @@ -27,7 +27,7 @@ from .product_title_exclusion import (
27 27 )
28 28 from .query_rewriter import QueryRewriter, QueryNormalizer
29 29 from .style_intent import StyleIntentDetector, StyleIntentProfile, StyleIntentRegistry
30   -from .tokenization import extract_token_strings, simple_tokenize_query
  30 +from .tokenization import QueryTextAnalysisCache, contains_han_text, extract_token_strings
31 31 from .keyword_extractor import KeywordExtractor, collect_keywords_queries
32 32  
33 33 logger = logging.getLogger(__name__)
... ... @@ -119,6 +119,7 @@ class ParsedQuery:
119 119 keywords_queries: Dict[str, str] = field(default_factory=dict)
120 120 style_intent_profile: Optional[StyleIntentProfile] = None
121 121 product_title_exclusion_profile: Optional[ProductTitleExclusionProfile] = None
  122 + _text_analysis_cache: Optional[QueryTextAnalysisCache] = field(default=None, repr=False)
122 123  
123 124 def text_for_rerank(self) -> str:
124 125 """See :func:`rerank_query_text`."""
... ... @@ -238,7 +239,7 @@ class QueryParser:
238 239 if hanlp is None:
239 240 raise RuntimeError("HanLP is required for QueryParser tokenization")
240 241 logger.info("Initializing HanLP tokenizer...")
241   - tokenizer = hanlp.load(hanlp.pretrained.tok.CTB9_TOK_ELECTRA_BASE_CRF)
  242 + tokenizer = hanlp.load(hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH)
242 243 tokenizer.config.output_spans = True
243 244 logger.info("HanLP tokenizer initialized")
244 245 return tokenizer
... ... @@ -288,6 +289,33 @@ class QueryParser:
288 289 def _get_query_tokens(self, query: str) -> List[str]:
289 290 return self._extract_tokens(self._tokenizer(query))
290 291  
  292 + @staticmethod
  293 + def _is_ascii_latin_query(text: str) -> bool:
  294 + candidate = str(text or "").strip()
  295 + if not candidate or contains_han_text(candidate):
  296 + return False
  297 + try:
  298 + candidate.encode("ascii")
  299 + except UnicodeEncodeError:
  300 + return False
  301 + return any(ch.isalpha() for ch in candidate)
  302 +
  303 + def _detect_query_language(
  304 + self,
  305 + query_text: str,
  306 + *,
  307 + target_languages: Optional[List[str]] = None,
  308 + ) -> str:
  309 + normalized_targets = self._normalize_language_codes(target_languages)
  310 + supported_languages = self._normalize_language_codes(
  311 + getattr(self.config.query_config, "supported_languages", None)
  312 + )
  313 + active_languages = normalized_targets or supported_languages
  314 + if active_languages and set(active_languages).issubset({"en", "zh"}):
  315 + if self._is_ascii_latin_query(query_text):
  316 + return "en"
  317 + return self.language_detector.detect(query_text)
  318 +
291 319 def parse(
292 320 self,
293 321 query: str,
... ... @@ -332,12 +360,15 @@ class QueryParser:
332 360 active_logger.debug(msg)
333 361  
334 362 # Stage 1: Normalize
  363 + normalize_t0 = time.perf_counter()
335 364 normalized = self.normalizer.normalize(query)
  365 + normalize_ms = (time.perf_counter() - normalize_t0) * 1000.0
336 366 log_debug(f"Normalization completed | '{query}' -> '{normalized}'")
337 367 if context:
338 368 context.store_intermediate_result('query_normalized', normalized)
339 369  
340 370 # Stage 2: Query rewriting
  371 + rewrite_t0 = time.perf_counter()
341 372 query_text = normalized
342 373 rewritten = normalized
343 374 if self.config.query_config.rewrite_dictionary: # Enable rewrite if dictionary exists
... ... @@ -348,21 +379,26 @@ class QueryParser:
348 379 if context:
349 380 context.store_intermediate_result('rewritten_query', rewritten)
350 381 context.add_warning(f"Query was rewritten: {query_text}")
  382 + rewrite_ms = (time.perf_counter() - rewrite_t0) * 1000.0
  383 +
  384 + normalized_targets = self._normalize_language_codes(target_languages)
351 385  
352 386 # Stage 3: Language detection
353   - detected_lang = self.language_detector.detect(query_text)
  387 + language_detect_t0 = time.perf_counter()
  388 + detected_lang = self._detect_query_language(
  389 + query_text,
  390 + target_languages=normalized_targets,
  391 + )
354 392 # Use default language if detection failed (None or "unknown")
355 393 if not detected_lang or detected_lang == "unknown":
356 394 detected_lang = self.config.query_config.default_language
  395 + language_detect_ms = (time.perf_counter() - language_detect_t0) * 1000.0
357 396 log_info(f"Language detection | Detected language: {detected_lang}")
358 397 if context:
359 398 context.store_intermediate_result('detected_language', detected_lang)
360   - # Stage 4: Query analysis (tokenization)
361   - query_tokens = self._get_query_tokens(query_text)
362   -
363   - log_debug(f"Query analysis | Query tokens: {query_tokens}")
364   - if context:
365   - context.store_intermediate_result('query_tokens', query_tokens)
  399 + text_analysis_cache = QueryTextAnalysisCache(tokenizer=self._tokenizer)
  400 + for text_variant in (query, normalized, query_text):
  401 + text_analysis_cache.set_language_hint(text_variant, detected_lang)
366 402  
367 403 # Stage 5: Translation + embedding. Parser only coordinates async enrichment work; the
368 404 # caller decides translation targets and later search-field planning.
... ... @@ -371,7 +407,6 @@ class QueryParser:
371 407 future_submit_at: Dict[Any, float] = {}
372 408 async_executor: Optional[ThreadPoolExecutor] = None
373 409 detected_norm = str(detected_lang or "").strip().lower()
374   - normalized_targets = self._normalize_language_codes(target_languages)
375 410 translation_targets = [lang for lang in normalized_targets if lang != detected_norm]
376 411 source_language_in_index = bool(normalized_targets) and detected_norm in normalized_targets
377 412  
... ... @@ -398,7 +433,9 @@ class QueryParser:
398 433 thread_name_prefix="query-enrichment",
399 434 )
400 435  
  436 + async_submit_ms = 0.0
401 437 try:
  438 + async_submit_t0 = time.perf_counter()
402 439 if async_executor is not None:
403 440 for lang in translation_targets:
404 441 model_name = self._pick_query_translation_model(
... ... @@ -466,6 +503,7 @@ class QueryParser:
466 503 future = async_executor.submit(_encode_image_query_vector)
467 504 future_to_task[future] = ("image_embedding", None)
468 505 future_submit_at[future] = time.perf_counter()
  506 + async_submit_ms = (time.perf_counter() - async_submit_t0) * 1000.0
469 507 except Exception as e:
470 508 error_msg = f"Async query enrichment submission failed | Error: {str(e)}"
471 509 log_info(error_msg)
... ... @@ -477,6 +515,33 @@ class QueryParser:
477 515 future_to_task.clear()
478 516 future_submit_at.clear()
479 517  
  518 + # Stage 4: Query analysis (tokenization) now overlaps with async enrichment work.
  519 + query_analysis_t0 = time.perf_counter()
  520 + query_tokenizer_t0 = time.perf_counter()
  521 + query_tokenizer_result = text_analysis_cache.get_tokenizer_result(query_text)
  522 + query_tokenizer_ms = (time.perf_counter() - query_tokenizer_t0) * 1000.0
  523 + query_token_extract_t0 = time.perf_counter()
  524 + query_tokens = self._extract_tokens(query_tokenizer_result)
  525 + query_token_extract_ms = (time.perf_counter() - query_token_extract_t0) * 1000.0
  526 + query_analysis_ms = (time.perf_counter() - query_analysis_t0) * 1000.0
  527 +
  528 + log_debug(f"Query analysis | Query tokens: {query_tokens}")
  529 + if context:
  530 + context.store_intermediate_result('query_tokens', query_tokens)
  531 +
  532 + keywords_base_query = ""
  533 + keywords_base_ms = 0.0
  534 + try:
  535 + keywords_base_t0 = time.perf_counter()
  536 + keywords_base_query = self._keyword_extractor.extract_keywords(
  537 + query_text,
  538 + language_hint=detected_lang,
  539 + tokenizer_result=text_analysis_cache.get_tokenizer_result(query_text),
  540 + )
  541 + keywords_base_ms = (time.perf_counter() - keywords_base_t0) * 1000.0
  542 + except Exception as e:
  543 + log_info(f"Base keyword extraction failed | Error: {e}")
  544 +
480 545 # Wait for translation + embedding concurrently; shared budget depends on whether
481 546 # the detected language belongs to caller-provided target_languages.
482 547 qc = self.config.query_config
... ... @@ -501,7 +566,10 @@ class QueryParser:
501 566 f"source_in_target_languages={source_in_target_languages}"
502 567 )
503 568  
  569 + async_wait_t0 = time.perf_counter()
504 570 done, not_done = wait(list(future_to_task.keys()), timeout=budget_sec)
  571 + async_wait_ms = (time.perf_counter() - async_wait_t0) * 1000.0
  572 + async_collect_t0 = time.perf_counter()
505 573 for future in done:
506 574 task_type, lang = future_to_task[future]
507 575 t0 = future_submit_at.pop(future, None)
... ... @@ -511,6 +579,7 @@ class QueryParser:
511 579 if task_type == "translation":
512 580 if result:
513 581 translations[lang] = result
  582 + text_analysis_cache.set_language_hint(result, lang)
514 583 if context:
515 584 context.store_intermediate_result(f"translation_{lang}", result)
516 585 elif task_type == "embedding":
... ... @@ -561,20 +630,31 @@ class QueryParser:
561 630 log_info(timeout_msg)
562 631 if context:
563 632 context.add_warning(timeout_msg)
  633 + async_collect_ms = (time.perf_counter() - async_collect_t0) * 1000.0
564 634  
565 635 if async_executor:
566 636 async_executor.shutdown(wait=False)
567 637  
568 638 if translations and context:
569 639 context.store_intermediate_result("translations", translations)
  640 + else:
  641 + async_wait_ms = 0.0
  642 + async_collect_ms = 0.0
570 643  
  644 + tail_sync_t0 = time.perf_counter()
571 645 keywords_queries: Dict[str, str] = {}
  646 + keyword_tail_ms = 0.0
572 647 try:
  648 + keywords_t0 = time.perf_counter()
573 649 keywords_queries = collect_keywords_queries(
574 650 self._keyword_extractor,
575 651 query_text,
576 652 translations,
  653 + source_language=detected_lang,
  654 + text_analysis_cache=text_analysis_cache,
  655 + base_keywords_query=keywords_base_query,
577 656 )
  657 + keyword_tail_ms = (time.perf_counter() - keywords_t0) * 1000.0
578 658 except Exception as e:
579 659 log_info(f"Keyword extraction failed | Error: {e}")
580 660  
... ... @@ -589,9 +669,43 @@ class QueryParser:
589 669 image_query_vector=image_query_vector,
590 670 query_tokens=query_tokens,
591 671 keywords_queries=keywords_queries,
  672 + _text_analysis_cache=text_analysis_cache,
592 673 )
  674 + style_intent_t0 = time.perf_counter()
593 675 style_intent_profile = self.style_intent_detector.detect(base_result)
  676 + style_intent_ms = (time.perf_counter() - style_intent_t0) * 1000.0
  677 + product_title_exclusion_t0 = time.perf_counter()
594 678 product_title_exclusion_profile = self.product_title_exclusion_detector.detect(base_result)
  679 + product_title_exclusion_ms = (
  680 + (time.perf_counter() - product_title_exclusion_t0) * 1000.0
  681 + )
  682 + tail_sync_ms = (time.perf_counter() - tail_sync_t0) * 1000.0
  683 + before_wait_ms = (
  684 + normalize_ms
  685 + + rewrite_ms
  686 + + language_detect_ms
  687 + + async_submit_ms
  688 + + query_analysis_ms
  689 + + keywords_base_ms
  690 + )
  691 + log_info(
  692 + "Query parse stage timings | "
  693 + f"normalize_ms={normalize_ms:.1f} | "
  694 + f"rewrite_ms={rewrite_ms:.1f} | "
  695 + f"language_detect_ms={language_detect_ms:.1f} | "
  696 + f"query_tokenizer_ms={query_tokenizer_ms:.1f} | "
  697 + f"query_token_extract_ms={query_token_extract_ms:.1f} | "
  698 + f"query_analysis_ms={query_analysis_ms:.1f} | "
  699 + f"async_submit_ms={async_submit_ms:.1f} | "
  700 + f"before_wait_ms={before_wait_ms:.1f} | "
  701 + f"async_wait_ms={async_wait_ms:.1f} | "
  702 + f"async_collect_ms={async_collect_ms:.1f} | "
  703 + f"base_keywords_ms={keywords_base_ms:.1f} | "
  704 + f"keyword_tail_ms={keyword_tail_ms:.1f} | "
  705 + f"style_intent_ms={style_intent_ms:.1f} | "
  706 + f"product_title_exclusion_ms={product_title_exclusion_ms:.1f} | "
  707 + f"tail_sync_ms={tail_sync_ms:.1f}"
  708 + )
595 709 if context:
596 710 context.store_intermediate_result(
597 711 "style_intent_profile",
... ... @@ -614,6 +728,7 @@ class QueryParser:
614 728 keywords_queries=keywords_queries,
615 729 style_intent_profile=style_intent_profile,
616 730 product_title_exclusion_profile=product_title_exclusion_profile,
  731 + _text_analysis_cache=text_analysis_cache,
617 732 )
618 733  
619 734 parse_total_ms = (time.perf_counter() - parse_t0) * 1000.0
... ...
query/style_intent.py
... ... @@ -7,7 +7,7 @@ from __future__ import annotations
7 7 from dataclasses import dataclass, field
8 8 from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Set, Tuple
9 9  
10   -from .tokenization import TokenizedText, normalize_query_text, tokenize_text
  10 +from .tokenization import QueryTextAnalysisCache, TokenizedText, normalize_query_text, tokenize_text
11 11  
12 12  
13 13 @dataclass(frozen=True)
... ... @@ -233,32 +233,63 @@ class StyleIntentDetector:
233 233 self.registry = registry
234 234 self.tokenizer = tokenizer
235 235  
236   - def _build_query_variants(self, parsed_query: Any) -> Tuple[TokenizedText, ...]:
237   - seen = set()
238   - variants: List[TokenizedText] = []
239   - texts = [
240   - self._get_language_query_text(parsed_query, "zh"),
241   - self._get_language_query_text(parsed_query, "en"),
242   - ]
  236 + def _max_term_ngram(self) -> int:
  237 + return max(
  238 + (definition.max_term_ngram for definition in self.registry.definitions.values()),
  239 + default=3,
  240 + )
  241 +
  242 + def _tokenize_text(
  243 + self,
  244 + text: str,
  245 + *,
  246 + analysis_cache: Optional[QueryTextAnalysisCache] = None,
  247 + ) -> TokenizedText:
  248 + max_term_ngram = self._max_term_ngram()
  249 + if analysis_cache is not None:
  250 + return analysis_cache.get_tokenized_text(text, max_ngram=max_term_ngram)
  251 + return tokenize_text(
  252 + text,
  253 + tokenizer=self.tokenizer,
  254 + max_ngram=max_term_ngram,
  255 + )
243 256  
244   - for raw_text in texts:
245   - text = str(raw_text or "").strip()
  257 + def _build_language_variants(
  258 + self,
  259 + parsed_query: Any,
  260 + *,
  261 + analysis_cache: Optional[QueryTextAnalysisCache] = None,
  262 + ) -> Dict[str, TokenizedText]:
  263 + variants: Dict[str, TokenizedText] = {}
  264 + for language in ("zh", "en"):
  265 + text = self._get_language_query_text(parsed_query, language).strip()
246 266 if not text:
247 267 continue
248   - normalized = normalize_query_text(text)
  268 + variants[language] = self._tokenize_text(
  269 + text,
  270 + analysis_cache=analysis_cache,
  271 + )
  272 + return variants
  273 +
  274 + def _build_query_variants(
  275 + self,
  276 + parsed_query: Any,
  277 + *,
  278 + language_variants: Optional[Dict[str, TokenizedText]] = None,
  279 + analysis_cache: Optional[QueryTextAnalysisCache] = None,
  280 + ) -> Tuple[TokenizedText, ...]:
  281 + seen = set()
  282 + variants: List[TokenizedText] = []
  283 +
  284 + for variant in (language_variants or self._build_language_variants(
  285 + parsed_query,
  286 + analysis_cache=analysis_cache,
  287 + )).values():
  288 + normalized = variant.normalized_text
249 289 if not normalized or normalized in seen:
250 290 continue
251 291 seen.add(normalized)
252   - variants.append(
253   - tokenize_text(
254   - text,
255   - tokenizer=self.tokenizer,
256   - max_ngram=max(
257   - (definition.max_term_ngram for definition in self.registry.definitions.values()),
258   - default=3,
259   - ),
260   - )
261   - )
  292 + variants.append(variant)
262 293  
263 294 return tuple(variants)
264 295  
... ... @@ -271,26 +302,50 @@ class StyleIntentDetector:
271 302 return str(translated)
272 303 return str(getattr(parsed_query, "original_query", "") or "")
273 304  
274   - def _tokenize_language_query(self, parsed_query: Any, language: str) -> Optional[TokenizedText]:
  305 + def _tokenize_language_query(
  306 + self,
  307 + parsed_query: Any,
  308 + language: str,
  309 + *,
  310 + language_variants: Optional[Dict[str, TokenizedText]] = None,
  311 + analysis_cache: Optional[QueryTextAnalysisCache] = None,
  312 + ) -> Optional[TokenizedText]:
  313 + if language_variants is not None:
  314 + return language_variants.get(language)
275 315 text = self._get_language_query_text(parsed_query, language).strip()
276 316 if not text:
277 317 return None
278   - return tokenize_text(
  318 + return self._tokenize_text(
279 319 text,
280   - tokenizer=self.tokenizer,
281   - max_ngram=max(
282   - (definition.max_term_ngram for definition in self.registry.definitions.values()),
283   - default=3,
284   - ),
  320 + analysis_cache=analysis_cache,
285 321 )
286 322  
287 323 def detect(self, parsed_query: Any) -> StyleIntentProfile:
288 324 if not self.registry.enabled or not self.registry.definitions:
289 325 return StyleIntentProfile()
290 326  
291   - query_variants = self._build_query_variants(parsed_query)
292   - zh_variant = self._tokenize_language_query(parsed_query, "zh")
293   - en_variant = self._tokenize_language_query(parsed_query, "en")
  327 + analysis_cache = getattr(parsed_query, "_text_analysis_cache", None)
  328 + language_variants = self._build_language_variants(
  329 + parsed_query,
  330 + analysis_cache=analysis_cache,
  331 + )
  332 + query_variants = self._build_query_variants(
  333 + parsed_query,
  334 + language_variants=language_variants,
  335 + analysis_cache=analysis_cache,
  336 + )
  337 + zh_variant = self._tokenize_language_query(
  338 + parsed_query,
  339 + "zh",
  340 + language_variants=language_variants,
  341 + analysis_cache=analysis_cache,
  342 + )
  343 + en_variant = self._tokenize_language_query(
  344 + parsed_query,
  345 + "en",
  346 + language_variants=language_variants,
  347 + analysis_cache=analysis_cache,
  348 + )
294 349 detected: List[DetectedStyleIntent] = []
295 350 seen_pairs = set()
296 351  
... ...
query/tokenization.py
... ... @@ -6,10 +6,11 @@ from __future__ import annotations
6 6  
7 7 from dataclasses import dataclass
8 8 import re
9   -from typing import Any, Callable, Iterable, List, Optional, Sequence, Tuple
  9 +from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple
10 10  
11 11  
12   -_TOKEN_PATTERN = re.compile(r"[\u4e00-\u9fff]+|[A-Za-z0-9_]+(?:-[A-Za-z0-9_]+)*")
  12 +_HAN_PATTERN = re.compile(r"[\u4e00-\u9fff]")
  13 +_TOKEN_PATTERN = re.compile(r"[\u4e00-\u9fff]+|[^\W_]+(?:[-'][^\W_]+)*", re.UNICODE)
13 14  
14 15  
15 16 def normalize_query_text(text: Optional[str]) -> str:
... ... @@ -30,6 +31,10 @@ def simple_tokenize_query(text: str) -&gt; List[str]:
30 31 return _TOKEN_PATTERN.findall(text)
31 32  
32 33  
def contains_han_text(text: Optional[str]) -> bool:
    """Return True when *text* contains at least one character in the CJK
    Unified Ideographs block (U+4E00-U+9FFF)."""
    if not text:
        return False
    return re.search(r"[\u4e00-\u9fff]", str(text)) is not None
  36 +
  37 +
33 38 def extract_token_strings(tokenizer_result: Any) -> List[str]:
34 39 """Normalize tokenizer output into a flat token string list."""
35 40 if not tokenizer_result:
... ... @@ -84,6 +89,13 @@ def _build_phrase_candidates(tokens: Sequence[str], max_ngram: int) -&gt; List[str]
84 89 return phrases
85 90  
86 91  
def _build_coarse_tokens(text: str, fine_tokens: Sequence[str]) -> List[str]:
    """Build the coarse token list for *text*.

    For Han-bearing text with model tokens available, the regex splitter would
    emit whole ideograph runs, so the deduped fine tokens are reused instead.

    Fix: the original computed the regex tokenization unconditionally and then
    discarded it on the Han path; the check is now done first so the wasted
    pass is skipped. Behavior is unchanged.
    """
    if fine_tokens and contains_han_text(text):
        return list(_dedupe_preserve_order(fine_tokens))
    # Latin/digit text: the lightweight regex tokenizer is sufficient.
    return _dedupe_preserve_order(simple_tokenize_query(text))
  97 +
  98 +
87 99 @dataclass(frozen=True)
88 100 class TokenizedText:
89 101 text: str
... ... @@ -93,30 +105,88 @@ class TokenizedText:
93 105 candidates: Tuple[str, ...]
94 106  
95 107  
class QueryTextAnalysisCache:
    """Per-parse cache for tokenizer output and derived token bundles.

    One instance is meant to live for a single query-parse pass: repeated
    requests for the same (stripped) input reuse both the raw tokenizer
    result and the derived TokenizedText bundle, so the model tokenizer runs
    at most once per distinct text.
    """

    def __init__(self, *, tokenizer: Optional[Callable[[str], Any]] = None) -> None:
        # Optional model tokenizer; absent -> regex splitter only.
        self.tokenizer = tokenizer
        # Raw tokenizer output keyed by stripped input text.
        self._tokenizer_results: Dict[str, Any] = {}
        # Derived bundles keyed by (stripped input, clamped max_ngram).
        self._tokenized_texts: Dict[Tuple[str, int], TokenizedText] = {}
        # Caller-provided language hints keyed by stripped input text.
        self._language_hints: Dict[str, str] = {}

    @staticmethod
    def _normalize_input(text: Optional[str]) -> str:
        """Coerce to str and strip; None becomes the empty string."""
        return str(text or "").strip()

    def set_language_hint(self, text: Optional[str], language: Optional[str]) -> None:
        """Remember a detected language for *text*; ignored when either is empty."""
        normalized_input = self._normalize_input(text)
        normalized_language = normalize_query_text(language)
        if normalized_input and normalized_language:
            self._language_hints[normalized_input] = normalized_language

    def get_language_hint(self, text: Optional[str]) -> Optional[str]:
        """Return the stored language hint for *text*, if any."""
        normalized_input = self._normalize_input(text)
        if not normalized_input:
            return None
        return self._language_hints.get(normalized_input)

    def _should_use_model_tokenizer(self, text: str) -> bool:
        """Decide whether *text* warrants the (expensive) model tokenizer.

        Only Han-bearing text benefits from it. Fix: the original also read
        the language hint but returned the identical Han check on both the
        "zh" and default branches, so the dead branch is removed; the hint
        API itself is kept for callers and future tuning.
        """
        if self.tokenizer is None:
            return False
        return contains_han_text(text)

    def get_tokenizer_result(self, text: Optional[str]) -> Any:
        """Raw token output for *text*, memoized only when the model tokenizer runs."""
        normalized_input = self._normalize_input(text)
        if not normalized_input:
            return []
        if not self._should_use_model_tokenizer(normalized_input):
            # Regex splitting is cheap enough that it is not memoized.
            return simple_tokenize_query(normalized_input)
        if normalized_input not in self._tokenizer_results:
            self._tokenizer_results[normalized_input] = self.tokenizer(normalized_input)
        return self._tokenizer_results[normalized_input]

    def get_tokenized_text(self, text: Optional[str], *, max_ngram: int = 3) -> TokenizedText:
        """Return (and cache) the full TokenizedText bundle for *text*.

        Fix: the clamped ``max_ngram`` is now used both for the cache key and
        for phrase building. Previously the key clamped to >= 1 while phrase
        candidates used the raw value, so e.g. ``max_ngram=0`` and
        ``max_ngram=1`` shared a cache slot yet could have produced different
        candidate sets depending on call order.
        """
        normalized_input = self._normalize_input(text)
        effective_ngram = max(1, int(max_ngram))
        cache_key = (normalized_input, effective_ngram)
        cached = self._tokenized_texts.get(cache_key)
        if cached is not None:
            return cached

        normalized_text = normalize_query_text(normalized_input)
        fine_raw = extract_token_strings(self.get_tokenizer_result(normalized_input))
        fine_tokens = _dedupe_preserve_order(fine_raw)
        coarse_tokens = _build_coarse_tokens(normalized_input, fine_tokens)

        bundle = TokenizedText(
            text=normalized_input,
            normalized_text=normalized_text,
            fine_tokens=tuple(fine_tokens),
            coarse_tokens=tuple(coarse_tokens),
            candidates=tuple(
                _dedupe_preserve_order(
                    list(fine_tokens)
                    + list(coarse_tokens)
                    + _build_phrase_candidates(fine_tokens, max_ngram=effective_ngram)
                    + _build_phrase_candidates(coarse_tokens, max_ngram=effective_ngram)
                    + ([normalized_text] if normalized_text else [])
                )
            ),
        )
        self._tokenized_texts[cache_key] = bundle
        return bundle
  181 +
  182 +
def tokenize_text(
    text: str,
    *,
    tokenizer: Optional[Callable[[str], Any]] = None,
    max_ngram: int = 3,
) -> TokenizedText:
    """One-shot tokenization: build a throwaway cache and delegate to it.

    Kept for callers that tokenize a single string; batch callers should hold
    a QueryTextAnalysisCache themselves so tokenizer results are reused.
    """
    one_shot_cache = QueryTextAnalysisCache(tokenizer=tokenizer)
    return one_shot_cache.get_tokenized_text(text, max_ngram=max_ngram)
... ...
suggestion/service.py
... ... @@ -7,7 +7,7 @@ import time
7 7 from typing import Any, Dict, List, Optional
8 8  
9 9 from config.tenant_config_loader import get_tenant_config_loader
10   -from query.query_parser import simple_tokenize_query
  10 +from query.tokenization import simple_tokenize_query
11 11 from suggestion.builder import get_suggestion_alias_name
12 12 from utils.es_client import ESClient
13 13  
... ...
tests/test_query_parser_mixed_language.py
... ... @@ -77,3 +77,79 @@ def test_parse_waits_for_translation_when_source_in_index_languages(monkeypatch)
77 77 assert result.detected_language == "en"
78 78 assert result.translations.get("zh") == "off shoulder top-zh"
79 79 assert not hasattr(result, "source_in_index_languages")
  80 +
  81 +
def test_parse_reuses_tokenization_across_tail_stages(monkeypatch):
    """Tail stages (style intent, title exclusions) must not re-run the tokenizer."""
    observed_tokenizer_inputs = []

    def counting_tokenizer(text):
        observed_tokenizer_inputs.append(str(text))
        return str(text).split()

    style_terms = {
        "color": [
            {"en_terms": ["black"], "zh_terms": ["黑色"], "attribute_terms": ["black"]}
        ],
    }
    exclusion_rules = [
        {
            "zh_trigger_terms": ["修身"],
            "en_trigger_terms": ["fitted"],
            "zh_title_exclusions": ["宽松"],
            "en_title_exclusions": ["loose"],
        }
    ]
    config = SearchConfig(
        es_index_name="test_products",
        field_boosts={"title.en": 3.0, "title.zh": 3.0},
        indexes=[IndexConfig(name="default", label="default", fields=["title.en", "title.zh"])],
        query_config=QueryConfig(
            enable_text_embedding=False,
            enable_query_rewrite=False,
            supported_languages=["en", "zh"],
            default_language="en",
            style_intent_terms=style_terms,
            style_intent_dimension_aliases={"color": ["color", "颜色"]},
            product_title_exclusion_rules=exclusion_rules,
        ),
        function_score=FunctionScoreConfig(),
        rerank=RerankConfig(),
        spu_config=SPUConfig(enabled=False),
    )
    parser = QueryParser(
        config,
        translator=_DummyTranslator(),
        tokenizer=counting_tokenizer,
    )
    monkeypatch.setattr(parser.language_detector, "detect", lambda text: "en")

    result = parser.parse(
        "black fitted dress",
        tenant_id="0",
        generate_vector=False,
        target_languages=["en", "zh"],
    )

    assert result.translations == {"zh": "black fitted dress-zh"}
    assert result.style_intent_profile is not None
    assert result.style_intent_profile.is_active is True
    assert result.product_title_exclusion_profile is not None
    assert result.product_title_exclusion_profile.is_active is True
    # Pure-ASCII query: the model tokenizer must never have been invoked.
    assert observed_tokenizer_inputs == []
  137 +
  138 +
def test_parse_fast_path_detects_ascii_query_as_english_without_lingua(monkeypatch):
    """ASCII queries should take the fast language path, never the Lingua detector."""
    parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer)

    def _fail_detect(text):
        raise AssertionError("Lingua path should not be used")

    monkeypatch.setattr(parser.language_detector, "detect", _fail_detect)

    result = parser.parse(
        "street t-shirt women",
        tenant_id="0",
        generate_vector=False,
        target_languages=["en", "zh"],
    )

    assert result.detected_language == "en"
    assert result.query_tokens == ["street", "t-shirt", "women"]
... ...
tests/test_style_intent.py
... ... @@ -58,3 +58,37 @@ def test_style_intent_detector_uses_original_query_when_language_translation_mis
58 58  
59 59 assert profile.get_canonical_values("color") == {"black"}
60 60 assert profile.intents[0].attribute_terms == ("black",)
  61 +
  62 +
def test_style_intent_detector_tokenizes_each_language_once():
    """Detection across multiple dimensions must tokenize each language text once."""
    query_config = QueryConfig(
        style_intent_terms={
            "color": [{"en_terms": ["black"], "zh_terms": ["黑色"], "attribute_terms": ["black"]}],
            "size": [{"en_terms": ["xl"], "zh_terms": ["加大码"], "attribute_terms": ["xl"]}],
        },
        style_intent_dimension_aliases={
            "color": ["color", "颜色"],
            "size": ["size", "尺码"],
        },
    )
    seen_inputs = []

    def counting_tokenizer(text):
        seen_inputs.append(text)
        return str(text).split()

    detector = StyleIntentDetector(
        StyleIntentRegistry.from_query_config(query_config),
        tokenizer=counting_tokenizer,
    )
    parsed_query = SimpleNamespace(
        original_query="黑色 连衣裙",
        query_normalized="黑色 连衣裙",
        rewritten_query="黑色 连衣裙",
        translations={"en": "black dress xl"},
    )

    profile = detector.detect(parsed_query)

    assert profile.is_active is True
    # Only the Han query text needs the model tokenizer, and only one pass of it.
    assert seen_inputs == ["黑色 连衣裙"]
... ...
tests/test_tokenization.py 0 → 100644
... ... @@ -0,0 +1,13 @@
  1 +from query.tokenization import QueryTextAnalysisCache
  2 +
  3 +
def test_han_coarse_tokens_follow_model_tokens_instead_of_whole_sentence():
    """For Han text, coarse tokens mirror the model tokens rather than one big run."""
    fake_model_tokens = [("路上", 0, 2), ("穿着", 2, 4), ("女性", 4, 6), ("黑色", 10, 12)]
    cache = QueryTextAnalysisCache(tokenizer=lambda text: fake_model_tokens)
    cache.set_language_hint("路上穿着女性的衣服是黑色的", "zh")

    tokenized = cache.get_tokenized_text("路上穿着女性的衣服是黑色的")

    expected = ("路上", "穿着", "女性", "黑色")
    assert tokenized.fine_tokens == expected
    assert tokenized.coarse_tokens == expected
... ...