Commit 45b397964fb80661b13dbc49c8fd03990123ea41

Authored by tangwang
1 parent 926e1e96

qp性能优化

config/config.yaml
@@ -116,8 +116,8 @@ query_config: @@ -116,8 +116,8 @@ query_config:
116 116
117 # 查询解析阶段:翻译与 query 向量并发执行,共用同一等待预算(毫秒)。 117 # 查询解析阶段:翻译与 query 向量并发执行,共用同一等待预算(毫秒)。
118 # 检测语言已在租户 index_languages 内:较短;不在索引语言内:较长(翻译对召回更关键)。 118 # 检测语言已在租户 index_languages 内:较短;不在索引语言内:较长(翻译对召回更关键)。
119 - translation_embedding_wait_budget_ms_source_in_index: 500 # 80  
120 - translation_embedding_wait_budget_ms_source_not_in_index: 700 #200 119 + translation_embedding_wait_budget_ms_source_in_index: 200 # 80
  120 + translation_embedding_wait_budget_ms_source_not_in_index: 300 #200
121 121
122 style_intent: 122 style_intent:
123 enabled: true 123 enabled: true
query/english_keyword_extractor.py 0 → 100644
@@ -0,0 +1,256 @@ @@ -0,0 +1,256 @@
  1 +"""
  2 +Lightweight English core-term extraction for lexical keyword constraints.
  3 +"""
  4 +
  5 +from __future__ import annotations
  6 +
  7 +import logging
  8 +from typing import List, Optional, Sequence, Set
  9 +
  10 +from .tokenization import normalize_query_text, simple_tokenize_query
  11 +
  12 +logger = logging.getLogger(__name__)
  13 +
# Adjectives that only weakly constrain a product query ("best shoes"); they
# are excluded from the extracted core terms.
_WEAK_BOOST_ADJS = frozenset(
    ("best", "good", "great", "new", "free", "cheap", "top", "fine", "real")
)

# Dependency labels of function words; tokens carrying these labels never
# survive into the final keyword list.
_FUNCTIONAL_DEP = frozenset(
    ("det", "aux", "auxpass", "prep", "mark", "expl", "cc", "punct", "case")
)

# Audience/demographic nouns that describe who a product is for rather than
# what the product is.
_DEMOGRAPHIC_NOUNS = frozenset(
    (
        "women", "woman", "men", "man",
        "kids", "kid", "boys", "boy", "girls", "girl",
        "baby", "babies", "toddler", "adult", "adults",
    )
)

# Prepositions that typically introduce a price/range constraint ("under 50").
_PRICE_PREP_LEMMAS = frozenset(("under", "over", "below", "above", "within", "between", "near"))

# Measurement nouns: when one of these is the root, its subject (not the noun
# itself) is treated as the product head.
_DIMENSION_ROOTS = frozenset(("size", "width", "length", "height", "weight"))
  64 +
  65 +
def _dedupe_preserve(seq: Sequence[str]) -> List[str]:
    """Normalize each item and drop blanks and duplicates, keeping first-seen order."""
    kept: List[str] = []
    observed: Set[str] = set()
    for raw in seq:
        candidate = normalize_query_text(raw)
        if candidate and candidate not in observed:
            observed.add(candidate)
            kept.append(candidate)
    return kept
  76 +
  77 +
def _lemma_lower(token) -> str:
    """Lowercased, stripped lemma of *token*, falling back to its surface text."""
    value = token.lemma_ or token.text or ""
    return value.lower().strip()
  80 +
  81 +
def _surface_lower(token) -> str:
    """Lowercased, stripped surface form of *token*."""
    surface = token.text or ""
    return surface.lower().strip()
  84 +
  85 +
def _project_terms_to_query_tokens(query: str, terms: Sequence[str]) -> List[str]:
    """Map extracted *terms* back onto the query's own tokens.

    For each normalized term (length >= 2, not a demographic noun) prefer an
    exact query-token match, then the first query token that contains the term
    as a substring (terms of length >= 3 only), and otherwise keep the
    normalized term itself. The result is deduplicated in order.
    """
    query_tokens = _dedupe_preserve(simple_tokenize_query(query))
    results: List[str] = []
    for raw_term in terms:
        term = normalize_query_text(raw_term)
        if len(term) < 2 or term in _DEMOGRAPHIC_NOUNS:
            continue
        if term in query_tokens:
            results.append(term)
            continue
        match = None
        if len(term) >= 3:
            for candidate in query_tokens:
                if term in candidate and candidate not in _DEMOGRAPHIC_NOUNS:
                    match = candidate
                    break
        results.append(match if match is not None else term)
    return _dedupe_preserve(results)
  110 +
  111 +
class EnglishKeywordExtractor:
    """Extracts a small set of English core product terms with spaCy."""

    def __init__(self, nlp: Optional[object] = None) -> None:
        # Accept an injected pipeline (useful for tests); otherwise lazily
        # load the default spaCy English model.
        self._nlp = nlp if nlp is not None else self._load_nlp()

    @staticmethod
    def _load_nlp() -> Optional[object]:
        """Load the small English spaCy pipeline, or return None when unavailable.

        NER and text classification are disabled because only tagging and
        dependency parsing are used below.
        """
        try:
            import spacy

            return spacy.load("en_core_web_sm", disable=["ner", "textcat"])
        except Exception as exc:
            # Missing package/model: degrade to the token-based fallback
            # rather than failing query parsing.
            logger.warning("English keyword extractor disabled; failed to load spaCy model: %s", exc)
            return None

    def extract_keywords(self, query: str) -> str:
        """Return a short space-separated string of core terms for *query*.

        Returns "" for blank input; falls back to simple tokenization when the
        spaCy pipeline is not loaded or raises during analysis.
        """
        text = str(query or "").strip()
        if not text:
            return ""
        if self._nlp is None:
            return self._fallback_keywords(text)
        try:
            return self._extract_keywords_with_spacy(text)
        except Exception as exc:
            logger.warning("spaCy English keyword extraction failed; using fallback: %s", exc)
            return self._fallback_keywords(text)

    def _extract_keywords_with_spacy(self, query: str) -> str:
        """Dependency-parse *query* and collect product-head candidates.

        Several passes add candidate surface forms to ``intersection``; weak
        adjectives, stop words, demographic nouns and function words are
        filtered at the end, then surviving terms are projected back onto the
        query's own tokens and capped at three.
        """
        doc = self._nlp(query)
        intersection: Set[str] = set()
        # Stop list: model stop words plus weak boosting adjectives ("best", ...).
        stops = self._nlp.Defaults.stop_words | _WEAK_BOOST_ADJS
        pobj_heads_to_demote: Set[int] = set()

        # Pass 1: demote objects of "for ..." (audience phrases, e.g. "for kids").
        for token in doc:
            if token.dep_ == "prep" and token.text.lower() == "for":
                for child in token.children:
                    if child.dep_ == "pobj" and child.pos_ in ("NOUN", "PROPN"):
                        pobj_heads_to_demote.add(child.i)

        # Pass 2: demote objects of price/range prepositions ("under", "between", ...).
        for token in doc:
            if token.dep_ != "prep" or _lemma_lower(token) not in _PRICE_PREP_LEMMAS:
                continue
            for child in token.children:
                if child.dep_ == "pobj" and child.pos_ in ("NOUN", "PROPN"):
                    pobj_heads_to_demote.add(child.i)

        # Pass 3: direct objects are strong product-head candidates.
        for token in doc:
            if token.dep_ == "dobj" and token.pos_ in ("NOUN", "PROPN") and token.i not in pobj_heads_to_demote:
                intersection.add(_surface_lower(token))

        # Pass 4: nominal subjects of an auxiliary root (copular questions).
        for token in doc:
            if token.dep_ == "nsubj" and token.pos_ in ("NOUN", "PROPN"):
                head = token.head
                if head.pos_ == "AUX" and head.dep_ == "ROOT":
                    intersection.add(_surface_lower(token))

        # Pass 5: interjection/proper-noun roots, and proper nouns generally
        # (presumably brand names — confirm), except proper-noun compounds
        # that modify a demographic noun.
        for token in doc:
            if token.dep_ == "ROOT" and token.pos_ in ("INTJ", "PROPN"):
                intersection.add(_surface_lower(token))
            if token.pos_ == "PROPN":
                if token.dep_ == "compound" and _lemma_lower(token.head) in _DEMOGRAPHIC_NOUNS:
                    continue
                intersection.add(_surface_lower(token))

        # Pass 6: noun roots. Dimension roots ("size", ...) contribute their
        # subject instead of themselves; demographic roots contribute their
        # compound modifier; demoted pobj heads are skipped.
        for token in doc:
            if token.dep_ == "ROOT" and token.pos_ in ("NOUN", "PROPN"):
                if _lemma_lower(token) in _DIMENSION_ROOTS:
                    for child in token.children:
                        if child.dep_ == "nsubj" and child.pos_ in ("NOUN", "PROPN"):
                            intersection.add(_surface_lower(child))
                    continue
                if _lemma_lower(token) in _DEMOGRAPHIC_NOUNS:
                    for child in token.children:
                        if child.dep_ == "compound" and child.pos_ == "NOUN":
                            intersection.add(_surface_lower(child))
                    continue
                if token.i in pobj_heads_to_demote:
                    continue
                intersection.add(_surface_lower(token))

        # Pass 7: roots whose direct children are prepositional objects.
        # NOTE(review): pobj as a direct child of the root is an unusual parse;
        # presumably this targets fragmentary queries — confirm with traffic.
        # Only INTJ roots actually contribute here despite the wider pos check.
        for token in doc:
            if token.dep_ != "ROOT" or token.pos_ not in ("INTJ", "VERB", "NOUN"):
                continue
            pobjs = sorted(
                [child for child in token.children if child.dep_ == "pobj" and child.pos_ in ("NOUN", "PROPN")],
                key=lambda item: item.i,
            )
            if len(pobjs) >= 2 and token.pos_ == "INTJ":
                intersection.add(_surface_lower(pobjs[0]))
                for extra in pobjs[1:]:
                    if _lemma_lower(extra) not in _DEMOGRAPHIC_NOUNS:
                        intersection.add(_surface_lower(extra))
            elif len(pobjs) == 1 and token.pos_ == "INTJ":
                intersection.add(_surface_lower(pobjs[0]))

        # Last resort: noun-chunk heads, skipping objects of price prepositions
        # and "for" phrases; proper nouns inside the chunk are kept too.
        if not intersection:
            for chunk in doc.noun_chunks:
                head = chunk.root
                if head.pos_ not in ("NOUN", "PROPN"):
                    continue
                if head.dep_ == "pobj" and head.head.dep_ == "prep":
                    prep = head.head
                    if _lemma_lower(prep) in _PRICE_PREP_LEMMAS or prep.text.lower() == "for":
                        continue
                head_text = _surface_lower(head)
                if head_text:
                    intersection.add(head_text)
                for token in chunk:
                    if token == head or token.pos_ != "PROPN":
                        continue
                    intersection.add(_surface_lower(token))

        # Re-walk the doc so output preserves document order, then filter out
        # stop words, demographic nouns, function words and 1-char tokens.
        core_terms = _dedupe_preserve(
            token.text.lower()
            for token in doc
            if _surface_lower(token) in intersection
            and _surface_lower(token) not in stops
            and _surface_lower(token) not in _DEMOGRAPHIC_NOUNS
            and token.dep_ not in _FUNCTIONAL_DEP
            and len(_surface_lower(token)) >= 2
        )
        projected_terms = _project_terms_to_query_tokens(query, core_terms)
        if projected_terms:
            # Cap at three terms to keep the lexical constraint tight.
            return " ".join(projected_terms[:3])
        return self._fallback_keywords(query)

    def _fallback_keywords(self, query: str) -> str:
        """Heuristic keyword pick without spaCy.

        Normalizes the simple tokenization of *query*, drops demographic nouns
        (unless nothing else remains), and keeps the right-most one or two
        tokens as the likely product head plus one modifier.
        """
        tokens = [
            normalize_query_text(token)
            for token in simple_tokenize_query(query)
            if normalize_query_text(token)
        ]
        if not tokens:
            return ""

        filtered = [token for token in tokens if token not in _DEMOGRAPHIC_NOUNS]
        if not filtered:
            filtered = tokens

        # Keep the right-most likely product head plus one close modifier.
        head = filtered[-1]
        if len(filtered) >= 2:
            return " ".join(filtered[-2:])
        return head
query/keyword_extractor.py
@@ -11,6 +11,9 @@ from __future__ import annotations @@ -11,6 +11,9 @@ from __future__ import annotations
11 import logging 11 import logging
12 from typing import Any, Dict, List, Optional 12 from typing import Any, Dict, List, Optional
13 13
  14 +from .english_keyword_extractor import EnglishKeywordExtractor
  15 +from .tokenization import QueryTextAnalysisCache
  16 +
14 logger = logging.getLogger(__name__) 17 logger = logging.getLogger(__name__)
15 18
16 import hanlp # type: ignore 19 import hanlp # type: ignore
@@ -21,7 +24,7 @@ KEYWORDS_QUERY_BASE_KEY = "base" @@ -21,7 +24,7 @@
21 # | 场景 | 推荐模型 | 24 # | 场景 | 推荐模型 |
22 # | :--------- | :------------------------------------------- | 25 # | :--------- | :------------------------------------------- |
23 # | 纯中文 + 最高精度 | CTB9_TOK_ELECTRA_BASE_CRF 或 MSR_TOK_ELECTRA_BASE_CRF | 26 # | 纯中文 + 最高精度 | CTB9_TOK_ELECTRA_BASE_CRF 或 MSR_TOK_ELECTRA_BASE_CRF |
24 -# | 纯中文 + 速度优先 | FINE_ELECTRA_SMALL_ZH(细粒度)或 COARSE_ELECTRA_SMALL_ZH(粗粒度) | 27 +# | 纯中文 + 速度优先 | FINE_ELECTRA_SMALL_ZH (细粒度)或 COARSE_ELECTRA_SMALL_ZH (粗粒度) |
25 # | **中英文混合** | `UD_TOK_MMINILMV2L6` 或 `UD_TOK_MMINILMV2L12` ( Transformer 编码器的层数不同)| 28 # | **中英文混合** | `UD_TOK_MMINILMV2L6` 或 `UD_TOK_MMINILMV2L12` ( Transformer 编码器的层数不同)|
26 29
27 30
@@ -33,23 +36,38 @@ class KeywordExtractor: @@ -33,23 +36,38 @@ class KeywordExtractor:
33 tokenizer: Optional[Any] = None, 36 tokenizer: Optional[Any] = None,
34 *, 37 *,
35 ignore_keywords: Optional[List[str]] = None, 38 ignore_keywords: Optional[List[str]] = None,
  39 + english_extractor: Optional[EnglishKeywordExtractor] = None,
36 ): 40 ):
37 if tokenizer is not None: 41 if tokenizer is not None:
38 self.tok = tokenizer 42 self.tok = tokenizer
39 else: 43 else:
40 - self.tok = hanlp.load(hanlp.pretrained.tok.UD_TOK_MMINILMV2L6) 44 + self.tok = hanlp.load(hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH)
41 self.tok.config.output_spans = True 45 self.tok.config.output_spans = True
42 self.pos_tag = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ELECTRA_SMALL) 46 self.pos_tag = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ELECTRA_SMALL)
43 self.ignore_keywords = frozenset(ignore_keywords or ["玩具"]) 47 self.ignore_keywords = frozenset(ignore_keywords or ["玩具"])
  48 + self.english_extractor = english_extractor or EnglishKeywordExtractor()
44 49
45 - def extract_keywords(self, query: str) -> str: 50 + def extract_keywords(
  51 + self,
  52 + query: str,
  53 + *,
  54 + language_hint: Optional[str] = None,
  55 + tokenizer_result: Optional[Any] = None,
  56 + ) -> str:
46 """ 57 """
47 从查询中提取关键词(名词,长度 ≥ 2),以空格分隔非连续片段。 58 从查询中提取关键词(名词,长度 ≥ 2),以空格分隔非连续片段。
48 """ 59 """
49 query = (query or "").strip() 60 query = (query or "").strip()
50 if not query: 61 if not query:
51 return "" 62 return ""
52 - tok_result_with_position = self.tok(query) 63 + normalized_language = str(language_hint or "").strip().lower()
  64 + if normalized_language == "en":
  65 + return self.english_extractor.extract_keywords(query)
  66 + if normalized_language and normalized_language != "zh":
  67 + return ""
  68 + tok_result_with_position = (
  69 + tokenizer_result if tokenizer_result is not None else self.tok(query)
  70 + )
53 tok_result = [x[0] for x in tok_result_with_position] 71 tok_result = [x[0] for x in tok_result_with_position]
54 if not tok_result: 72 if not tok_result:
55 return "" 73 return ""
@@ -72,6 +90,10 @@ def collect_keywords_queries( @@ -72,6 +90,10 @@ def collect_keywords_queries(
72 extractor: KeywordExtractor, 90 extractor: KeywordExtractor,
73 rewritten_query: str, 91 rewritten_query: str,
74 translations: Dict[str, str], 92 translations: Dict[str, str],
  93 + *,
  94 + source_language: Optional[str] = None,
  95 + text_analysis_cache: Optional[QueryTextAnalysisCache] = None,
  96 + base_keywords_query: Optional[str] = None,
75 ) -> Dict[str, str]: 97 ) -> Dict[str, str]:
76 """ 98 """
77 Build the keyword map for all lexical variants (base + translations). 99 Build the keyword map for all lexical variants (base + translations).
@@ -79,14 +101,40 @@ def collect_keywords_queries( @@ -79,14 +101,40 @@ def collect_keywords_queries(
79 Omits entries when extraction yields an empty string. 101 Omits entries when extraction yields an empty string.
80 """ 102 """
81 out: Dict[str, str] = {} 103 out: Dict[str, str] = {}
82 - base_kw = extractor.extract_keywords(rewritten_query) 104 + base_kw = base_keywords_query
  105 + if base_kw is None:
  106 + base_kw = extractor.extract_keywords(
  107 + rewritten_query,
  108 + language_hint=source_language or (
  109 + text_analysis_cache.get_language_hint(rewritten_query)
  110 + if text_analysis_cache is not None
  111 + else None
  112 + ),
  113 + tokenizer_result=(
  114 + text_analysis_cache.get_tokenizer_result(rewritten_query)
  115 + if text_analysis_cache is not None
  116 + else None
  117 + ),
  118 + )
83 if base_kw: 119 if base_kw:
84 out[KEYWORDS_QUERY_BASE_KEY] = base_kw 120 out[KEYWORDS_QUERY_BASE_KEY] = base_kw
85 for lang, text in translations.items(): 121 for lang, text in translations.items():
86 lang_key = str(lang or "").strip().lower() 122 lang_key = str(lang or "").strip().lower()
87 if not lang_key or not (text or "").strip(): 123 if not lang_key or not (text or "").strip():
88 continue 124 continue
89 - kw = extractor.extract_keywords(text) 125 + kw = extractor.extract_keywords(
  126 + text,
  127 + language_hint=lang_key or (
  128 + text_analysis_cache.get_language_hint(text)
  129 + if text_analysis_cache is not None
  130 + else None
  131 + ),
  132 + tokenizer_result=(
  133 + text_analysis_cache.get_tokenizer_result(text)
  134 + if text_analysis_cache is not None
  135 + else None
  136 + ),
  137 + )
90 if kw: 138 if kw:
91 out[lang_key] = kw 139 out[lang_key] = kw
92 return out 140 return out
query/product_title_exclusion.py
@@ -7,7 +7,7 @@ from __future__ import annotations @@ -7,7 +7,7 @@ from __future__ import annotations
7 from dataclasses import dataclass, field 7 from dataclasses import dataclass, field
8 from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Set, Tuple 8 from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Set, Tuple
9 9
10 -from .tokenization import TokenizedText, normalize_query_text, tokenize_text 10 +from .tokenization import QueryTextAnalysisCache, TokenizedText, normalize_query_text, tokenize_text
11 11
12 12
13 def _dedupe_terms(terms: Iterable[str]) -> List[str]: 13 def _dedupe_terms(terms: Iterable[str]) -> List[str]:
@@ -158,9 +158,27 @@ class ProductTitleExclusionDetector: @@ -158,9 +158,27 @@ class ProductTitleExclusionDetector:
158 self.registry = registry 158 self.registry = registry
159 self.tokenizer = tokenizer 159 self.tokenizer = tokenizer
160 160
  161 + def _tokenize_text(
  162 + self,
  163 + text: str,
  164 + *,
  165 + analysis_cache: Optional[QueryTextAnalysisCache] = None,
  166 + ) -> TokenizedText:
  167 + if analysis_cache is not None:
  168 + return analysis_cache.get_tokenized_text(
  169 + text,
  170 + max_ngram=self.registry.max_term_ngram,
  171 + )
  172 + return tokenize_text(
  173 + text,
  174 + tokenizer=self.tokenizer,
  175 + max_ngram=self.registry.max_term_ngram,
  176 + )
  177 +
161 def _build_query_variants(self, parsed_query: Any) -> Tuple[TokenizedText, ...]: 178 def _build_query_variants(self, parsed_query: Any) -> Tuple[TokenizedText, ...]:
162 seen = set() 179 seen = set()
163 variants: List[TokenizedText] = [] 180 variants: List[TokenizedText] = []
  181 + analysis_cache = getattr(parsed_query, "_text_analysis_cache", None)
164 texts = [ 182 texts = [
165 getattr(parsed_query, "original_query", None), 183 getattr(parsed_query, "original_query", None),
166 getattr(parsed_query, "query_normalized", None), 184 getattr(parsed_query, "query_normalized", None),
@@ -180,10 +198,9 @@ class ProductTitleExclusionDetector: @@ -180,10 +198,9 @@ class ProductTitleExclusionDetector:
180 continue 198 continue
181 seen.add(normalized) 199 seen.add(normalized)
182 variants.append( 200 variants.append(
183 - tokenize_text( 201 + self._tokenize_text(
184 text, 202 text,
185 - tokenizer=self.tokenizer,  
186 - max_ngram=self.registry.max_term_ngram, 203 + analysis_cache=analysis_cache,
187 ) 204 )
188 ) 205 )
189 206
query/query_parser.py
@@ -27,7 +27,7 @@ from .product_title_exclusion import ( @@ -27,7 +27,7 @@ from .product_title_exclusion import (
27 ) 27 )
28 from .query_rewriter import QueryRewriter, QueryNormalizer 28 from .query_rewriter import QueryRewriter, QueryNormalizer
29 from .style_intent import StyleIntentDetector, StyleIntentProfile, StyleIntentRegistry 29 from .style_intent import StyleIntentDetector, StyleIntentProfile, StyleIntentRegistry
30 -from .tokenization import extract_token_strings, simple_tokenize_query 30 +from .tokenization import QueryTextAnalysisCache, contains_han_text, extract_token_strings
31 from .keyword_extractor import KeywordExtractor, collect_keywords_queries 31 from .keyword_extractor import KeywordExtractor, collect_keywords_queries
32 32
33 logger = logging.getLogger(__name__) 33 logger = logging.getLogger(__name__)
@@ -119,6 +119,7 @@ class ParsedQuery: @@ -119,6 +119,7 @@ class ParsedQuery:
119 keywords_queries: Dict[str, str] = field(default_factory=dict) 119 keywords_queries: Dict[str, str] = field(default_factory=dict)
120 style_intent_profile: Optional[StyleIntentProfile] = None 120 style_intent_profile: Optional[StyleIntentProfile] = None
121 product_title_exclusion_profile: Optional[ProductTitleExclusionProfile] = None 121 product_title_exclusion_profile: Optional[ProductTitleExclusionProfile] = None
  122 + _text_analysis_cache: Optional[QueryTextAnalysisCache] = field(default=None, repr=False)
122 123
123 def text_for_rerank(self) -> str: 124 def text_for_rerank(self) -> str:
124 """See :func:`rerank_query_text`.""" 125 """See :func:`rerank_query_text`."""
@@ -238,7 +239,7 @@ class QueryParser: @@ -238,7 +239,7 @@ class QueryParser:
238 if hanlp is None: 239 if hanlp is None:
239 raise RuntimeError("HanLP is required for QueryParser tokenization") 240 raise RuntimeError("HanLP is required for QueryParser tokenization")
240 logger.info("Initializing HanLP tokenizer...") 241 logger.info("Initializing HanLP tokenizer...")
241 - tokenizer = hanlp.load(hanlp.pretrained.tok.CTB9_TOK_ELECTRA_BASE_CRF) 242 + tokenizer = hanlp.load(hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH)
242 tokenizer.config.output_spans = True 243 tokenizer.config.output_spans = True
243 logger.info("HanLP tokenizer initialized") 244 logger.info("HanLP tokenizer initialized")
244 return tokenizer 245 return tokenizer
@@ -288,6 +289,33 @@ class QueryParser: @@ -288,6 +289,33 @@ class QueryParser:
288 def _get_query_tokens(self, query: str) -> List[str]: 289 def _get_query_tokens(self, query: str) -> List[str]:
289 return self._extract_tokens(self._tokenizer(query)) 290 return self._extract_tokens(self._tokenizer(query))
290 291
  292 + @staticmethod
  293 + def _is_ascii_latin_query(text: str) -> bool:
  294 + candidate = str(text or "").strip()
  295 + if not candidate or contains_han_text(candidate):
  296 + return False
  297 + try:
  298 + candidate.encode("ascii")
  299 + except UnicodeEncodeError:
  300 + return False
  301 + return any(ch.isalpha() for ch in candidate)
  302 +
  303 + def _detect_query_language(
  304 + self,
  305 + query_text: str,
  306 + *,
  307 + target_languages: Optional[List[str]] = None,
  308 + ) -> str:
  309 + normalized_targets = self._normalize_language_codes(target_languages)
  310 + supported_languages = self._normalize_language_codes(
  311 + getattr(self.config.query_config, "supported_languages", None)
  312 + )
  313 + active_languages = normalized_targets or supported_languages
  314 + if active_languages and set(active_languages).issubset({"en", "zh"}):
  315 + if self._is_ascii_latin_query(query_text):
  316 + return "en"
  317 + return self.language_detector.detect(query_text)
  318 +
291 def parse( 319 def parse(
292 self, 320 self,
293 query: str, 321 query: str,
@@ -332,12 +360,15 @@ class QueryParser: @@ -332,12 +360,15 @@ class QueryParser:
332 active_logger.debug(msg) 360 active_logger.debug(msg)
333 361
334 # Stage 1: Normalize 362 # Stage 1: Normalize
  363 + normalize_t0 = time.perf_counter()
335 normalized = self.normalizer.normalize(query) 364 normalized = self.normalizer.normalize(query)
  365 + normalize_ms = (time.perf_counter() - normalize_t0) * 1000.0
336 log_debug(f"Normalization completed | '{query}' -> '{normalized}'") 366 log_debug(f"Normalization completed | '{query}' -> '{normalized}'")
337 if context: 367 if context:
338 context.store_intermediate_result('query_normalized', normalized) 368 context.store_intermediate_result('query_normalized', normalized)
339 369
340 # Stage 2: Query rewriting 370 # Stage 2: Query rewriting
  371 + rewrite_t0 = time.perf_counter()
341 query_text = normalized 372 query_text = normalized
342 rewritten = normalized 373 rewritten = normalized
343 if self.config.query_config.rewrite_dictionary: # Enable rewrite if dictionary exists 374 if self.config.query_config.rewrite_dictionary: # Enable rewrite if dictionary exists
@@ -348,21 +379,26 @@ class QueryParser: @@ -348,21 +379,26 @@ class QueryParser:
348 if context: 379 if context:
349 context.store_intermediate_result('rewritten_query', rewritten) 380 context.store_intermediate_result('rewritten_query', rewritten)
350 context.add_warning(f"Query was rewritten: {query_text}") 381 context.add_warning(f"Query was rewritten: {query_text}")
  382 + rewrite_ms = (time.perf_counter() - rewrite_t0) * 1000.0
  383 +
  384 + normalized_targets = self._normalize_language_codes(target_languages)
351 385
352 # Stage 3: Language detection 386 # Stage 3: Language detection
353 - detected_lang = self.language_detector.detect(query_text) 387 + language_detect_t0 = time.perf_counter()
  388 + detected_lang = self._detect_query_language(
  389 + query_text,
  390 + target_languages=normalized_targets,
  391 + )
354 # Use default language if detection failed (None or "unknown") 392 # Use default language if detection failed (None or "unknown")
355 if not detected_lang or detected_lang == "unknown": 393 if not detected_lang or detected_lang == "unknown":
356 detected_lang = self.config.query_config.default_language 394 detected_lang = self.config.query_config.default_language
  395 + language_detect_ms = (time.perf_counter() - language_detect_t0) * 1000.0
357 log_info(f"Language detection | Detected language: {detected_lang}") 396 log_info(f"Language detection | Detected language: {detected_lang}")
358 if context: 397 if context:
359 context.store_intermediate_result('detected_language', detected_lang) 398 context.store_intermediate_result('detected_language', detected_lang)
360 - # Stage 4: Query analysis (tokenization)  
361 - query_tokens = self._get_query_tokens(query_text)  
362 -  
363 - log_debug(f"Query analysis | Query tokens: {query_tokens}")  
364 - if context:  
365 - context.store_intermediate_result('query_tokens', query_tokens) 399 + text_analysis_cache = QueryTextAnalysisCache(tokenizer=self._tokenizer)
  400 + for text_variant in (query, normalized, query_text):
  401 + text_analysis_cache.set_language_hint(text_variant, detected_lang)
366 402
367 # Stage 5: Translation + embedding. Parser only coordinates async enrichment work; the 403 # Stage 5: Translation + embedding. Parser only coordinates async enrichment work; the
368 # caller decides translation targets and later search-field planning. 404 # caller decides translation targets and later search-field planning.
@@ -371,7 +407,6 @@ class QueryParser: @@ -371,7 +407,6 @@ class QueryParser:
371 future_submit_at: Dict[Any, float] = {} 407 future_submit_at: Dict[Any, float] = {}
372 async_executor: Optional[ThreadPoolExecutor] = None 408 async_executor: Optional[ThreadPoolExecutor] = None
373 detected_norm = str(detected_lang or "").strip().lower() 409 detected_norm = str(detected_lang or "").strip().lower()
374 - normalized_targets = self._normalize_language_codes(target_languages)  
375 translation_targets = [lang for lang in normalized_targets if lang != detected_norm] 410 translation_targets = [lang for lang in normalized_targets if lang != detected_norm]
376 source_language_in_index = bool(normalized_targets) and detected_norm in normalized_targets 411 source_language_in_index = bool(normalized_targets) and detected_norm in normalized_targets
377 412
@@ -398,7 +433,9 @@ class QueryParser: @@ -398,7 +433,9 @@ class QueryParser:
398 thread_name_prefix="query-enrichment", 433 thread_name_prefix="query-enrichment",
399 ) 434 )
400 435
  436 + async_submit_ms = 0.0
401 try: 437 try:
  438 + async_submit_t0 = time.perf_counter()
402 if async_executor is not None: 439 if async_executor is not None:
403 for lang in translation_targets: 440 for lang in translation_targets:
404 model_name = self._pick_query_translation_model( 441 model_name = self._pick_query_translation_model(
@@ -466,6 +503,7 @@ class QueryParser: @@ -466,6 +503,7 @@ class QueryParser:
466 future = async_executor.submit(_encode_image_query_vector) 503 future = async_executor.submit(_encode_image_query_vector)
467 future_to_task[future] = ("image_embedding", None) 504 future_to_task[future] = ("image_embedding", None)
468 future_submit_at[future] = time.perf_counter() 505 future_submit_at[future] = time.perf_counter()
  506 + async_submit_ms = (time.perf_counter() - async_submit_t0) * 1000.0
469 except Exception as e: 507 except Exception as e:
470 error_msg = f"Async query enrichment submission failed | Error: {str(e)}" 508 error_msg = f"Async query enrichment submission failed | Error: {str(e)}"
471 log_info(error_msg) 509 log_info(error_msg)
@@ -477,6 +515,33 @@ class QueryParser: @@ -477,6 +515,33 @@ class QueryParser:
477 future_to_task.clear() 515 future_to_task.clear()
478 future_submit_at.clear() 516 future_submit_at.clear()
479 517
  518 + # Stage 4: Query analysis (tokenization) now overlaps with async enrichment work.
  519 + query_analysis_t0 = time.perf_counter()
  520 + query_tokenizer_t0 = time.perf_counter()
  521 + query_tokenizer_result = text_analysis_cache.get_tokenizer_result(query_text)
  522 + query_tokenizer_ms = (time.perf_counter() - query_tokenizer_t0) * 1000.0
  523 + query_token_extract_t0 = time.perf_counter()
  524 + query_tokens = self._extract_tokens(query_tokenizer_result)
  525 + query_token_extract_ms = (time.perf_counter() - query_token_extract_t0) * 1000.0
  526 + query_analysis_ms = (time.perf_counter() - query_analysis_t0) * 1000.0
  527 +
  528 + log_debug(f"Query analysis | Query tokens: {query_tokens}")
  529 + if context:
  530 + context.store_intermediate_result('query_tokens', query_tokens)
  531 +
  532 + keywords_base_query = ""
  533 + keywords_base_ms = 0.0
  534 + try:
  535 + keywords_base_t0 = time.perf_counter()
  536 + keywords_base_query = self._keyword_extractor.extract_keywords(
  537 + query_text,
  538 + language_hint=detected_lang,
  539 + tokenizer_result=text_analysis_cache.get_tokenizer_result(query_text),
  540 + )
  541 + keywords_base_ms = (time.perf_counter() - keywords_base_t0) * 1000.0
  542 + except Exception as e:
  543 + log_info(f"Base keyword extraction failed | Error: {e}")
  544 +
480 # Wait for translation + embedding concurrently; shared budget depends on whether 545 # Wait for translation + embedding concurrently; shared budget depends on whether
481 # the detected language belongs to caller-provided target_languages. 546 # the detected language belongs to caller-provided target_languages.
482 qc = self.config.query_config 547 qc = self.config.query_config
@@ -501,7 +566,10 @@ class QueryParser: @@ -501,7 +566,10 @@ class QueryParser:
501 f"source_in_target_languages={source_in_target_languages}" 566 f"source_in_target_languages={source_in_target_languages}"
502 ) 567 )
503 568
  569 + async_wait_t0 = time.perf_counter()
504 done, not_done = wait(list(future_to_task.keys()), timeout=budget_sec) 570 done, not_done = wait(list(future_to_task.keys()), timeout=budget_sec)
  571 + async_wait_ms = (time.perf_counter() - async_wait_t0) * 1000.0
  572 + async_collect_t0 = time.perf_counter()
505 for future in done: 573 for future in done:
506 task_type, lang = future_to_task[future] 574 task_type, lang = future_to_task[future]
507 t0 = future_submit_at.pop(future, None) 575 t0 = future_submit_at.pop(future, None)
@@ -511,6 +579,7 @@ class QueryParser: @@ -511,6 +579,7 @@ class QueryParser:
511 if task_type == "translation": 579 if task_type == "translation":
512 if result: 580 if result:
513 translations[lang] = result 581 translations[lang] = result
  582 + text_analysis_cache.set_language_hint(result, lang)
514 if context: 583 if context:
515 context.store_intermediate_result(f"translation_{lang}", result) 584 context.store_intermediate_result(f"translation_{lang}", result)
516 elif task_type == "embedding": 585 elif task_type == "embedding":
@@ -561,20 +630,31 @@ class QueryParser: @@ -561,20 +630,31 @@ class QueryParser:
561 log_info(timeout_msg) 630 log_info(timeout_msg)
562 if context: 631 if context:
563 context.add_warning(timeout_msg) 632 context.add_warning(timeout_msg)
  633 + async_collect_ms = (time.perf_counter() - async_collect_t0) * 1000.0
564 634
565 if async_executor: 635 if async_executor:
566 async_executor.shutdown(wait=False) 636 async_executor.shutdown(wait=False)
567 637
568 if translations and context: 638 if translations and context:
569 context.store_intermediate_result("translations", translations) 639 context.store_intermediate_result("translations", translations)
  640 + else:
  641 + async_wait_ms = 0.0
  642 + async_collect_ms = 0.0
570 643
  644 + tail_sync_t0 = time.perf_counter()
571 keywords_queries: Dict[str, str] = {} 645 keywords_queries: Dict[str, str] = {}
  646 + keyword_tail_ms = 0.0
572 try: 647 try:
  648 + keywords_t0 = time.perf_counter()
573 keywords_queries = collect_keywords_queries( 649 keywords_queries = collect_keywords_queries(
574 self._keyword_extractor, 650 self._keyword_extractor,
575 query_text, 651 query_text,
576 translations, 652 translations,
  653 + source_language=detected_lang,
  654 + text_analysis_cache=text_analysis_cache,
  655 + base_keywords_query=keywords_base_query,
577 ) 656 )
  657 + keyword_tail_ms = (time.perf_counter() - keywords_t0) * 1000.0
578 except Exception as e: 658 except Exception as e:
579 log_info(f"Keyword extraction failed | Error: {e}") 659 log_info(f"Keyword extraction failed | Error: {e}")
580 660
@@ -589,9 +669,43 @@ class QueryParser: @@ -589,9 +669,43 @@ class QueryParser:
589 image_query_vector=image_query_vector, 669 image_query_vector=image_query_vector,
590 query_tokens=query_tokens, 670 query_tokens=query_tokens,
591 keywords_queries=keywords_queries, 671 keywords_queries=keywords_queries,
  672 + _text_analysis_cache=text_analysis_cache,
592 ) 673 )
  674 + style_intent_t0 = time.perf_counter()
593 style_intent_profile = self.style_intent_detector.detect(base_result) 675 style_intent_profile = self.style_intent_detector.detect(base_result)
  676 + style_intent_ms = (time.perf_counter() - style_intent_t0) * 1000.0
  677 + product_title_exclusion_t0 = time.perf_counter()
594 product_title_exclusion_profile = self.product_title_exclusion_detector.detect(base_result) 678 product_title_exclusion_profile = self.product_title_exclusion_detector.detect(base_result)
  679 + product_title_exclusion_ms = (
  680 + (time.perf_counter() - product_title_exclusion_t0) * 1000.0
  681 + )
  682 + tail_sync_ms = (time.perf_counter() - tail_sync_t0) * 1000.0
  683 + before_wait_ms = (
  684 + normalize_ms
  685 + + rewrite_ms
  686 + + language_detect_ms
  687 + + async_submit_ms
  688 + + query_analysis_ms
  689 + + keywords_base_ms
  690 + )
  691 + log_info(
  692 + "Query parse stage timings | "
  693 + f"normalize_ms={normalize_ms:.1f} | "
  694 + f"rewrite_ms={rewrite_ms:.1f} | "
  695 + f"language_detect_ms={language_detect_ms:.1f} | "
  696 + f"query_tokenizer_ms={query_tokenizer_ms:.1f} | "
  697 + f"query_token_extract_ms={query_token_extract_ms:.1f} | "
  698 + f"query_analysis_ms={query_analysis_ms:.1f} | "
  699 + f"async_submit_ms={async_submit_ms:.1f} | "
  700 + f"before_wait_ms={before_wait_ms:.1f} | "
  701 + f"async_wait_ms={async_wait_ms:.1f} | "
  702 + f"async_collect_ms={async_collect_ms:.1f} | "
  703 + f"base_keywords_ms={keywords_base_ms:.1f} | "
  704 + f"keyword_tail_ms={keyword_tail_ms:.1f} | "
  705 + f"style_intent_ms={style_intent_ms:.1f} | "
  706 + f"product_title_exclusion_ms={product_title_exclusion_ms:.1f} | "
  707 + f"tail_sync_ms={tail_sync_ms:.1f}"
  708 + )
595 if context: 709 if context:
596 context.store_intermediate_result( 710 context.store_intermediate_result(
597 "style_intent_profile", 711 "style_intent_profile",
@@ -614,6 +728,7 @@ class QueryParser: @@ -614,6 +728,7 @@ class QueryParser:
614 keywords_queries=keywords_queries, 728 keywords_queries=keywords_queries,
615 style_intent_profile=style_intent_profile, 729 style_intent_profile=style_intent_profile,
616 product_title_exclusion_profile=product_title_exclusion_profile, 730 product_title_exclusion_profile=product_title_exclusion_profile,
  731 + _text_analysis_cache=text_analysis_cache,
617 ) 732 )
618 733
619 parse_total_ms = (time.perf_counter() - parse_t0) * 1000.0 734 parse_total_ms = (time.perf_counter() - parse_t0) * 1000.0
query/style_intent.py
@@ -7,7 +7,7 @@ from __future__ import annotations @@ -7,7 +7,7 @@ from __future__ import annotations
7 from dataclasses import dataclass, field 7 from dataclasses import dataclass, field
8 from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Set, Tuple 8 from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Set, Tuple
9 9
10 -from .tokenization import TokenizedText, normalize_query_text, tokenize_text 10 +from .tokenization import QueryTextAnalysisCache, TokenizedText, normalize_query_text, tokenize_text
11 11
12 12
13 @dataclass(frozen=True) 13 @dataclass(frozen=True)
@@ -233,32 +233,63 @@ class StyleIntentDetector: @@ -233,32 +233,63 @@ class StyleIntentDetector:
233 self.registry = registry 233 self.registry = registry
234 self.tokenizer = tokenizer 234 self.tokenizer = tokenizer
235 235
236 - def _build_query_variants(self, parsed_query: Any) -> Tuple[TokenizedText, ...]:  
237 - seen = set()  
238 - variants: List[TokenizedText] = []  
239 - texts = [  
240 - self._get_language_query_text(parsed_query, "zh"),  
241 - self._get_language_query_text(parsed_query, "en"),  
242 - ] 236 + def _max_term_ngram(self) -> int:
  237 + return max(
  238 + (definition.max_term_ngram for definition in self.registry.definitions.values()),
  239 + default=3,
  240 + )
  241 +
  242 + def _tokenize_text(
  243 + self,
  244 + text: str,
  245 + *,
  246 + analysis_cache: Optional[QueryTextAnalysisCache] = None,
  247 + ) -> TokenizedText:
  248 + max_term_ngram = self._max_term_ngram()
  249 + if analysis_cache is not None:
  250 + return analysis_cache.get_tokenized_text(text, max_ngram=max_term_ngram)
  251 + return tokenize_text(
  252 + text,
  253 + tokenizer=self.tokenizer,
  254 + max_ngram=max_term_ngram,
  255 + )
243 256
244 - for raw_text in texts:  
245 - text = str(raw_text or "").strip() 257 + def _build_language_variants(
  258 + self,
  259 + parsed_query: Any,
  260 + *,
  261 + analysis_cache: Optional[QueryTextAnalysisCache] = None,
  262 + ) -> Dict[str, TokenizedText]:
  263 + variants: Dict[str, TokenizedText] = {}
  264 + for language in ("zh", "en"):
  265 + text = self._get_language_query_text(parsed_query, language).strip()
246 if not text: 266 if not text:
247 continue 267 continue
248 - normalized = normalize_query_text(text) 268 + variants[language] = self._tokenize_text(
  269 + text,
  270 + analysis_cache=analysis_cache,
  271 + )
  272 + return variants
  273 +
  274 + def _build_query_variants(
  275 + self,
  276 + parsed_query: Any,
  277 + *,
  278 + language_variants: Optional[Dict[str, TokenizedText]] = None,
  279 + analysis_cache: Optional[QueryTextAnalysisCache] = None,
  280 + ) -> Tuple[TokenizedText, ...]:
  281 + seen = set()
  282 + variants: List[TokenizedText] = []
  283 +
  284 + for variant in (language_variants or self._build_language_variants(
  285 + parsed_query,
  286 + analysis_cache=analysis_cache,
  287 + )).values():
  288 + normalized = variant.normalized_text
249 if not normalized or normalized in seen: 289 if not normalized or normalized in seen:
250 continue 290 continue
251 seen.add(normalized) 291 seen.add(normalized)
252 - variants.append(  
253 - tokenize_text(  
254 - text,  
255 - tokenizer=self.tokenizer,  
256 - max_ngram=max(  
257 - (definition.max_term_ngram for definition in self.registry.definitions.values()),  
258 - default=3,  
259 - ),  
260 - )  
261 - ) 292 + variants.append(variant)
262 293
263 return tuple(variants) 294 return tuple(variants)
264 295
@@ -271,26 +302,50 @@ class StyleIntentDetector: @@ -271,26 +302,50 @@ class StyleIntentDetector:
271 return str(translated) 302 return str(translated)
272 return str(getattr(parsed_query, "original_query", "") or "") 303 return str(getattr(parsed_query, "original_query", "") or "")
273 304
274 - def _tokenize_language_query(self, parsed_query: Any, language: str) -> Optional[TokenizedText]: 305 + def _tokenize_language_query(
  306 + self,
  307 + parsed_query: Any,
  308 + language: str,
  309 + *,
  310 + language_variants: Optional[Dict[str, TokenizedText]] = None,
  311 + analysis_cache: Optional[QueryTextAnalysisCache] = None,
  312 + ) -> Optional[TokenizedText]:
  313 + if language_variants is not None:
  314 + return language_variants.get(language)
275 text = self._get_language_query_text(parsed_query, language).strip() 315 text = self._get_language_query_text(parsed_query, language).strip()
276 if not text: 316 if not text:
277 return None 317 return None
278 - return tokenize_text( 318 + return self._tokenize_text(
279 text, 319 text,
280 - tokenizer=self.tokenizer,  
281 - max_ngram=max(  
282 - (definition.max_term_ngram for definition in self.registry.definitions.values()),  
283 - default=3,  
284 - ), 320 + analysis_cache=analysis_cache,
285 ) 321 )
286 322
287 def detect(self, parsed_query: Any) -> StyleIntentProfile: 323 def detect(self, parsed_query: Any) -> StyleIntentProfile:
288 if not self.registry.enabled or not self.registry.definitions: 324 if not self.registry.enabled or not self.registry.definitions:
289 return StyleIntentProfile() 325 return StyleIntentProfile()
290 326
291 - query_variants = self._build_query_variants(parsed_query)  
292 - zh_variant = self._tokenize_language_query(parsed_query, "zh")  
293 - en_variant = self._tokenize_language_query(parsed_query, "en") 327 + analysis_cache = getattr(parsed_query, "_text_analysis_cache", None)
  328 + language_variants = self._build_language_variants(
  329 + parsed_query,
  330 + analysis_cache=analysis_cache,
  331 + )
  332 + query_variants = self._build_query_variants(
  333 + parsed_query,
  334 + language_variants=language_variants,
  335 + analysis_cache=analysis_cache,
  336 + )
  337 + zh_variant = self._tokenize_language_query(
  338 + parsed_query,
  339 + "zh",
  340 + language_variants=language_variants,
  341 + analysis_cache=analysis_cache,
  342 + )
  343 + en_variant = self._tokenize_language_query(
  344 + parsed_query,
  345 + "en",
  346 + language_variants=language_variants,
  347 + analysis_cache=analysis_cache,
  348 + )
294 detected: List[DetectedStyleIntent] = [] 349 detected: List[DetectedStyleIntent] = []
295 seen_pairs = set() 350 seen_pairs = set()
296 351
query/tokenization.py
@@ -6,10 +6,11 @@ from __future__ import annotations @@ -6,10 +6,11 @@ from __future__ import annotations
6 6
7 from dataclasses import dataclass 7 from dataclasses import dataclass
8 import re 8 import re
9 -from typing import Any, Callable, Iterable, List, Optional, Sequence, Tuple 9 +from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple
10 10
11 11
12 -_TOKEN_PATTERN = re.compile(r"[\u4e00-\u9fff]+|[A-Za-z0-9_]+(?:-[A-Za-z0-9_]+)*") 12 +_HAN_PATTERN = re.compile(r"[\u4e00-\u9fff]")
  13 +_TOKEN_PATTERN = re.compile(r"[\u4e00-\u9fff]+|[^\W_]+(?:[-'][^\W_]+)*", re.UNICODE)
13 14
14 15
15 def normalize_query_text(text: Optional[str]) -> str: 16 def normalize_query_text(text: Optional[str]) -> str:
@@ -30,6 +31,10 @@ def simple_tokenize_query(text: str) -> List[str]: @@ -30,6 +31,10 @@ def simple_tokenize_query(text: str) -> List[str]:
30 return _TOKEN_PATTERN.findall(text) 31 return _TOKEN_PATTERN.findall(text)
31 32
32 33
def contains_han_text(text: Optional[str]) -> bool:
    """Return True when *text* contains at least one Han (CJK unified) character."""
    if not text:
        return False
    # `re` caches compiled patterns, so repeat calls stay cheap.
    return re.search(r"[\u4e00-\u9fff]", str(text)) is not None
  36 +
  37 +
33 def extract_token_strings(tokenizer_result: Any) -> List[str]: 38 def extract_token_strings(tokenizer_result: Any) -> List[str]:
34 """Normalize tokenizer output into a flat token string list.""" 39 """Normalize tokenizer output into a flat token string list."""
35 if not tokenizer_result: 40 if not tokenizer_result:
@@ -84,6 +89,13 @@ def _build_phrase_candidates(tokens: Sequence[str], max_ngram: int) -> List[str] @@ -84,6 +89,13 @@ def _build_phrase_candidates(tokens: Sequence[str], max_ngram: int) -> List[str]
84 return phrases 89 return phrases
85 90
86 91
def _build_coarse_tokens(text: str, fine_tokens: Sequence[str]) -> List[str]:
    """Coarse token list for *text*.

    For Han text with model (fine) tokens available, the fine tokens stand in
    for the coarse tokens — the regex tokenizer would otherwise emit whole Han
    runs instead of words.

    Fix: the regex tokenization pass is now skipped entirely on that Han fast
    path instead of being computed and thrown away.
    """
    if contains_han_text(text) and fine_tokens:
        return list(_dedupe_preserve_order(fine_tokens))
    return _dedupe_preserve_order(simple_tokenize_query(text))
  97 +
  98 +
87 @dataclass(frozen=True) 99 @dataclass(frozen=True)
88 class TokenizedText: 100 class TokenizedText:
89 text: str 101 text: str
@@ -93,30 +105,88 @@ class TokenizedText: @@ -93,30 +105,88 @@ class TokenizedText:
93 candidates: Tuple[str, ...] 105 candidates: Tuple[str, ...]
94 106
95 107
class QueryTextAnalysisCache:
    """Per-parse cache for tokenizer output and derived token bundles.

    One instance is created per query-parse request and shared across the tail
    stages (style intent, title exclusion, keyword extraction) so each distinct
    text runs through the model tokenizer at most once.
    """

    def __init__(self, *, tokenizer: Optional[Callable[[str], Any]] = None) -> None:
        # Optional model tokenizer; when absent the regex tokenizer is used.
        self.tokenizer = tokenizer
        # Raw tokenizer output keyed by stripped input text.
        self._tokenizer_results: Dict[str, Any] = {}
        # Derived TokenizedText bundles keyed by (stripped text, max_ngram).
        self._tokenized_texts: Dict[Tuple[str, int], TokenizedText] = {}
        # Language hints (e.g. from translation results) keyed by stripped text.
        self._language_hints: Dict[str, str] = {}

    @staticmethod
    def _normalize_input(text: Optional[str]) -> str:
        """Coerce to str and strip surrounding whitespace ('' for None)."""
        return str(text or "").strip()

    def set_language_hint(self, text: Optional[str], language: Optional[str]) -> None:
        """Record the known language of *text*; no-op when either value is empty."""
        normalized_input = self._normalize_input(text)
        normalized_language = normalize_query_text(language)
        if normalized_input and normalized_language:
            self._language_hints[normalized_input] = normalized_language

    def get_language_hint(self, text: Optional[str]) -> Optional[str]:
        """Previously recorded language for *text*, or None when unknown."""
        normalized_input = self._normalize_input(text)
        if not normalized_input:
            return None
        return self._language_hints.get(normalized_input)

    def _should_use_model_tokenizer(self, text: str) -> bool:
        """Whether *text* warrants the (expensive) model tokenizer.

        Fix: the previous version consulted the language hint but returned
        ``has_han`` on every path, leaving the hint branch as dead code; this
        is now the single expression it always effectively was.
        """
        return self.tokenizer is not None and contains_han_text(text)

    def get_tokenizer_result(self, text: Optional[str]) -> Any:
        """Raw tokenizer output for *text*, memoized per instance.

        Non-Han text (or a missing model tokenizer) short-circuits to the cheap
        regex tokenizer without caching — the regex pass is fast enough.
        """
        normalized_input = self._normalize_input(text)
        if not normalized_input:
            return []
        if not self._should_use_model_tokenizer(normalized_input):
            return simple_tokenize_query(normalized_input)
        if normalized_input not in self._tokenizer_results:
            self._tokenizer_results[normalized_input] = self.tokenizer(normalized_input)
        return self._tokenizer_results[normalized_input]

    def get_tokenized_text(self, text: Optional[str], *, max_ngram: int = 3) -> TokenizedText:
        """Full TokenizedText bundle for *text*, memoized per (text, max_ngram)."""
        normalized_input = self._normalize_input(text)
        # Clamp the cache key so max_ngram < 1 shares the n=1 bucket.
        cache_key = (normalized_input, max(1, int(max_ngram)))
        cached = self._tokenized_texts.get(cache_key)
        if cached is not None:
            return cached

        normalized_text = normalize_query_text(normalized_input)
        fine_tokens = _dedupe_preserve_order(
            extract_token_strings(self.get_tokenizer_result(normalized_input))
        )
        coarse_tokens = _build_coarse_tokens(normalized_input, fine_tokens)

        bundle = TokenizedText(
            text=normalized_input,
            normalized_text=normalized_text,
            fine_tokens=tuple(fine_tokens),
            coarse_tokens=tuple(coarse_tokens),
            candidates=tuple(
                _dedupe_preserve_order(
                    list(fine_tokens)
                    + list(coarse_tokens)
                    + _build_phrase_candidates(fine_tokens, max_ngram=max_ngram)
                    + _build_phrase_candidates(coarse_tokens, max_ngram=max_ngram)
                    + ([normalized_text] if normalized_text else [])
                )
            ),
        )
        self._tokenized_texts[cache_key] = bundle
        return bundle
  181 +
  182 +
def tokenize_text(
    text: str,
    *,
    tokenizer: Optional[Callable[[str], Any]] = None,
    max_ngram: int = 3,
) -> TokenizedText:
    """Tokenize *text* into a TokenizedText bundle via a single-use cache."""
    throwaway_cache = QueryTextAnalysisCache(tokenizer=tokenizer)
    return throwaway_cache.get_tokenized_text(text, max_ngram=max_ngram)
suggestion/service.py
@@ -7,7 +7,7 @@ import time @@ -7,7 +7,7 @@ import time
7 from typing import Any, Dict, List, Optional 7 from typing import Any, Dict, List, Optional
8 8
9 from config.tenant_config_loader import get_tenant_config_loader 9 from config.tenant_config_loader import get_tenant_config_loader
10 -from query.query_parser import simple_tokenize_query 10 +from query.tokenization import simple_tokenize_query
11 from suggestion.builder import get_suggestion_alias_name 11 from suggestion.builder import get_suggestion_alias_name
12 from utils.es_client import ESClient 12 from utils.es_client import ESClient
13 13
tests/test_query_parser_mixed_language.py
@@ -77,3 +77,79 @@ def test_parse_waits_for_translation_when_source_in_index_languages(monkeypatch) @@ -77,3 +77,79 @@ def test_parse_waits_for_translation_when_source_in_index_languages(monkeypatch)
77 assert result.detected_language == "en" 77 assert result.detected_language == "en"
78 assert result.translations.get("zh") == "off shoulder top-zh" 78 assert result.translations.get("zh") == "off shoulder top-zh"
79 assert not hasattr(result, "source_in_index_languages") 79 assert not hasattr(result, "source_in_index_languages")
  80 +
  81 +
def test_parse_reuses_tokenization_across_tail_stages(monkeypatch):
    """ASCII queries/translations must never reach the model tokenizer in tail stages."""
    observed_tokenizer_inputs = []

    def recording_tokenizer(text):
        observed_tokenizer_inputs.append(str(text))
        return str(text).split()

    search_config = SearchConfig(
        es_index_name="test_products",
        field_boosts={"title.en": 3.0, "title.zh": 3.0},
        indexes=[IndexConfig(name="default", label="default", fields=["title.en", "title.zh"])],
        query_config=QueryConfig(
            enable_text_embedding=False,
            enable_query_rewrite=False,
            supported_languages=["en", "zh"],
            default_language="en",
            style_intent_terms={
                "color": [
                    {"en_terms": ["black"], "zh_terms": ["黑色"], "attribute_terms": ["black"]}
                ],
            },
            style_intent_dimension_aliases={"color": ["color", "颜色"]},
            product_title_exclusion_rules=[
                {
                    "zh_trigger_terms": ["修身"],
                    "en_trigger_terms": ["fitted"],
                    "zh_title_exclusions": ["宽松"],
                    "en_title_exclusions": ["loose"],
                }
            ],
        ),
        function_score=FunctionScoreConfig(),
        rerank=RerankConfig(),
        spu_config=SPUConfig(enabled=False),
    )
    parser = QueryParser(
        search_config,
        translator=_DummyTranslator(),
        tokenizer=recording_tokenizer,
    )
    monkeypatch.setattr(parser.language_detector, "detect", lambda text: "en")

    parsed = parser.parse(
        "black fitted dress",
        tenant_id="0",
        generate_vector=False,
        target_languages=["en", "zh"],
    )

    assert parsed.translations == {"zh": "black fitted dress-zh"}
    assert parsed.style_intent_profile is not None
    assert parsed.style_intent_profile.is_active is True
    assert parsed.product_title_exclusion_profile is not None
    assert parsed.product_title_exclusion_profile.is_active is True
    # The ASCII query and its ASCII translation must bypass the model tokenizer.
    assert observed_tokenizer_inputs == []
  137 +
  138 +
def test_parse_fast_path_detects_ascii_query_as_english_without_lingua(monkeypatch):
    """Pure-ASCII queries short-circuit to 'en' without touching the Lingua detector."""
    parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer)

    def _fail_detect(text):
        raise AssertionError("Lingua path should not be used")

    monkeypatch.setattr(parser.language_detector, "detect", _fail_detect)

    parsed = parser.parse(
        "street t-shirt women",
        tenant_id="0",
        generate_vector=False,
        target_languages=["en", "zh"],
    )

    assert parsed.detected_language == "en"
    assert parsed.query_tokens == ["street", "t-shirt", "women"]
tests/test_style_intent.py
@@ -58,3 +58,37 @@ def test_style_intent_detector_uses_original_query_when_language_translation_mis @@ -58,3 +58,37 @@ def test_style_intent_detector_uses_original_query_when_language_translation_mis
58 58
59 assert profile.get_canonical_values("color") == {"black"} 59 assert profile.get_canonical_values("color") == {"black"}
60 assert profile.intents[0].attribute_terms == ("black",) 60 assert profile.intents[0].attribute_terms == ("black",)
  61 +
  62 +
def test_style_intent_detector_tokenizes_each_language_once():
    """detect() should tokenize the Han query exactly once despite two dimensions."""
    query_config = QueryConfig(
        style_intent_terms={
            "color": [{"en_terms": ["black"], "zh_terms": ["黑色"], "attribute_terms": ["black"]}],
            "size": [{"en_terms": ["xl"], "zh_terms": ["加大码"], "attribute_terms": ["xl"]}],
        },
        style_intent_dimension_aliases={
            "color": ["color", "颜色"],
            "size": ["size", "尺码"],
        },
    )
    seen_texts = []

    def recording_tokenizer(text):
        seen_texts.append(text)
        return str(text).split()

    detector = StyleIntentDetector(
        StyleIntentRegistry.from_query_config(query_config),
        tokenizer=recording_tokenizer,
    )
    parsed_query = SimpleNamespace(
        original_query="黑色 连衣裙",
        query_normalized="黑色 连衣裙",
        rewritten_query="黑色 连衣裙",
        translations={"en": "black dress xl"},
    )

    profile = detector.detect(parsed_query)

    assert profile.is_active is True
    # Only the Han text hits the tokenizer, and only once across both dimensions.
    assert seen_texts == ["黑色 连衣裙"]
tests/test_tokenization.py 0 → 100644
@@ -0,0 +1,13 @@ @@ -0,0 +1,13 @@
  1 +from query.tokenization import QueryTextAnalysisCache
  2 +
  3 +
def test_han_coarse_tokens_follow_model_tokens_instead_of_whole_sentence():
    """For Han text the coarse tokens mirror model tokens, not whole-sentence runs."""
    model_output = [("路上", 0, 2), ("穿着", 2, 4), ("女性", 4, 6), ("黑色", 10, 12)]
    cache = QueryTextAnalysisCache(tokenizer=lambda text: model_output)
    cache.set_language_hint("路上穿着女性的衣服是黑色的", "zh")

    bundle = cache.get_tokenized_text("路上穿着女性的衣服是黑色的")

    expected = ("路上", "穿着", "女性", "黑色")
    assert bundle.fine_tokens == expected
    assert bundle.coarse_tokens == expected