Commit 837d5d768d32d026403512a77fb989ddb4266508

Authored by tangwang
1 parent b712a831

SKU 筛选匹配规则优化,按 token/短语序列匹配,fix bad case

config/config.yaml
@@ -114,7 +114,7 @@ query_config: @@ -114,7 +114,7 @@ query_config:
114 # 查询解析阶段:翻译与 query 向量并发执行,共用同一等待预算(毫秒)。 114 # 查询解析阶段:翻译与 query 向量并发执行,共用同一等待预算(毫秒)。
115 # 检测语言已在租户 index_languages 内:较短;不在索引语言内:较长(翻译对召回更关键)。 115 # 检测语言已在租户 index_languages 内:较短;不在索引语言内:较长(翻译对召回更关键)。
116 translation_embedding_wait_budget_ms_source_in_index: 500 # 80 116 translation_embedding_wait_budget_ms_source_in_index: 500 # 80
117 - translation_embedding_wait_budget_ms_source_not_in_index: 500 #200 117 + translation_embedding_wait_budget_ms_source_not_in_index: 700 #200
118 118
119 style_intent: 119 style_intent:
120 enabled: true 120 enabled: true
@@ -380,7 +380,7 @@ services: @@ -380,7 +380,7 @@ services:
380 max_docs: 1000 380 max_docs: 1000
381 normalize: true 381 normalize: true
382 # 服务内后端(reranker 进程启动时读取) 382 # 服务内后端(reranker 进程启动时读取)
383 - backend: "bge" # bge | qwen3_vllm | qwen3_transformers | dashscope_rerank 383 + backend: "qwen3_vllm" # bge | qwen3_vllm | qwen3_transformers | dashscope_rerank
384 backends: 384 backends:
385 bge: 385 bge:
386 model_name: "BAAI/bge-reranker-v2-m3" 386 model_name: "BAAI/bge-reranker-v2-m3"
search/sku_intent_selector.py
@@ -8,7 +8,7 @@ from dataclasses import dataclass, field @@ -8,7 +8,7 @@ from dataclasses import dataclass, field
8 from typing import Any, Callable, Dict, List, Optional, Tuple 8 from typing import Any, Callable, Dict, List, Optional, Tuple
9 9
10 from query.style_intent import StyleIntentProfile, StyleIntentRegistry 10 from query.style_intent import StyleIntentProfile, StyleIntentRegistry
11 -from query.tokenization import normalize_query_text 11 +from query.tokenization import normalize_query_text, simple_tokenize_query
12 12
13 13
14 @dataclass(frozen=True) 14 @dataclass(frozen=True)
@@ -35,6 +35,7 @@ class SkuSelectionDecision: @@ -35,6 +35,7 @@ class SkuSelectionDecision:
35 class _SelectionContext: 35 class _SelectionContext:
36 attribute_terms_by_intent: Dict[str, Tuple[str, ...]] 36 attribute_terms_by_intent: Dict[str, Tuple[str, ...]]
37 normalized_text_cache: Dict[str, str] = field(default_factory=dict) 37 normalized_text_cache: Dict[str, str] = field(default_factory=dict)
  38 + tokenized_text_cache: Dict[str, Tuple[str, ...]] = field(default_factory=dict)
38 text_match_cache: Dict[Tuple[str, str], bool] = field(default_factory=dict) 39 text_match_cache: Dict[Tuple[str, str], bool] = field(default_factory=dict)
39 40
40 41
@@ -194,10 +195,60 @@ class StyleSkuSelector: @@ -194,10 +195,60 @@ class StyleSkuSelector:
194 return cached 195 return cached
195 196
196 attribute_terms = selection_context.attribute_terms_by_intent.get(intent_type, ()) 197 attribute_terms = selection_context.attribute_terms_by_intent.get(intent_type, ())
197 - matched = any(term in normalized_value for term in attribute_terms if term) 198 + value_tokens = self._tokenize_cached(selection_context, normalized_value)
  199 + matched = any(
  200 + self._matches_term_tokens(
  201 + term=term,
  202 + value_tokens=value_tokens,
  203 + selection_context=selection_context,
  204 + normalized_value=normalized_value,
  205 + )
  206 + for term in attribute_terms
  207 + if term
  208 + )
198 selection_context.text_match_cache[cache_key] = matched 209 selection_context.text_match_cache[cache_key] = matched
199 return matched 210 return matched
200 211
  212 + @staticmethod
  213 + def _tokenize_cached(selection_context: _SelectionContext, value: str) -> Tuple[str, ...]:
  214 + normalized_value = normalize_query_text(value)
  215 + if not normalized_value:
  216 + return ()
  217 + cached = selection_context.tokenized_text_cache.get(normalized_value)
  218 + if cached is not None:
  219 + return cached
  220 + tokens = tuple(normalize_query_text(token) for token in simple_tokenize_query(normalized_value) if token)
  221 + selection_context.tokenized_text_cache[normalized_value] = tokens
  222 + return tokens
  223 +
  224 + def _matches_term_tokens(
  225 + self,
  226 + *,
  227 + term: str,
  228 + value_tokens: Tuple[str, ...],
  229 + selection_context: _SelectionContext,
  230 + normalized_value: str,
  231 + ) -> bool:
  232 + normalized_term = normalize_query_text(term)
  233 + if not normalized_term:
  234 + return False
  235 + if normalized_term == normalized_value:
  236 + return True
  237 +
  238 + term_tokens = self._tokenize_cached(selection_context, normalized_term)
  239 + if not term_tokens or not value_tokens:
  240 + return normalized_term in normalized_value
  241 +
  242 + term_length = len(term_tokens)
  243 + value_length = len(value_tokens)
  244 + if term_length > value_length:
  245 + return False
  246 +
  247 + for start in range(value_length - term_length + 1):
  248 + if value_tokens[start:start + term_length] == term_tokens:
  249 + return True
  250 + return False
  251 +
201 def _find_first_text_match( 252 def _find_first_text_match(
202 self, 253 self,
203 skus: List[Dict[str, Any]], 254 skus: List[Dict[str, Any]],
tests/test_sku_intent_selector.py
@@ -104,3 +104,94 @@ def test_style_sku_selector_returns_no_match_without_attribute_contains(): @@ -104,3 +104,94 @@ def test_style_sku_selector_returns_no_match_without_attribute_contains():
104 104
105 assert decisions["spu-1"].selected_sku_id is None 105 assert decisions["spu-1"].selected_sku_id is None
106 assert decisions["spu-1"].matched_stage == "no_match" 106 assert decisions["spu-1"].matched_stage == "no_match"
  107 +
  108 +
  109 +def test_is_text_match_uses_token_boundaries_for_sizes():
  110 + registry = StyleIntentRegistry.from_query_config(
  111 + QueryConfig(
  112 + style_intent_terms={
  113 + "size": [{"en_terms": ["l"], "zh_terms": ["大码"], "attribute_terms": ["l"]}],
  114 + },
  115 + style_intent_dimension_aliases={"size": ["size", "尺码"]},
  116 + )
  117 + )
  118 + selector = StyleSkuSelector(registry)
  119 + style_profile = StyleIntentProfile(
  120 + intents=(
  121 + DetectedStyleIntent(
  122 + intent_type="size",
  123 + canonical_value="l",
  124 + matched_term="l",
  125 + matched_query_text="l",
  126 + attribute_terms=("l",),
  127 + dimension_aliases=("size", "尺码"),
  128 + ),
  129 + ),
  130 + )
  131 + selection_context = selector._build_selection_context(style_profile)
  132 +
  133 + assert selector._is_text_match("size", selection_context, normalized_value="l")
  134 + assert not selector._is_text_match("size", selection_context, normalized_value="xl")
  135 + assert not selector._is_text_match("size", selection_context, normalized_value="xxl")
  136 +
  137 +
  138 +def test_is_text_match_handles_punctuation_and_descriptive_attribute_values():
  139 + registry = StyleIntentRegistry.from_query_config(
  140 + QueryConfig(
  141 + style_intent_terms={
  142 + "color": [{"en_terms": ["blue"], "zh_terms": ["蓝色"], "attribute_terms": ["blue"]}],
  143 + "style": [{"en_terms": ["off-white"], "zh_terms": ["米白"], "attribute_terms": ["off-white"]}],
  144 + "accessory": [{"en_terms": ["headscarf"], "zh_terms": ["头巾"], "attribute_terms": ["headscarf"]}],
  145 + "size": [{"en_terms": ["2xl"], "zh_terms": ["2xl"], "attribute_terms": ["2xl"]}],
  146 + },
  147 + style_intent_dimension_aliases={
  148 + "color": ["color", "颜色"],
  149 + "style": ["style", "风格"],
  150 + "accessory": ["accessory", "配饰"],
  151 + "size": ["size", "尺码"],
  152 + },
  153 + )
  154 + )
  155 + selector = StyleSkuSelector(registry)
  156 + style_profile = StyleIntentProfile(
  157 + intents=(
  158 + DetectedStyleIntent(
  159 + intent_type="color",
  160 + canonical_value="blue",
  161 + matched_term="blue",
  162 + matched_query_text="blue",
  163 + attribute_terms=("blue",),
  164 + dimension_aliases=("color", "颜色"),
  165 + ),
  166 + DetectedStyleIntent(
  167 + intent_type="style",
  168 + canonical_value="off-white",
  169 + matched_term="off-white",
  170 + matched_query_text="off-white",
  171 + attribute_terms=("off-white",),
  172 + dimension_aliases=("style", "风格"),
  173 + ),
  174 + DetectedStyleIntent(
  175 + intent_type="accessory",
  176 + canonical_value="headscarf",
  177 + matched_term="headscarf",
  178 + matched_query_text="headscarf",
  179 + attribute_terms=("headscarf",),
  180 + dimension_aliases=("accessory", "配饰"),
  181 + ),
  182 + DetectedStyleIntent(
  183 + intent_type="size",
  184 + canonical_value="2xl",
  185 + matched_term="2xl",
  186 + matched_query_text="2xl",
  187 + attribute_terms=("2xl",),
  188 + dimension_aliases=("size", "尺码"),
  189 + ),
  190 + ),
  191 + )
  192 + selection_context = selector._build_selection_context(style_profile)
  193 +
  194 + assert selector._is_text_match("color", selection_context, normalized_value="gray blue")
  195 + assert selector._is_text_match("style", selection_context, normalized_value="off-white/lined")
  196 + assert selector._is_text_match("accessory", selection_context, normalized_value="army green + headscarf")
  197 + assert selector._is_text_match("size", selection_context, normalized_value="2xl recommended 65-70kg")