Commit 837d5d768d32d026403512a77fb989ddb4266508
1 parent
b712a831
SKU 筛选匹配规则优化:按 token/短语序列匹配,fix bad case
Showing
3 changed files
with
146 additions
and
4 deletions
Show diff stats
config/config.yaml
| ... | ... | @@ -114,7 +114,7 @@ query_config: |
| 114 | 114 | # 查询解析阶段:翻译与 query 向量并发执行,共用同一等待预算(毫秒)。 |
| 115 | 115 | # 检测语言已在租户 index_languages 内:较短;不在索引语言内:较长(翻译对召回更关键)。 |
| 116 | 116 | translation_embedding_wait_budget_ms_source_in_index: 500 # 80 |
| 117 | - translation_embedding_wait_budget_ms_source_not_in_index: 500 #200 | |
| 117 | + translation_embedding_wait_budget_ms_source_not_in_index: 700 #200 | |
| 118 | 118 | |
| 119 | 119 | style_intent: |
| 120 | 120 | enabled: true |
| ... | ... | @@ -380,7 +380,7 @@ services: |
| 380 | 380 | max_docs: 1000 |
| 381 | 381 | normalize: true |
| 382 | 382 | # 服务内后端(reranker 进程启动时读取) |
| 383 | - backend: "bge" # bge | qwen3_vllm | qwen3_transformers | dashscope_rerank | |
| 383 | + backend: "qwen3_vllm" # bge | qwen3_vllm | qwen3_transformers | dashscope_rerank | |
| 384 | 384 | backends: |
| 385 | 385 | bge: |
| 386 | 386 | model_name: "BAAI/bge-reranker-v2-m3" | ... | ... |
search/sku_intent_selector.py
| ... | ... | @@ -8,7 +8,7 @@ from dataclasses import dataclass, field |
| 8 | 8 | from typing import Any, Callable, Dict, List, Optional, Tuple |
| 9 | 9 | |
| 10 | 10 | from query.style_intent import StyleIntentProfile, StyleIntentRegistry |
| 11 | -from query.tokenization import normalize_query_text | |
| 11 | +from query.tokenization import normalize_query_text, simple_tokenize_query | |
| 12 | 12 | |
| 13 | 13 | |
| 14 | 14 | @dataclass(frozen=True) |
| ... | ... | @@ -35,6 +35,7 @@ class SkuSelectionDecision: |
| 35 | 35 | class _SelectionContext: |
| 36 | 36 | attribute_terms_by_intent: Dict[str, Tuple[str, ...]] |
| 37 | 37 | normalized_text_cache: Dict[str, str] = field(default_factory=dict) |
| 38 | + tokenized_text_cache: Dict[str, Tuple[str, ...]] = field(default_factory=dict) | |
| 38 | 39 | text_match_cache: Dict[Tuple[str, str], bool] = field(default_factory=dict) |
| 39 | 40 | |
| 40 | 41 | |
| ... | ... | @@ -194,10 +195,60 @@ class StyleSkuSelector: |
| 194 | 195 | return cached |
| 195 | 196 | |
| 196 | 197 | attribute_terms = selection_context.attribute_terms_by_intent.get(intent_type, ()) |
| 197 | - matched = any(term in normalized_value for term in attribute_terms if term) | |
| 198 | + value_tokens = self._tokenize_cached(selection_context, normalized_value) | |
| 199 | + matched = any( | |
| 200 | + self._matches_term_tokens( | |
| 201 | + term=term, | |
| 202 | + value_tokens=value_tokens, | |
| 203 | + selection_context=selection_context, | |
| 204 | + normalized_value=normalized_value, | |
| 205 | + ) | |
| 206 | + for term in attribute_terms | |
| 207 | + if term | |
| 208 | + ) | |
| 198 | 209 | selection_context.text_match_cache[cache_key] = matched |
| 199 | 210 | return matched |
| 200 | 211 | |
| 212 | + @staticmethod | |
| 213 | + def _tokenize_cached(selection_context: _SelectionContext, value: str) -> Tuple[str, ...]: | |
| 214 | + normalized_value = normalize_query_text(value) | |
| 215 | + if not normalized_value: | |
| 216 | + return () | |
| 217 | + cached = selection_context.tokenized_text_cache.get(normalized_value) | |
| 218 | + if cached is not None: | |
| 219 | + return cached | |
| 220 | + tokens = tuple(normalize_query_text(token) for token in simple_tokenize_query(normalized_value) if token) | |
| 221 | + selection_context.tokenized_text_cache[normalized_value] = tokens | |
| 222 | + return tokens | |
| 223 | + | |
| 224 | + def _matches_term_tokens( | |
| 225 | + self, | |
| 226 | + *, | |
| 227 | + term: str, | |
| 228 | + value_tokens: Tuple[str, ...], | |
| 229 | + selection_context: _SelectionContext, | |
| 230 | + normalized_value: str, | |
| 231 | + ) -> bool: | |
| 232 | + normalized_term = normalize_query_text(term) | |
| 233 | + if not normalized_term: | |
| 234 | + return False | |
| 235 | + if normalized_term == normalized_value: | |
| 236 | + return True | |
| 237 | + | |
| 238 | + term_tokens = self._tokenize_cached(selection_context, normalized_term) | |
| 239 | + if not term_tokens or not value_tokens: | |
| 240 | + return normalized_term in normalized_value | |
| 241 | + | |
| 242 | + term_length = len(term_tokens) | |
| 243 | + value_length = len(value_tokens) | |
| 244 | + if term_length > value_length: | |
| 245 | + return False | |
| 246 | + | |
| 247 | + for start in range(value_length - term_length + 1): | |
| 248 | + if value_tokens[start:start + term_length] == term_tokens: | |
| 249 | + return True | |
| 250 | + return False | |
| 251 | + | |
| 201 | 252 | def _find_first_text_match( |
| 202 | 253 | self, |
| 203 | 254 | skus: List[Dict[str, Any]], | ... | ... |
tests/test_sku_intent_selector.py
| ... | ... | @@ -104,3 +104,94 @@ def test_style_sku_selector_returns_no_match_without_attribute_contains(): |
| 104 | 104 | |
| 105 | 105 | assert decisions["spu-1"].selected_sku_id is None |
| 106 | 106 | assert decisions["spu-1"].matched_stage == "no_match" |
| 107 | + | |
| 108 | + | |
def test_is_text_match_uses_token_boundaries_for_sizes():
    """The size term "l" matches the value "l" but not "xl" or "xxl"."""
    query_config = QueryConfig(
        style_intent_terms={
            "size": [{"en_terms": ["l"], "zh_terms": ["大码"], "attribute_terms": ["l"]}],
        },
        style_intent_dimension_aliases={"size": ["size", "尺码"]},
    )
    selector = StyleSkuSelector(StyleIntentRegistry.from_query_config(query_config))

    size_intent = DetectedStyleIntent(
        intent_type="size",
        canonical_value="l",
        matched_term="l",
        matched_query_text="l",
        attribute_terms=("l",),
        dimension_aliases=("size", "尺码"),
    )
    selection_context = selector._build_selection_context(
        StyleIntentProfile(intents=(size_intent,))
    )

    def size_matches(candidate):
        return selector._is_text_match("size", selection_context, normalized_value=candidate)

    assert size_matches("l")
    assert not size_matches("xl")
    assert not size_matches("xxl")
| 136 | + | |
| 137 | + | |
def test_is_text_match_handles_punctuation_and_descriptive_attribute_values():
    """Terms match attribute values that carry extra words or punctuation separators."""
    query_config = QueryConfig(
        style_intent_terms={
            "color": [{"en_terms": ["blue"], "zh_terms": ["蓝色"], "attribute_terms": ["blue"]}],
            "style": [{"en_terms": ["off-white"], "zh_terms": ["米白"], "attribute_terms": ["off-white"]}],
            "accessory": [{"en_terms": ["headscarf"], "zh_terms": ["头巾"], "attribute_terms": ["headscarf"]}],
            "size": [{"en_terms": ["2xl"], "zh_terms": ["2xl"], "attribute_terms": ["2xl"]}],
        },
        style_intent_dimension_aliases={
            "color": ["color", "颜色"],
            "style": ["style", "风格"],
            "accessory": ["accessory", "配饰"],
            "size": ["size", "尺码"],
        },
    )
    selector = StyleSkuSelector(StyleIntentRegistry.from_query_config(query_config))

    def make_intent(intent_type, term, aliases):
        # Each intent in this test uses one string as canonical value,
        # matched term, matched query text and sole attribute term.
        return DetectedStyleIntent(
            intent_type=intent_type,
            canonical_value=term,
            matched_term=term,
            matched_query_text=term,
            attribute_terms=(term,),
            dimension_aliases=aliases,
        )

    style_profile = StyleIntentProfile(
        intents=(
            make_intent("color", "blue", ("color", "颜色")),
            make_intent("style", "off-white", ("style", "风格")),
            make_intent("accessory", "headscarf", ("accessory", "配饰")),
            make_intent("size", "2xl", ("size", "尺码")),
        ),
    )
    selection_context = selector._build_selection_context(style_profile)

    expected_matches = (
        ("color", "gray blue"),
        ("style", "off-white/lined"),
        ("accessory", "army green + headscarf"),
        ("size", "2xl recommended 65-70kg"),
    )
    for intent_type, attribute_value in expected_matches:
        assert selector._is_text_match(intent_type, selection_context, normalized_value=attribute_value)