Commit 837d5d768d32d026403512a77fb989ddb4266508
1 parent
b712a831
SKU 筛选匹配规则优化:按 token/短语序列匹配,fix bad case
Showing
3 changed files
with
146 additions
and
4 deletions
Show diff stats
config/config.yaml
| @@ -114,7 +114,7 @@ query_config: | @@ -114,7 +114,7 @@ query_config: | ||
| 114 | # 查询解析阶段:翻译与 query 向量并发执行,共用同一等待预算(毫秒)。 | 114 | # 查询解析阶段:翻译与 query 向量并发执行,共用同一等待预算(毫秒)。 |
| 115 | # 检测语言已在租户 index_languages 内:较短;不在索引语言内:较长(翻译对召回更关键)。 | 115 | # 检测语言已在租户 index_languages 内:较短;不在索引语言内:较长(翻译对召回更关键)。 |
| 116 | translation_embedding_wait_budget_ms_source_in_index: 500 # 80 | 116 | translation_embedding_wait_budget_ms_source_in_index: 500 # 80 |
| 117 | - translation_embedding_wait_budget_ms_source_not_in_index: 500 #200 | 117 | + translation_embedding_wait_budget_ms_source_not_in_index: 700 #200 |
| 118 | 118 | ||
| 119 | style_intent: | 119 | style_intent: |
| 120 | enabled: true | 120 | enabled: true |
| @@ -380,7 +380,7 @@ services: | @@ -380,7 +380,7 @@ services: | ||
| 380 | max_docs: 1000 | 380 | max_docs: 1000 |
| 381 | normalize: true | 381 | normalize: true |
| 382 | # 服务内后端(reranker 进程启动时读取) | 382 | # 服务内后端(reranker 进程启动时读取) |
| 383 | - backend: "bge" # bge | qwen3_vllm | qwen3_transformers | dashscope_rerank | 383 | + backend: "qwen3_vllm" # bge | qwen3_vllm | qwen3_transformers | dashscope_rerank |
| 384 | backends: | 384 | backends: |
| 385 | bge: | 385 | bge: |
| 386 | model_name: "BAAI/bge-reranker-v2-m3" | 386 | model_name: "BAAI/bge-reranker-v2-m3" |
search/sku_intent_selector.py
| @@ -8,7 +8,7 @@ from dataclasses import dataclass, field | @@ -8,7 +8,7 @@ from dataclasses import dataclass, field | ||
| 8 | from typing import Any, Callable, Dict, List, Optional, Tuple | 8 | from typing import Any, Callable, Dict, List, Optional, Tuple |
| 9 | 9 | ||
| 10 | from query.style_intent import StyleIntentProfile, StyleIntentRegistry | 10 | from query.style_intent import StyleIntentProfile, StyleIntentRegistry |
| 11 | -from query.tokenization import normalize_query_text | 11 | +from query.tokenization import normalize_query_text, simple_tokenize_query |
| 12 | 12 | ||
| 13 | 13 | ||
| 14 | @dataclass(frozen=True) | 14 | @dataclass(frozen=True) |
| @@ -35,6 +35,7 @@ class SkuSelectionDecision: | @@ -35,6 +35,7 @@ class SkuSelectionDecision: | ||
| 35 | class _SelectionContext: | 35 | class _SelectionContext: |
| 36 | attribute_terms_by_intent: Dict[str, Tuple[str, ...]] | 36 | attribute_terms_by_intent: Dict[str, Tuple[str, ...]] |
| 37 | normalized_text_cache: Dict[str, str] = field(default_factory=dict) | 37 | normalized_text_cache: Dict[str, str] = field(default_factory=dict) |
| 38 | + tokenized_text_cache: Dict[str, Tuple[str, ...]] = field(default_factory=dict) | ||
| 38 | text_match_cache: Dict[Tuple[str, str], bool] = field(default_factory=dict) | 39 | text_match_cache: Dict[Tuple[str, str], bool] = field(default_factory=dict) |
| 39 | 40 | ||
| 40 | 41 | ||
| @@ -194,10 +195,60 @@ class StyleSkuSelector: | @@ -194,10 +195,60 @@ class StyleSkuSelector: | ||
| 194 | return cached | 195 | return cached |
| 195 | 196 | ||
| 196 | attribute_terms = selection_context.attribute_terms_by_intent.get(intent_type, ()) | 197 | attribute_terms = selection_context.attribute_terms_by_intent.get(intent_type, ()) |
| 197 | - matched = any(term in normalized_value for term in attribute_terms if term) | 198 | + value_tokens = self._tokenize_cached(selection_context, normalized_value) |
| 199 | + matched = any( | ||
| 200 | + self._matches_term_tokens( | ||
| 201 | + term=term, | ||
| 202 | + value_tokens=value_tokens, | ||
| 203 | + selection_context=selection_context, | ||
| 204 | + normalized_value=normalized_value, | ||
| 205 | + ) | ||
| 206 | + for term in attribute_terms | ||
| 207 | + if term | ||
| 208 | + ) | ||
| 198 | selection_context.text_match_cache[cache_key] = matched | 209 | selection_context.text_match_cache[cache_key] = matched |
| 199 | return matched | 210 | return matched |
| 200 | 211 | ||
| 212 | + @staticmethod | ||
| 213 | + def _tokenize_cached(selection_context: _SelectionContext, value: str) -> Tuple[str, ...]: | ||
| 214 | + normalized_value = normalize_query_text(value) | ||
| 215 | + if not normalized_value: | ||
| 216 | + return () | ||
| 217 | + cached = selection_context.tokenized_text_cache.get(normalized_value) | ||
| 218 | + if cached is not None: | ||
| 219 | + return cached | ||
| 220 | + tokens = tuple(normalize_query_text(token) for token in simple_tokenize_query(normalized_value) if token) | ||
| 221 | + selection_context.tokenized_text_cache[normalized_value] = tokens | ||
| 222 | + return tokens | ||
| 223 | + | ||
| 224 | + def _matches_term_tokens( | ||
| 225 | + self, | ||
| 226 | + *, | ||
| 227 | + term: str, | ||
| 228 | + value_tokens: Tuple[str, ...], | ||
| 229 | + selection_context: _SelectionContext, | ||
| 230 | + normalized_value: str, | ||
| 231 | + ) -> bool: | ||
| 232 | + normalized_term = normalize_query_text(term) | ||
| 233 | + if not normalized_term: | ||
| 234 | + return False | ||
| 235 | + if normalized_term == normalized_value: | ||
| 236 | + return True | ||
| 237 | + | ||
| 238 | + term_tokens = self._tokenize_cached(selection_context, normalized_term) | ||
| 239 | + if not term_tokens or not value_tokens: | ||
| 240 | + return normalized_term in normalized_value | ||
| 241 | + | ||
| 242 | + term_length = len(term_tokens) | ||
| 243 | + value_length = len(value_tokens) | ||
| 244 | + if term_length > value_length: | ||
| 245 | + return False | ||
| 246 | + | ||
| 247 | + for start in range(value_length - term_length + 1): | ||
| 248 | + if value_tokens[start:start + term_length] == term_tokens: | ||
| 249 | + return True | ||
| 250 | + return False | ||
| 251 | + | ||
| 201 | def _find_first_text_match( | 252 | def _find_first_text_match( |
| 202 | self, | 253 | self, |
| 203 | skus: List[Dict[str, Any]], | 254 | skus: List[Dict[str, Any]], |
tests/test_sku_intent_selector.py
| @@ -104,3 +104,94 @@ def test_style_sku_selector_returns_no_match_without_attribute_contains(): | @@ -104,3 +104,94 @@ def test_style_sku_selector_returns_no_match_without_attribute_contains(): | ||
| 104 | 104 | ||
| 105 | assert decisions["spu-1"].selected_sku_id is None | 105 | assert decisions["spu-1"].selected_sku_id is None |
| 106 | assert decisions["spu-1"].matched_stage == "no_match" | 106 | assert decisions["spu-1"].matched_stage == "no_match" |
| 107 | + | ||
| 108 | + | ||
| 109 | +def test_is_text_match_uses_token_boundaries_for_sizes(): | ||
| 110 | + registry = StyleIntentRegistry.from_query_config( | ||
| 111 | + QueryConfig( | ||
| 112 | + style_intent_terms={ | ||
| 113 | + "size": [{"en_terms": ["l"], "zh_terms": ["大码"], "attribute_terms": ["l"]}], | ||
| 114 | + }, | ||
| 115 | + style_intent_dimension_aliases={"size": ["size", "尺码"]}, | ||
| 116 | + ) | ||
| 117 | + ) | ||
| 118 | + selector = StyleSkuSelector(registry) | ||
| 119 | + style_profile = StyleIntentProfile( | ||
| 120 | + intents=( | ||
| 121 | + DetectedStyleIntent( | ||
| 122 | + intent_type="size", | ||
| 123 | + canonical_value="l", | ||
| 124 | + matched_term="l", | ||
| 125 | + matched_query_text="l", | ||
| 126 | + attribute_terms=("l",), | ||
| 127 | + dimension_aliases=("size", "尺码"), | ||
| 128 | + ), | ||
| 129 | + ), | ||
| 130 | + ) | ||
| 131 | + selection_context = selector._build_selection_context(style_profile) | ||
| 132 | + | ||
| 133 | + assert selector._is_text_match("size", selection_context, normalized_value="l") | ||
| 134 | + assert not selector._is_text_match("size", selection_context, normalized_value="xl") | ||
| 135 | + assert not selector._is_text_match("size", selection_context, normalized_value="xxl") | ||
| 136 | + | ||
| 137 | + | ||
| 138 | +def test_is_text_match_handles_punctuation_and_descriptive_attribute_values(): | ||
| 139 | + registry = StyleIntentRegistry.from_query_config( | ||
| 140 | + QueryConfig( | ||
| 141 | + style_intent_terms={ | ||
| 142 | + "color": [{"en_terms": ["blue"], "zh_terms": ["蓝色"], "attribute_terms": ["blue"]}], | ||
| 143 | + "style": [{"en_terms": ["off-white"], "zh_terms": ["米白"], "attribute_terms": ["off-white"]}], | ||
| 144 | + "accessory": [{"en_terms": ["headscarf"], "zh_terms": ["头巾"], "attribute_terms": ["headscarf"]}], | ||
| 145 | + "size": [{"en_terms": ["2xl"], "zh_terms": ["2xl"], "attribute_terms": ["2xl"]}], | ||
| 146 | + }, | ||
| 147 | + style_intent_dimension_aliases={ | ||
| 148 | + "color": ["color", "颜色"], | ||
| 149 | + "style": ["style", "风格"], | ||
| 150 | + "accessory": ["accessory", "配饰"], | ||
| 151 | + "size": ["size", "尺码"], | ||
| 152 | + }, | ||
| 153 | + ) | ||
| 154 | + ) | ||
| 155 | + selector = StyleSkuSelector(registry) | ||
| 156 | + style_profile = StyleIntentProfile( | ||
| 157 | + intents=( | ||
| 158 | + DetectedStyleIntent( | ||
| 159 | + intent_type="color", | ||
| 160 | + canonical_value="blue", | ||
| 161 | + matched_term="blue", | ||
| 162 | + matched_query_text="blue", | ||
| 163 | + attribute_terms=("blue",), | ||
| 164 | + dimension_aliases=("color", "颜色"), | ||
| 165 | + ), | ||
| 166 | + DetectedStyleIntent( | ||
| 167 | + intent_type="style", | ||
| 168 | + canonical_value="off-white", | ||
| 169 | + matched_term="off-white", | ||
| 170 | + matched_query_text="off-white", | ||
| 171 | + attribute_terms=("off-white",), | ||
| 172 | + dimension_aliases=("style", "风格"), | ||
| 173 | + ), | ||
| 174 | + DetectedStyleIntent( | ||
| 175 | + intent_type="accessory", | ||
| 176 | + canonical_value="headscarf", | ||
| 177 | + matched_term="headscarf", | ||
| 178 | + matched_query_text="headscarf", | ||
| 179 | + attribute_terms=("headscarf",), | ||
| 180 | + dimension_aliases=("accessory", "配饰"), | ||
| 181 | + ), | ||
| 182 | + DetectedStyleIntent( | ||
| 183 | + intent_type="size", | ||
| 184 | + canonical_value="2xl", | ||
| 185 | + matched_term="2xl", | ||
| 186 | + matched_query_text="2xl", | ||
| 187 | + attribute_terms=("2xl",), | ||
| 188 | + dimension_aliases=("size", "尺码"), | ||
| 189 | + ), | ||
| 190 | + ), | ||
| 191 | + ) | ||
| 192 | + selection_context = selector._build_selection_context(style_profile) | ||
| 193 | + | ||
| 194 | + assert selector._is_text_match("color", selection_context, normalized_value="gray blue") | ||
| 195 | + assert selector._is_text_match("style", selection_context, normalized_value="off-white/lined") | ||
| 196 | + assert selector._is_text_match("accessory", selection_context, normalized_value="army green + headscarf") | ||
| 197 | + assert selector._is_text_match("size", selection_context, normalized_value="2xl recommended 65-70kg") |