diff --git a/config/config.yaml b/config/config.yaml index c045a72..9e10130 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -114,7 +114,7 @@ query_config: # 查询解析阶段:翻译与 query 向量并发执行,共用同一等待预算(毫秒)。 # 检测语言已在租户 index_languages 内:较短;不在索引语言内:较长(翻译对召回更关键)。 translation_embedding_wait_budget_ms_source_in_index: 500 # 80 - translation_embedding_wait_budget_ms_source_not_in_index: 500 #200 + translation_embedding_wait_budget_ms_source_not_in_index: 700 #200 style_intent: enabled: true @@ -380,7 +380,7 @@ services: max_docs: 1000 normalize: true # 服务内后端(reranker 进程启动时读取) - backend: "bge" # bge | qwen3_vllm | qwen3_transformers | dashscope_rerank + backend: "qwen3_vllm" # bge | qwen3_vllm | qwen3_transformers | dashscope_rerank backends: bge: model_name: "BAAI/bge-reranker-v2-m3" diff --git a/search/sku_intent_selector.py b/search/sku_intent_selector.py index de32799..a51a3cf 100644 --- a/search/sku_intent_selector.py +++ b/search/sku_intent_selector.py @@ -8,7 +8,7 @@ from dataclasses import dataclass, field from typing import Any, Callable, Dict, List, Optional, Tuple from query.style_intent import StyleIntentProfile, StyleIntentRegistry -from query.tokenization import normalize_query_text +from query.tokenization import normalize_query_text, simple_tokenize_query @dataclass(frozen=True) @@ -35,6 +35,7 @@ class SkuSelectionDecision: class _SelectionContext: attribute_terms_by_intent: Dict[str, Tuple[str, ...]] normalized_text_cache: Dict[str, str] = field(default_factory=dict) + tokenized_text_cache: Dict[str, Tuple[str, ...]] = field(default_factory=dict) text_match_cache: Dict[Tuple[str, str], bool] = field(default_factory=dict) @@ -194,10 +195,60 @@ class StyleSkuSelector: return cached attribute_terms = selection_context.attribute_terms_by_intent.get(intent_type, ()) - matched = any(term in normalized_value for term in attribute_terms if term) + value_tokens = self._tokenize_cached(selection_context, normalized_value) + matched = any( + self._matches_term_tokens( + term=term, + value_tokens=value_tokens, + selection_context=selection_context, + normalized_value=normalized_value, + ) + for term in attribute_terms + if term + ) selection_context.text_match_cache[cache_key] = matched return matched + @staticmethod + def _tokenize_cached(selection_context: _SelectionContext, value: str) -> Tuple[str, ...]: + normalized_value = normalize_query_text(value) + if not normalized_value: + return () + cached = selection_context.tokenized_text_cache.get(normalized_value) + if cached is not None: + return cached + tokens = tuple(normalize_query_text(token) for token in simple_tokenize_query(normalized_value) if token) + selection_context.tokenized_text_cache[normalized_value] = tokens + return tokens + + def _matches_term_tokens( + self, + *, + term: str, + value_tokens: Tuple[str, ...], + selection_context: _SelectionContext, + normalized_value: str, + ) -> bool: + normalized_term = normalize_query_text(term) + if not normalized_term: + return False + if normalized_term == normalized_value: + return True + + term_tokens = self._tokenize_cached(selection_context, normalized_term) + if not term_tokens or not value_tokens: + return normalized_term in normalized_value + + term_length = len(term_tokens) + value_length = len(value_tokens) + if term_length > value_length: + return False + + for start in range(value_length - term_length + 1): + if value_tokens[start:start + term_length] == term_tokens: + return True + return False + def _find_first_text_match( self, skus: List[Dict[str, Any]], diff --git a/tests/test_sku_intent_selector.py b/tests/test_sku_intent_selector.py index 20174e8..b387747 100644 --- a/tests/test_sku_intent_selector.py +++ b/tests/test_sku_intent_selector.py @@ -104,3 +104,94 @@ def test_style_sku_selector_returns_no_match_without_attribute_contains(): assert decisions["spu-1"].selected_sku_id is None assert decisions["spu-1"].matched_stage == "no_match" + + +def test_is_text_match_uses_token_boundaries_for_sizes(): + registry = StyleIntentRegistry.from_query_config( + QueryConfig( + style_intent_terms={ + "size": [{"en_terms": ["l"], "zh_terms": ["大码"], "attribute_terms": ["l"]}], + }, + style_intent_dimension_aliases={"size": ["size", "尺码"]}, + ) + ) + selector = StyleSkuSelector(registry) + style_profile = StyleIntentProfile( + intents=( + DetectedStyleIntent( + intent_type="size", + canonical_value="l", + matched_term="l", + matched_query_text="l", + attribute_terms=("l",), + dimension_aliases=("size", "尺码"), + ), + ), + ) + selection_context = selector._build_selection_context(style_profile) + + assert selector._is_text_match("size", selection_context, normalized_value="l") + assert not selector._is_text_match("size", selection_context, normalized_value="xl") + assert not selector._is_text_match("size", selection_context, normalized_value="xxl") + + +def test_is_text_match_handles_punctuation_and_descriptive_attribute_values(): + registry = StyleIntentRegistry.from_query_config( + QueryConfig( + style_intent_terms={ + "color": [{"en_terms": ["blue"], "zh_terms": ["蓝色"], "attribute_terms": ["blue"]}], + "style": [{"en_terms": ["off-white"], "zh_terms": ["米白"], "attribute_terms": ["off-white"]}], + "accessory": [{"en_terms": ["headscarf"], "zh_terms": ["头巾"], "attribute_terms": ["headscarf"]}], + "size": [{"en_terms": ["2xl"], "zh_terms": ["2xl"], "attribute_terms": ["2xl"]}], + }, + style_intent_dimension_aliases={ + "color": ["color", "颜色"], + "style": ["style", "风格"], + "accessory": ["accessory", "配饰"], + "size": ["size", "尺码"], + }, + ) + ) + selector = StyleSkuSelector(registry) + style_profile = StyleIntentProfile( + intents=( + DetectedStyleIntent( + intent_type="color", + canonical_value="blue", + matched_term="blue", + matched_query_text="blue", + attribute_terms=("blue",), + dimension_aliases=("color", "颜色"), + ), + DetectedStyleIntent( + intent_type="style", + canonical_value="off-white", + matched_term="off-white", + matched_query_text="off-white", + attribute_terms=("off-white",), + dimension_aliases=("style", "风格"), + ), + DetectedStyleIntent( + intent_type="accessory", + canonical_value="headscarf", + matched_term="headscarf", + matched_query_text="headscarf", + attribute_terms=("headscarf",), + dimension_aliases=("accessory", "配饰"), + ), + DetectedStyleIntent( + intent_type="size", + canonical_value="2xl", + matched_term="2xl", + matched_query_text="2xl", + attribute_terms=("2xl",), + dimension_aliases=("size", "尺码"), + ), + ), + ) + selection_context = selector._build_selection_context(style_profile) + + assert selector._is_text_match("color", selection_context, normalized_value="gray blue") + assert selector._is_text_match("style", selection_context, normalized_value="off-white/lined") + assert selector._is_text_match("accessory", selection_context, normalized_value="army green + headscarf") + assert selector._is_text_match("size", selection_context, normalized_value="2xl recommended 65-70kg") -- libgit2 0.21.2