diff --git a/.gitignore b/.gitignore index 36f6814..ee45387 100644 --- a/.gitignore +++ b/.gitignore @@ -82,3 +82,4 @@ model_cache/ artifacts/search_evaluation/*.sqlite3 artifacts/search_evaluation/batch_reports/ artifacts/search_evaluation/tuning_runs/ +artifacts/search_evaluation/datasets/ diff --git a/api/models.py b/api/models.py index 4c767b7..f97b13d 100644 --- a/api/models.py +++ b/api/models.py @@ -276,6 +276,14 @@ class SpuResult(BaseModel): None, description="规格列表(与 ES specifications 字段对应)" ) + enriched_attributes: Optional[List[Dict[str, Any]]] = Field( + None, + description="LLM 富化属性(ES enriched_attributes 字段)" + ) + enriched_taxonomy_attributes: Optional[List[Dict[str, Any]]] = Field( + None, + description="类目体系化属性(ES enriched_taxonomy_attributes 字段,例如 Color/Material)" + ) skus: List[SkuResult] = Field(default_factory=list, description="SKU列表") relevance_score: float = Field(..., ge=0.0, description="相关性分数(ES原始分数)") diff --git a/api/result_formatter.py b/api/result_formatter.py index 4ad608f..07e911d 100644 --- a/api/result_formatter.py +++ b/api/result_formatter.py @@ -150,6 +150,8 @@ class ResultFormatter: option2_name=source.get('option2_name'), option3_name=source.get('option3_name'), specifications=source.get('specifications'), + enriched_attributes=source.get('enriched_attributes'), + enriched_taxonomy_attributes=source.get('enriched_taxonomy_attributes'), skus=skus, relevance_score=relevance_score ) diff --git a/config/config.yaml b/config/config.yaml index b19a3ba..f669d89 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -224,8 +224,8 @@ query_config: # - keywords # - qanchors # - enriched_tags - # - enriched_attributes - # - # enriched_taxonomy_attributes.value + - enriched_attributes + - enriched_taxonomy_attributes - min_price - compare_at_price diff --git a/docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md b/docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md index 9d2e3d5..f66dd81 100644 --- 
a/docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md +++ b/docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md @@ -275,7 +275,7 @@ curl "http://localhost:6007/health" | `target_lang` | string | Y | 目标语言:`zh`、`en`、`ru` 等 | | `source_lang` | string | N | 源语言。云端模型可不传;`nllb-200-distilled-600m` 建议显式传入 | | `model` | string | N | 已启用 capability 名称,如 `qwen-mt`、`llm`、`deepl`、`nllb-200-distilled-600m`、`opus-mt-zh-en`、`opus-mt-en-zh` | -| `scene` | string | N | 翻译场景参数,与 `model` 配套使用;当前标准值为 `sku_name`、`ecommerce_search_query`、`general` | +| `scene` | string | N | 翻译场景参数,与 `model` 配套使用;当前标准值为 `sku_name`、`ecommerce_search_query`、`sku_attribute`、`general` | 说明: - 外部接口不接受 `prompt`;LLM prompt 由服务端按 `scene` 自动生成。 @@ -287,7 +287,7 @@ curl "http://localhost:6007/health" - 如果是en-zh互译、期待更高的速度,可以考虑`opus-mt-zh-en` / `opus-mt-en-zh`。(质量未详细评测,一些文章说比blib-200-600m更好,但是我看了些case感觉要差不少) **实时翻译选型建议**: -- 在线 query 翻译如果只是 `en/zh` 互译,极致要求耗时使用 `opus-mt-zh-en / opus-mt-en-zh`,`nllb-200-distilled-600m`支持多语言,效果略好一点,但是耗时长很多(70-150ms之间) +- 在线 query 翻译如果只是 `en/zh` 互译,极致要求耗时使用 `opus-mt-zh-en / opus-mt-en-zh`,`nllb-200-distilled-600m`支持多语言,效果略好一点,但是耗时长很多(120-190ms左右) - 如果涉及其他语言,或对质量要求高于本地轻量模型,优先考虑 `deepl`。 **Batch Size / 调用方式建议**: diff --git a/query/style_intent.py b/query/style_intent.py index c9a1f50..76931a6 100644 --- a/query/style_intent.py +++ b/query/style_intent.py @@ -134,6 +134,10 @@ class DetectedStyleIntent: matched_query_text: str attribute_terms: Tuple[str, ...] dimension_aliases: Tuple[str, ...] + # Union of zh_terms + en_terms + attribute_terms for the matched term definition. + # Downstream SKU-selection treats every entry as a valid attribute-value match candidate + # so a Chinese user query like "卡其色" can match a Chinese option value "卡其色裙". + all_terms: Tuple[str, ...] 
= () def to_dict(self) -> Dict[str, Any]: return { @@ -143,8 +147,14 @@ class DetectedStyleIntent: "matched_query_text": self.matched_query_text, "attribute_terms": list(self.attribute_terms), "dimension_aliases": list(self.dimension_aliases), + "all_terms": list(self.all_terms), } + @property + def matching_terms(self) -> Tuple[str, ...]: + """Terms usable for attribute-value matching; falls back to attribute_terms for old callers.""" + return self.all_terms or self.attribute_terms + @dataclass(frozen=True) class StyleIntentProfile: @@ -370,6 +380,15 @@ class StyleIntentDetector: if pair in seen_pairs: continue seen_pairs.add(pair) + all_terms = tuple( + dict.fromkeys( + ( + *term_definition.zh_terms, + *term_definition.en_terms, + *term_definition.attribute_terms, + ) + ) + ) detected.append( DetectedStyleIntent( intent_type=intent_type, @@ -378,6 +397,7 @@ class StyleIntentDetector: matched_query_text=variant.text, attribute_terms=term_definition.attribute_terms, dimension_aliases=definition.dimension_aliases, + all_terms=all_terms, ) ) break diff --git a/search/es_query_builder.py b/search/es_query_builder.py index 56fdec5..93086de 100644 --- a/search/es_query_builder.py +++ b/search/es_query_builder.py @@ -213,6 +213,13 @@ class ESQueryBuilder: "_name": query_name, "query": {"knn": image_knn_query}, "score_mode": "max", + # Expose the best-matching image entry (url, score) so SKU selection + # can promote the SKU whose image_src matches the winning url. + "inner_hits": { + "name": f"{query_name}_hits", + "size": 1, + "_source": ["url"], + }, } } return { @@ -276,6 +283,13 @@ class ESQueryBuilder: "_name": query_name, "score_mode": "max", "query": {"script_score": script_score_query}, + # Same rationale as build_image_knn_clause: carry the winning url + score + # so downstream SKU selection can consume it without a second ES round-trip. 
+ "inner_hits": { + "name": f"{query_name}_hits", + "size": 1, + "_source": ["url"], + }, } } return {"script_score": {"_name": query_name, **script_score_query}} diff --git a/search/searcher.py b/search/searcher.py index 0b9ce97..b5f2a59 100644 --- a/search/searcher.py +++ b/search/searcher.py @@ -354,7 +354,9 @@ class Searcher: if not includes: includes.add("title") - if self._has_style_intent(parsed_query): + if self._should_run_sku_selection(parsed_query): + # SKU-level fields are needed both by text matching (optionN_value) and + # by the image pick (image_src) of the unified SKU selector. includes.update( { "skus", @@ -363,6 +365,10 @@ class Searcher: "option3_name", } ) + if self._has_style_intent(parsed_query): + # Treated as an additional value source for attribute matching + # (on the same dimension as optionN). + includes.add("enriched_taxonomy_attributes") return {"includes": sorted(includes)} @@ -435,6 +441,23 @@ class Searcher: profile = getattr(parsed_query, "style_intent_profile", None) return bool(getattr(profile, "is_active", False)) + def _has_image_signal(self, parsed_query: Optional[ParsedQuery]) -> bool: + """True when the query carries an image vector that can drive an image-based SKU pick.""" + if parsed_query is None: + return False + if not getattr(self.config.query_config, "image_embedding_field", None): + return False + return getattr(parsed_query, "image_query_vector", None) is not None + + def _should_run_sku_selection(self, parsed_query: Optional[ParsedQuery]) -> bool: + """Trigger unified SKU selection when either signal is present. + + Text-intent alone drives attribute-value matching; image signal alone drives + image-nearest SKU promotion; together, image is a visual tie-breaker inside + the text-matched set. 
+ """ + return self._has_style_intent(parsed_query) or self._has_image_signal(parsed_query) + def _apply_style_intent_to_hits( self, es_hits: List[Dict[str, Any]], @@ -1067,7 +1090,7 @@ class Searcher: if fill_took: es_response["took"] = int((es_response.get("took", 0) or 0) + fill_took) - if self._has_style_intent(parsed_query): + if self._should_run_sku_selection(parsed_query): style_intent_decisions = self._apply_style_intent_to_hits( es_response.get("hits", {}).get("hits") or [], parsed_query, @@ -1075,7 +1098,7 @@ class Searcher: ) if style_intent_decisions: context.logger.info( - "款式意图 SKU 预筛选完成 | hits=%s", + "SKU 选择预处理完成 | hits=%s", len(style_intent_decisions), extra={'reqid': context.reqid, 'uid': context.uid} ) @@ -1221,8 +1244,8 @@ class Searcher: extra={'reqid': context.reqid, 'uid': context.uid} ) - # 非重排窗口:款式意图在 result_processing 之前执行,便于单独计时且与 ES 召回阶段衔接 - if self._has_style_intent(parsed_query) and not in_rank_window: + # 非重排窗口:SKU 选择(款式意图 OR 图像信号)在 result_processing 之前执行,便于单独计时 + if self._should_run_sku_selection(parsed_query) and not in_rank_window: es_hits_pre = es_response.get("hits", {}).get("hits") or [] style_intent_decisions = self._apply_style_intent_to_hits( es_hits_pre, @@ -1251,12 +1274,11 @@ class Searcher: coarse_debug_by_doc = _index_debug_rows_by_doc(context.get_intermediate_result('coarse_rank_scores', None)) fine_debug_by_doc = _index_debug_rows_by_doc(context.get_intermediate_result('fine_rank_scores', None)) - if self._has_style_intent(parsed_query): - if style_intent_decisions: - self.style_sku_selector.apply_precomputed_decisions( - es_hits, - style_intent_decisions, - ) + if self._should_run_sku_selection(parsed_query) and style_intent_decisions: + self.style_sku_selector.apply_precomputed_decisions( + es_hits, + style_intent_decisions, + ) # Format results using ResultFormatter formatted_results = ResultFormatter.format_search_results( diff --git a/search/sku_intent_selector.py b/search/sku_intent_selector.py index 
a51a3cf..0005036 100644 --- a/search/sku_intent_selector.py +++ b/search/sku_intent_selector.py @@ -1,14 +1,108 @@ """ -SKU selection for style-intent-aware search results. +SKU selection for style-intent-aware and image-aware search results. + +Unified algorithm (one pass per hit, no cascading fallback stages): + +1. Per active style intent, a SKU's attribute value for that dimension comes + from ONE of two sources, in priority order: + - ``option``: the SKU's own ``optionN_value`` on the slot resolved by the + intent's dimension aliases — authoritative whenever non-empty. + - ``taxonomy``: the SPU-level ``enriched_taxonomy_attributes`` value on the + same dimension — used only when the SKU has no own value (slot unresolved + or value empty). Never overrides a contradicting SKU-level value. +2. A SKU is "text-matched" iff every active intent finds a match on its + selected value source (tokens of zh/en/attribute synonyms; values are first + passed through ``_with_segment_boundaries_for_matching`` so brackets and + common separators split segments; pure-CJK terms still use a substring + fallback when the value is one undivided CJK run, e.g. ``卡其色棉``). We + remember the matching source and the raw matched + text per intent so the final decision can surface it. +3. The image-pick comes straight from the nested ``image_embedding`` inner_hits + (``exact_image_knn_query_hits`` preferred, ``image_knn_query_hits`` + otherwise): the SKU whose ``image_src`` equals the top-scoring url. +4. Unified selection: + - if the text-matched set is non-empty → pick image_pick when it lies in + that set (visual tie-break among text-matched), otherwise the first + text-matched SKU; + - else → pick image_pick if any; + - else → no decision (``final_source == "none"``). + +``final_source`` values, ordered from strongest to weakest text evidence: + ``option`` > ``taxonomy`` > ``image`` > ``none``.
If any intent was satisfied + only via taxonomy the overall source degrades to ``taxonomy`` so downstream + callers can decide whether to differentiate the SPU-level signal from a + true SKU-level option match. + +No embedding fallback, no stage cascade, no score thresholds. """ from __future__ import annotations from dataclasses import dataclass, field from typing import Any, Callable, Dict, List, Optional, Tuple +from urllib.parse import urlsplit + +from query.style_intent import ( + DetectedStyleIntent, + StyleIntentProfile, + StyleIntentRegistry, +) +from query.tokenization import ( + contains_han_text, + normalize_query_text, + simple_tokenize_query, +) + +import re + +_NON_HAN_RE = re.compile(r"[^\u4e00-\u9fff]") +# Zero-width / BOM (often pasted from Excel or CMS). +_ZW_AND_BOM_RE = re.compile(r"[\u200b-\u200d\ufeff\u2060]") +# Brackets, slashes, and common commerce/list punctuation → segment boundaries so +# tokenization can align intent terms (e.g. 卡其色) with the leading segment of +# 卡其色(无内衬) / 卡其色/常规 / 卡其色·麻 等,without relying only on substring. +_ATTRIBUTE_BOUNDARY_RE = re.compile( + r"[\s\u3000]" # ASCII / ideographic space + r"|[\(\)\[\]\{\}()【】{}〈〉《》「」『』[]「」]" + r"|[/\\||/\︱丨]" + r"|[,,、;;::.。]" + r"|[·•・]" + r"|[~~]" + r"|[+\=#%&*×※]" + r"|[\u2010-\u2015\u2212]" # hyphen, en dash, minus, etc. +) + + +def _is_pure_han(value: str) -> bool: + """True if the string is non-empty and contains only CJK Unified Ideographs.""" + return bool(value) and not _NON_HAN_RE.search(value) + + +def _with_segment_boundaries_for_matching(normalized_value: str) -> str: + """Normalize commerce-style option/taxonomy strings for token matching. + + Inserts word boundaries at brackets and typical separators so + ``simple_tokenize_query`` yields segments like ``['卡其色', '无内衬']`` instead + of one undifferentiated CJK blob when unusual punctuation appears. 
+ """ + if not normalized_value: + return "" + s = _ZW_AND_BOM_RE.sub("", normalized_value) + s = _ATTRIBUTE_BOUNDARY_RE.sub(" ", s) + return " ".join(s.split()) + + +_IMAGE_INNER_HITS_KEYS: Tuple[str, ...] = ( + "exact_image_knn_query_hits", + "image_knn_query_hits", +) -from query.style_intent import StyleIntentProfile, StyleIntentRegistry -from query.tokenization import normalize_query_text, simple_tokenize_query + +@dataclass(frozen=True) +class ImagePick: + sku_id: str + url: str + score: float @dataclass(frozen=True) @@ -16,31 +110,52 @@ class SkuSelectionDecision: selected_sku_id: Optional[str] rerank_suffix: str selected_text: str - matched_stage: str - similarity_score: Optional[float] = None + # "option" | "taxonomy" | "image" | "none" + final_source: str resolved_dimensions: Dict[str, Optional[str]] = field(default_factory=dict) + # Per-intent matching-source breakdown, e.g. {"color": "option", "size": "taxonomy"}. + matched_sources: Dict[str, str] = field(default_factory=dict) + image_pick_sku_id: Optional[str] = None + image_pick_url: Optional[str] = None + image_pick_score: Optional[float] = None + + # Backward-compat alias; some older callers/tests look at ``matched_stage``. 
+ @property + def matched_stage(self) -> str: + return self.final_source def to_dict(self) -> Dict[str, Any]: return { "selected_sku_id": self.selected_sku_id, "rerank_suffix": self.rerank_suffix, "selected_text": self.selected_text, - "matched_stage": self.matched_stage, - "similarity_score": self.similarity_score, + "final_source": self.final_source, + "matched_sources": dict(self.matched_sources), "resolved_dimensions": dict(self.resolved_dimensions), + "image_pick": ( + { + "sku_id": self.image_pick_sku_id, + "url": self.image_pick_url, + "score": self.image_pick_score, + } + if self.image_pick_sku_id or self.image_pick_url + else None + ), } @dataclass class _SelectionContext: - attribute_terms_by_intent: Dict[str, Tuple[str, ...]] + """Request-scoped memo for term tokenization and substring match probes.""" + + terms_by_intent: Dict[str, Tuple[str, ...]] normalized_text_cache: Dict[str, str] = field(default_factory=dict) tokenized_text_cache: Dict[str, Tuple[str, ...]] = field(default_factory=dict) text_match_cache: Dict[Tuple[str, str], bool] = field(default_factory=dict) class StyleSkuSelector: - """Selects the best SKU for an SPU based on detected style intent.""" + """Selects the best SKU per hit from style-intent text match + image KNN.""" def __init__( self, @@ -49,29 +164,47 @@ class StyleSkuSelector: text_encoder_getter: Optional[Callable[[], Any]] = None, ) -> None: self.registry = registry + # Retained for API back-compat; no longer used now that embedding fallback is gone. self._text_encoder_getter = text_encoder_getter + # ------------------------------------------------------------------ + # Public entry points + # ------------------------------------------------------------------ def prepare_hits( self, es_hits: List[Dict[str, Any]], parsed_query: Any, ) -> Dict[str, SkuSelectionDecision]: + """Compute selection decisions (without mutating ``_source``). + + Runs if either a style intent is active OR any hit carries image + inner_hits. 
Decisions are keyed by ES ``_id`` and meant to be applied + later via :meth:`apply_precomputed_decisions` (after page fill). + """ decisions: Dict[str, SkuSelectionDecision] = {} style_profile = getattr(parsed_query, "style_intent_profile", None) - if not isinstance(style_profile, StyleIntentProfile) or not style_profile.is_active: - return decisions - - selection_context = self._build_selection_context(style_profile) + style_active = ( + isinstance(style_profile, StyleIntentProfile) and style_profile.is_active + ) + selection_context = ( + self._build_selection_context(style_profile) if style_active else None + ) for hit in es_hits: source = hit.get("_source") if not isinstance(source, dict): continue - decision = self._select_for_source( - source, - style_profile=style_profile, + image_pick = self._pick_sku_by_image(hit, source) + if not style_active and image_pick is None: + # Nothing to do for this hit. + continue + + decision = self._select( + source=source, + style_profile=style_profile if style_active else None, selection_context=selection_context, + image_pick=image_pick, ) if decision is None: continue @@ -94,7 +227,6 @@ class StyleSkuSelector: ) -> None: if not es_hits or not decisions: return - for hit in es_hits: doc_id = hit.get("_id") if doc_id is None: @@ -111,122 +243,90 @@ class StyleSkuSelector: else: hit.pop("_style_rerank_suffix", None) + # ------------------------------------------------------------------ + # Selection context & text matching + # ------------------------------------------------------------------ def _build_selection_context( self, style_profile: StyleIntentProfile, ) -> _SelectionContext: - attribute_terms_by_intent: Dict[str, List[str]] = {} + terms_by_intent: Dict[str, List[str]] = {} for intent in style_profile.intents: - terms = attribute_terms_by_intent.setdefault(intent.intent_type, []) - for raw_term in intent.attribute_terms: + terms = terms_by_intent.setdefault(intent.intent_type, []) + for raw_term in 
intent.matching_terms: normalized_term = normalize_query_text(raw_term) - if not normalized_term or normalized_term in terms: - continue - terms.append(normalized_term) - + if normalized_term and normalized_term not in terms: + terms.append(normalized_term) return _SelectionContext( - attribute_terms_by_intent={ + terms_by_intent={ intent_type: tuple(terms) - for intent_type, terms in attribute_terms_by_intent.items() + for intent_type, terms in terms_by_intent.items() }, ) - @staticmethod - def _normalize_cached(selection_context: _SelectionContext, value: Any) -> str: + def _normalize_cached(self, ctx: _SelectionContext, value: Any) -> str: raw = str(value or "").strip() if not raw: return "" - cached = selection_context.normalized_text_cache.get(raw) + cached = ctx.normalized_text_cache.get(raw) if cached is not None: return cached normalized = normalize_query_text(raw) - selection_context.normalized_text_cache[raw] = normalized + ctx.normalized_text_cache[raw] = normalized return normalized - def _resolve_dimensions( - self, - source: Dict[str, Any], - style_profile: StyleIntentProfile, - ) -> Dict[str, Optional[str]]: - option_names = { - "option1_value": normalize_query_text(source.get("option1_name")), - "option2_value": normalize_query_text(source.get("option2_name")), - "option3_value": normalize_query_text(source.get("option3_name")), - } - resolved: Dict[str, Optional[str]] = {} - for intent in style_profile.intents: - if intent.intent_type in resolved: - continue - aliases = set(intent.dimension_aliases or self.registry.get_dimension_aliases(intent.intent_type)) - matched_field = None - for field_name, option_name in option_names.items(): - if option_name and option_name in aliases: - matched_field = field_name - break - resolved[intent.intent_type] = matched_field - return resolved - - @staticmethod - def _empty_decision( - resolved_dimensions: Dict[str, Optional[str]], - matched_stage: str, - ) -> SkuSelectionDecision: - return SkuSelectionDecision( - 
selected_sku_id=None, - rerank_suffix="", - selected_text="", - matched_stage=matched_stage, - resolved_dimensions=dict(resolved_dimensions), + def _tokenize_cached(self, ctx: _SelectionContext, value: str) -> Tuple[str, ...]: + normalized_value = normalize_query_text(value) + if not normalized_value: + return () + cached = ctx.tokenized_text_cache.get(normalized_value) + if cached is not None: + return cached + tokens = tuple( + normalize_query_text(token) + for token in simple_tokenize_query(normalized_value) + if token ) + ctx.tokenized_text_cache[normalized_value] = tokens + return tokens def _is_text_match( self, intent_type: str, - selection_context: _SelectionContext, + ctx: _SelectionContext, *, normalized_value: str, ) -> bool: + """True iff any intent term token-boundary matches the given value.""" if not normalized_value: return False - cache_key = (intent_type, normalized_value) - cached = selection_context.text_match_cache.get(cache_key) + cached = ctx.text_match_cache.get(cache_key) if cached is not None: return cached - attribute_terms = selection_context.attribute_terms_by_intent.get(intent_type, ()) - value_tokens = self._tokenize_cached(selection_context, normalized_value) + terms = ctx.terms_by_intent.get(intent_type, ()) + segmented = _with_segment_boundaries_for_matching(normalized_value) + value_tokens = self._tokenize_cached(ctx, segmented) matched = any( self._matches_term_tokens( term=term, value_tokens=value_tokens, - selection_context=selection_context, + ctx=ctx, normalized_value=normalized_value, ) - for term in attribute_terms + for term in terms if term ) - selection_context.text_match_cache[cache_key] = matched + ctx.text_match_cache[cache_key] = matched return matched - @staticmethod - def _tokenize_cached(selection_context: _SelectionContext, value: str) -> Tuple[str, ...]: - normalized_value = normalize_query_text(value) - if not normalized_value: - return () - cached = selection_context.tokenized_text_cache.get(normalized_value) 
- if cached is not None: - return cached - tokens = tuple(normalize_query_text(token) for token in simple_tokenize_query(normalized_value) if token) - selection_context.tokenized_text_cache[normalized_value] = tokens - return tokens - def _matches_term_tokens( self, *, term: str, value_tokens: Tuple[str, ...], - selection_context: _SelectionContext, + ctx: _SelectionContext, normalized_value: str, ) -> bool: normalized_term = normalize_query_text(term) @@ -234,8 +334,13 @@ class StyleSkuSelector: return False if normalized_term == normalized_value: return True - - term_tokens = self._tokenize_cached(selection_context, normalized_term) + # Pure-CJK terms can't be split further by the whitespace/regex tokenizer + # ("卡其色棉" is one token), so sliding-window token match would miss the prefix. + # Fall back to normalized substring containment — safe because this branch + # never triggers for Latin tokens where substring would cause "l" ⊂ "xl" issues. + if _is_pure_han(normalized_term) and contains_han_text(normalized_value): + return normalized_term in normalized_value + term_tokens = self._tokenize_cached(ctx, normalized_term) if not term_tokens or not value_tokens: return normalized_term in normalized_value @@ -243,106 +348,383 @@ class StyleSkuSelector: value_length = len(value_tokens) if term_length > value_length: return False - for start in range(value_length - term_length + 1): - if value_tokens[start:start + term_length] == term_tokens: + if value_tokens[start : start + term_length] == term_tokens: return True return False - def _find_first_text_match( + # ------------------------------------------------------------------ + # Dimension resolution (option slot + taxonomy values) + # ------------------------------------------------------------------ + def _resolve_dimensions( self, - skus: List[Dict[str, Any]], - resolved_dimensions: Dict[str, Optional[str]], - selection_context: _SelectionContext, - ) -> Optional[Tuple[str, str]]: - for sku in skus: - 
selection_parts: List[str] = [] - seen_parts: set[str] = set() - matched = True - - for intent_type, field_name in resolved_dimensions.items(): - if not field_name: - matched = False - break - - raw_value = str(sku.get(field_name) or "").strip() - normalized_value = self._normalize_cached(selection_context, raw_value) - if not self._is_text_match( - intent_type, - selection_context, - normalized_value=normalized_value, - ): - matched = False + source: Dict[str, Any], + style_profile: StyleIntentProfile, + ) -> Dict[str, Optional[str]]: + option_fields = ( + ("option1_value", source.get("option1_name")), + ("option2_value", source.get("option2_name")), + ("option3_value", source.get("option3_name")), + ) + option_aliases = [ + (field_name, normalize_query_text(name)) + for field_name, name in option_fields + ] + resolved: Dict[str, Optional[str]] = {} + for intent in style_profile.intents: + if intent.intent_type in resolved: + continue + aliases = set( + intent.dimension_aliases + or self.registry.get_dimension_aliases(intent.intent_type) + ) + matched_field: Optional[str] = None + for field_name, option_name in option_aliases: + if option_name and option_name in aliases: + matched_field = field_name break + resolved[intent.intent_type] = matched_field + return resolved - if raw_value and normalized_value not in seen_parts: - seen_parts.add(normalized_value) - selection_parts.append(raw_value) + def _collect_taxonomy_values( + self, + source: Dict[str, Any], + style_profile: StyleIntentProfile, + ) -> Dict[str, Tuple[Tuple[str, str], ...]]: + """Extract SPU-level enriched_taxonomy_attributes values per intent dimension. + + Returns a mapping ``intent_type -> ((normalized, raw), ...)`` so the + selection layer can (a) match against ``normalized`` and (b) surface + the human-readable ``raw`` form in ``selected_text``. 
+ """ + attrs = source.get("enriched_taxonomy_attributes") + if not isinstance(attrs, list) or not attrs: + return {} + aliases_by_intent = { + intent.intent_type: set( + intent.dimension_aliases + or self.registry.get_dimension_aliases(intent.intent_type) + ) + for intent in style_profile.intents + } + values_by_intent: Dict[str, List[Tuple[str, str]]] = { + t: [] for t in aliases_by_intent + } + for attr in attrs: + if not isinstance(attr, dict): + continue + attr_name = normalize_query_text(attr.get("name")) + if not attr_name: + continue + matching_intents = [ + t for t, aliases in aliases_by_intent.items() if attr_name in aliases + ] + if not matching_intents: + continue + for raw_text in _iter_multilingual_texts(attr.get("value")): + raw = str(raw_text).strip() + if not raw: + continue + normalized = normalize_query_text(raw) + if not normalized: + continue + for intent_type in matching_intents: + bucket = values_by_intent[intent_type] + if not any(existing_norm == normalized for existing_norm, _ in bucket): + bucket.append((normalized, raw)) + return {t: tuple(v) for t, v in values_by_intent.items() if v} + + # ------------------------------------------------------------------ + # Image pick + # ------------------------------------------------------------------ + @staticmethod + def _normalize_url(url: Any) -> str: + raw = str(url or "").strip() + if not raw: + return "" + # Accept protocol-relative URLs like "//cdn/..." or full URLs. 
+ if raw.startswith("//"): + raw = "https:" + raw + try: + parts = urlsplit(raw) + except ValueError: + return raw.casefold() + host = (parts.netloc or "").casefold() + path = parts.path or "" + return f"{host}{path}".casefold() + + def _pick_sku_by_image( + self, + hit: Dict[str, Any], + source: Dict[str, Any], + ) -> Optional[ImagePick]: + inner_hits = hit.get("inner_hits") + if not isinstance(inner_hits, dict): + return None + top_url: Optional[str] = None + top_score: Optional[float] = None + for key in _IMAGE_INNER_HITS_KEYS: + payload = inner_hits.get(key) + if not isinstance(payload, dict): + continue + hits_block = payload.get("hits") + inner_list = hits_block.get("hits") if isinstance(hits_block, dict) else None + if not isinstance(inner_list, list) or not inner_list: + continue + for entry in inner_list: + if not isinstance(entry, dict): + continue + url = (entry.get("_source") or {}).get("url") + if not url: + continue + try: + score = float(entry.get("_score") or 0.0) + except (TypeError, ValueError): + score = 0.0 + if top_score is None or score > top_score: + top_url = str(url) + top_score = score + if top_url is not None: + break # Prefer the first listed inner_hits source (exact > approx). 
+ if top_url is None: + return None - if matched: - return str(sku.get("sku_id") or ""), " ".join(selection_parts).strip() + skus = source.get("skus") + if not isinstance(skus, list): + return None + target = self._normalize_url(top_url) + for sku in skus: + sku_url = self._normalize_url(sku.get("image_src") or sku.get("imageSrc")) + if sku_url and sku_url == target: + return ImagePick( + sku_id=str(sku.get("sku_id") or ""), + url=top_url, + score=float(top_score or 0.0), + ) return None - def _select_for_source( + # ------------------------------------------------------------------ + # Unified per-hit selection + # ------------------------------------------------------------------ + def _select( self, - source: Dict[str, Any], *, - style_profile: StyleIntentProfile, - selection_context: _SelectionContext, + source: Dict[str, Any], + style_profile: Optional[StyleIntentProfile], + selection_context: Optional[_SelectionContext], + image_pick: Optional[ImagePick], ) -> Optional[SkuSelectionDecision]: skus = source.get("skus") if not isinstance(skus, list) or not skus: return None - resolved_dimensions = self._resolve_dimensions(source, style_profile) - if not resolved_dimensions or any(not field_name for field_name in resolved_dimensions.values()): - return self._empty_decision(resolved_dimensions, matched_stage="unresolved") + resolved_dimensions: Dict[str, Optional[str]] = {} + text_matched: List[Tuple[Dict[str, Any], Dict[str, Tuple[str, str]]]] = [] + + if style_profile is not None and selection_context is not None: + resolved_dimensions = self._resolve_dimensions(source, style_profile) + taxonomy_values = self._collect_taxonomy_values(source, style_profile) + # Only attempt text match when there is at least one value source + # per intent (SKU option or SPU taxonomy). 
+ if all( + resolved_dimensions.get(intent.intent_type) is not None + or taxonomy_values.get(intent.intent_type) + for intent in style_profile.intents + ): + text_matched = self._find_text_matched_skus( + skus=skus, + style_profile=style_profile, + resolved_dimensions=resolved_dimensions, + taxonomy_values=taxonomy_values, + ctx=selection_context, + ) + + selected_sku_id: Optional[str] = None + selected_text = "" + final_source = "none" + matched_sources: Dict[str, str] = {} + + if text_matched: + chosen_sku, per_intent = self._choose_among_text_matched( + text_matched, image_pick + ) + selected_sku_id = str(chosen_sku.get("sku_id") or "") or None + selected_text = self._text_from_matches(per_intent) + matched_sources = { + intent_type: src for intent_type, (src, _) in per_intent.items() + } + final_source = ( + "taxonomy" if "taxonomy" in matched_sources.values() else "option" + ) + elif image_pick is not None: + image_sku = self._find_sku_by_id(skus, image_pick.sku_id) + if image_sku is not None: + selected_sku_id = image_pick.sku_id or None + selected_text = self._build_selected_text(image_sku, resolved_dimensions) + final_source = "image" - text_match = self._find_first_text_match(skus, resolved_dimensions, selection_context) - if text_match is None: - return self._empty_decision(resolved_dimensions, matched_stage="no_match") - return self._build_decision( - selected_sku_id=text_match[0], - selected_text=text_match[1], + return SkuSelectionDecision( + selected_sku_id=selected_sku_id, + rerank_suffix=selected_text, + selected_text=selected_text, + final_source=final_source, resolved_dimensions=resolved_dimensions, - matched_stage="text", + matched_sources=matched_sources, + image_pick_sku_id=(image_pick.sku_id or None) if image_pick else None, + image_pick_url=image_pick.url if image_pick else None, + image_pick_score=image_pick.score if image_pick else None, ) - @staticmethod - def _build_decision( - selected_sku_id: str, - selected_text: str, - 
resolved_dimensions: Dict[str, Optional[str]], + def _find_text_matched_skus( + self, *, - matched_stage: str, - similarity_score: Optional[float] = None, - ) -> SkuSelectionDecision: - return SkuSelectionDecision( - selected_sku_id=selected_sku_id or None, - rerank_suffix=str(selected_text or "").strip(), - selected_text=str(selected_text or "").strip(), - matched_stage=matched_stage, - similarity_score=similarity_score, - resolved_dimensions=dict(resolved_dimensions), - ) + skus: List[Dict[str, Any]], + style_profile: StyleIntentProfile, + resolved_dimensions: Dict[str, Optional[str]], + taxonomy_values: Dict[str, Tuple[Tuple[str, str], ...]], + ctx: _SelectionContext, + ) -> List[Tuple[Dict[str, Any], Dict[str, Tuple[str, str]]]]: + """Return every SKU that satisfies every active intent, with match meta. + + Authority rule per intent: + - If the SKU has a non-empty value on the resolved option slot, that + value ALONE decides the match (source = ``option``). Taxonomy cannot + override a contradicting SKU-level value. + - Only when the SKU has no own value on the dimension (slot unresolved + or value empty) does the SPU-level taxonomy serve as the fallback + value source (source = ``taxonomy``). + + For each matched SKU we also return a per-intent dict mapping + ``intent_type -> (source, raw_matched_text)`` so the final decision can + surface the genuinely matched string in ``selected_text`` / + ``rerank_suffix`` rather than, e.g., a SKU's unrelated option value. 
+ """ + matched: List[Tuple[Dict[str, Any], Dict[str, Tuple[str, str]]]] = [] + for sku in skus: + per_intent: Dict[str, Tuple[str, str]] = {} + all_ok = True + for intent in style_profile.intents: + slot = resolved_dimensions.get(intent.intent_type) + sku_raw = str(sku.get(slot) or "").strip() if slot else "" + sku_norm = normalize_query_text(sku_raw) if sku_raw else "" + + if sku_norm: + if self._is_text_match( + intent.intent_type, ctx, normalized_value=sku_norm + ): + per_intent[intent.intent_type] = ("option", sku_raw) + else: + all_ok = False + break + else: + matched_raw: Optional[str] = None + for tax_norm, tax_raw in taxonomy_values.get( + intent.intent_type, () + ): + if self._is_text_match( + intent.intent_type, ctx, normalized_value=tax_norm + ): + matched_raw = tax_raw + break + if matched_raw is None: + all_ok = False + break + per_intent[intent.intent_type] = ("taxonomy", matched_raw) + if all_ok: + matched.append((sku, per_intent)) + return matched + + @staticmethod + def _choose_among_text_matched( + text_matched: List[Tuple[Dict[str, Any], Dict[str, Tuple[str, str]]]], + image_pick: Optional[ImagePick], + ) -> Tuple[Dict[str, Any], Dict[str, Tuple[str, str]]]: + """Image-visual tie-break inside the text-matched set; else first match.""" + if image_pick and image_pick.sku_id: + for sku, per_intent in text_matched: + if str(sku.get("sku_id") or "") == image_pick.sku_id: + return sku, per_intent + return text_matched[0] + + @staticmethod + def _text_from_matches(per_intent: Dict[str, Tuple[str, str]]) -> str: + """Join the genuinely matched raw strings in intent declaration order.""" + parts: List[str] = [] + seen: set[str] = set() + for _, raw in per_intent.values(): + if raw and raw not in seen: + seen.add(raw) + parts.append(raw) + return " ".join(parts).strip() @staticmethod - def _apply_decision_to_source(source: Dict[str, Any], decision: SkuSelectionDecision) -> None: + def _find_sku_by_id( + skus: List[Dict[str, Any]], sku_id: Optional[str] + 
) -> Optional[Dict[str, Any]]: + if not sku_id: + return None + for sku in skus: + if str(sku.get("sku_id") or "") == sku_id: + return sku + return None + + @staticmethod + def _build_selected_text( + sku: Dict[str, Any], + resolved_dimensions: Dict[str, Optional[str]], + ) -> str: + """Text carried into rerank doc suffix: joined raw values on the resolved slots.""" + parts: List[str] = [] + seen: set[str] = set() + for slot in resolved_dimensions.values(): + if not slot: + continue + raw = str(sku.get(slot) or "").strip() + if raw and raw not in seen: + seen.add(raw) + parts.append(raw) + return " ".join(parts).strip() + + # ------------------------------------------------------------------ + # Source mutation (applied after page fill) + # ------------------------------------------------------------------ + @staticmethod + def _apply_decision_to_source( + source: Dict[str, Any], decision: SkuSelectionDecision + ) -> None: + if not decision.selected_sku_id: + return skus = source.get("skus") - if not isinstance(skus, list) or not skus or not decision.selected_sku_id: + if not isinstance(skus, list) or not skus: return - - selected_index = None + selected_index: Optional[int] = None for index, sku in enumerate(skus): if str(sku.get("sku_id") or "") == decision.selected_sku_id: selected_index = index break if selected_index is None: return - selected_sku = skus.pop(selected_index) skus.insert(0, selected_sku) - image_src = selected_sku.get("image_src") or selected_sku.get("imageSrc") if image_src: source["image_url"] = image_src + + +def _iter_multilingual_texts(value: Any) -> List[str]: + """Flatten a value that may be str, list, or multilingual dict {zh, en, ...}.""" + if value is None: + return [] + if isinstance(value, str): + return [value] if value.strip() else [] + if isinstance(value, dict): + out: List[str] = [] + for v in value.values(): + out.extend(_iter_multilingual_texts(v)) + return out + if isinstance(value, (list, tuple)): + out = [] + for v in 
value: + out.extend(_iter_multilingual_texts(v)) + return out + return [] diff --git a/tests/test_search_rerank_window.py b/tests/test_search_rerank_window.py index b336e33..d905bf1 100644 --- a/tests/test_search_rerank_window.py +++ b/tests/test_search_rerank_window.py @@ -231,19 +231,6 @@ def _build_searcher(config: SearchConfig, es_client: _FakeESClient) -> Searcher: return searcher -class _FakeTextEncoder: - def __init__(self, vectors: Dict[str, List[float]]): - self.vectors = { - key: np.array(value, dtype=np.float32) - for key, value in vectors.items() - } - - def encode(self, sentences, priority: int = 0, **kwargs): - if isinstance(sentences, str): - sentences = [sentences] - return np.array([self.vectors[text] for text in sentences], dtype=object) - - def test_config_loader_rerank_enabled_defaults_true(tmp_path: Path): config_data = { "es_index_name": "test_products", @@ -611,7 +598,14 @@ def test_searcher_rerank_prefetch_source_includes_sku_fields_when_style_intent_a assert es_client.calls[0]["body"]["_source"] is False assert es_client.calls[1]["body"]["_source"] == { - "includes": ["option1_name", "option2_name", "option3_name", "skus", "title"] + "includes": [ + "enriched_taxonomy_attributes", + "option1_name", + "option2_name", + "option3_name", + "skus", + "title", + ] } @@ -944,78 +938,6 @@ def test_searcher_skips_sku_selection_when_option_name_does_not_match_dimension_ assert result.results[0].image_url == "https://img/default.jpg" -def test_searcher_promotes_sku_by_embedding_when_query_has_no_direct_option_match(monkeypatch): - es_client = _FakeESClient(total_hits=1) - searcher = _build_searcher(_build_search_config(rerank_enabled=False), es_client) - context = create_request_context(reqid="sku-embed", uid="u-sku-embed") - - monkeypatch.setattr( - "search.searcher.get_tenant_config_loader", - lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en"]}), - ) - - encoder = _FakeTextEncoder( - { - "linen summer dress": 
[0.8, 0.2], - "red": [1.0, 0.0], - "blue": [0.0, 1.0], - } - ) - - class _EmbeddingQueryParser: - text_encoder = encoder - - def parse( - self, - query: str, - tenant_id: str, - generate_vector: bool, - context: Any, - target_languages: Any = None, - ): - return _FakeParsedQuery( - original_query=query, - query_normalized=query, - rewritten_query=query, - translations={}, - query_vector=np.array([0.0, 1.0], dtype=np.float32), - style_intent_profile=_build_style_intent_profile( - "color", "blue", "color", "colors", "颜色" - ), - ) - - searcher.query_parser = _EmbeddingQueryParser() - - def _full_source_with_skus(doc_id: str) -> Dict[str, Any]: - return { - "spu_id": doc_id, - "title": {"en": f"product-{doc_id}"}, - "brief": {"en": f"brief-{doc_id}"}, - "vendor": {"en": f"vendor-{doc_id}"}, - "option1_name": "Color", - "image_url": "https://img/default.jpg", - "skus": [ - {"sku_id": "sku-red", "option1_value": "Red", "image_src": "https://img/red.jpg"}, - {"sku_id": "sku-blue", "option1_value": "Blue", "image_src": "https://img/blue.jpg"}, - ], - } - - monkeypatch.setattr(_FakeESClient, "_full_source", staticmethod(_full_source_with_skus)) - - result = searcher.search( - query="linen summer dress", - tenant_id="162", - from_=0, - size=1, - context=context, - enable_rerank=False, - ) - - assert len(result.results) == 1 - assert result.results[0].skus[0].sku_id == "sku-blue" - assert result.results[0].image_url == "https://img/blue.jpg" - - def test_searcher_debug_info_uses_initial_es_max_score_for_normalization(monkeypatch): es_client = _FakeESClient(total_hits=3) cfg = _build_search_config(rerank_enabled=False) diff --git a/tests/test_sku_intent_selector.py b/tests/test_sku_intent_selector.py index b387747..19dc705 100644 --- a/tests/test_sku_intent_selector.py +++ b/tests/test_sku_intent_selector.py @@ -1,5 +1,7 @@ from types import SimpleNamespace +import pytest + from config import QueryConfig from query.style_intent import DetectedStyleIntent, StyleIntentProfile, 
StyleIntentRegistry from search.sku_intent_selector import StyleSkuSelector @@ -57,7 +59,9 @@ def test_style_sku_selector_matches_first_sku_by_attribute_terms(): assert decision.selected_sku_id == "2" assert decision.selected_text == "Navy Blue X-Large" - assert decision.matched_stage == "text" + assert decision.final_source == "option" + assert decision.matched_sources == {"color": "option", "size": "option"} + assert decision.matched_stage == "option" # back-compat alias selector.apply_precomputed_decisions(hits, decisions) @@ -103,7 +107,7 @@ def test_style_sku_selector_returns_no_match_without_attribute_contains(): decisions = selector.prepare_hits(hits, parsed_query) assert decisions["spu-1"].selected_sku_id is None - assert decisions["spu-1"].matched_stage == "no_match" + assert decisions["spu-1"].final_source == "none" def test_is_text_match_uses_token_boundaries_for_sizes(): @@ -195,3 +199,341 @@ def test_is_text_match_handles_punctuation_and_descriptive_attribute_values(): assert selector._is_text_match("style", selection_context, normalized_value="off-white/lined") assert selector._is_text_match("accessory", selection_context, normalized_value="army green + headscarf") assert selector._is_text_match("size", selection_context, normalized_value="2xl recommended 65-70kg") + + +def _khaki_intent() -> DetectedStyleIntent: + """Mirrors what StyleIntentDetector now emits (all_terms union of zh/en/attribute).""" + return DetectedStyleIntent( + intent_type="color", + canonical_value="beige", + matched_term="卡其色", + matched_query_text="卡其色", + attribute_terms=("beige", "khaki"), + dimension_aliases=("color", "颜色"), + all_terms=("米色", "卡其色", "beige", "khaki"), + ) + + +def _color_registry() -> StyleIntentRegistry: + return StyleIntentRegistry.from_query_config( + QueryConfig( + style_intent_terms={ + "color": [ + { + "en_terms": ["beige", "khaki"], + "zh_terms": ["米色", "卡其色"], + "attribute_terms": ["beige", "khaki"], + } + ], + }, + 
style_intent_dimension_aliases={"color": ["color", "颜色"]}, + ) + ) + + +def test_zh_color_intent_matches_noisy_option_value(): + """卡其色裙子 → SKU 的 option1_value 以"卡其色"开头但带 V 领等后缀,也应命中。""" + selector = StyleSkuSelector(_color_registry()) + parsed_query = SimpleNamespace( + style_intent_profile=StyleIntentProfile(intents=(_khaki_intent(),)) + ) + hits = [ + { + "_id": "spu-1", + "_source": { + "option1_name": "颜色", + "skus": [ + {"sku_id": "1", "option1_value": "黑色长裙"}, + {"sku_id": "2", "option1_value": "卡其色v领收腰长裙【常规款】"}, + ], + }, + } + ] + decisions = selector.prepare_hits(hits, parsed_query) + assert decisions["spu-1"].selected_sku_id == "2" + assert decisions["spu-1"].final_source == "option" + + +@pytest.mark.parametrize( + "option_value", + [ + "卡其色(无内衬)", + "卡其色(无内衬)", + "卡其色【常规款】", + "卡其色/常规款", + "卡其色·无内衬", + "卡其色 - 常规", + "卡其色,常规", + "卡其色|常规", + "卡其色—加厚", + ], +) +def test_zh_color_intent_matches_various_brackets_and_separators(option_value: str): + selector = StyleSkuSelector(_color_registry()) + parsed_query = SimpleNamespace( + style_intent_profile=StyleIntentProfile(intents=(_khaki_intent(),)) + ) + hits = [ + { + "_id": "spu-1", + "_source": { + "option1_name": "颜色", + "skus": [ + {"sku_id": "441670", "option1_value": "白色(无内衬)"}, + {"sku_id": "441679", "option1_value": option_value}, + ], + }, + } + ] + assert selector.prepare_hits(hits, parsed_query)["spu-1"].selected_sku_id == "441679" + + +def test_zh_color_intent_matches_noisy_option_value_with_fullwidth_parens(): + """卡其色(无内衬) 是前面 taxonomy-override bug 的实地复现;option 分支现在必须命中。""" + selector = StyleSkuSelector(_color_registry()) + parsed_query = SimpleNamespace( + style_intent_profile=StyleIntentProfile(intents=(_khaki_intent(),)) + ) + hits = [ + { + "_id": "spu-1", + "_source": { + "option1_name": "颜色", + # Even if SPU-level taxonomy existed, white SKU must NOT leak in. 
+ "enriched_taxonomy_attributes": [ + {"name": "Color", "value": {"zh": "卡其色"}} + ], + "skus": [ + {"sku_id": "441670", "option1_value": "白色(无内衬)"}, + {"sku_id": "441679", "option1_value": "卡其色(无内衬)"}, + ], + }, + } + ] + decisions = selector.prepare_hits(hits, parsed_query) + d = decisions["spu-1"] + assert d.selected_sku_id == "441679" + assert d.selected_text == "卡其色(无内衬)" + assert d.final_source == "option" + assert d.matched_sources == {"color": "option"} + + +def test_taxonomy_attribute_extends_text_matching_source(): + """即使 optionN 无法区分 SKU,enriched_taxonomy_attributes 的 Color 也可让 SPU 全部 SKU 通过文本 + 匹配,之后由图像 pick(若有)决定具体 SKU;无图像则取首个。""" + selector = StyleSkuSelector(_color_registry()) + parsed_query = SimpleNamespace( + style_intent_profile=StyleIntentProfile(intents=(_khaki_intent(),)) + ) + hits = [ + { + "_id": "spu-1", + "_source": { + "option1_name": "Style", # unrelated dimension → slot unresolved + "enriched_taxonomy_attributes": [ + {"name": "Color", "value": {"zh": "卡其色", "en": "khaki"}} + ], + "skus": [ + {"sku_id": "a", "option1_value": "A"}, + {"sku_id": "b", "option1_value": "B"}, + ], + }, + } + ] + decisions = selector.prepare_hits(hits, parsed_query) + # Taxonomy matches → both SKUs text-matched; no image pick → first one wins. + d = decisions["spu-1"] + assert d.selected_sku_id == "a" + assert d.final_source == "taxonomy" + # selected_text reflects the real matched taxonomy value, not SKU's unrelated option. 
+ assert d.selected_text == "卡其色" + assert d.matched_sources == {"color": "taxonomy"} + + +def test_taxonomy_does_not_override_contradicting_sku_option_value(): + """SPU 级 taxonomy 说"卡其色",但 SKU 自己的 option1_value 是"白色(无内衬)", + 该 SKU 不应被视作文本命中——避免 SPU 级信号把错色 SKU 顶上去。""" + selector = StyleSkuSelector(_color_registry()) + parsed_query = SimpleNamespace( + style_intent_profile=StyleIntentProfile(intents=(_khaki_intent(),)) + ) + hits = [ + { + "_id": "spu-1", + "_source": { + "option1_name": "颜色", + "enriched_taxonomy_attributes": [ + {"name": "Color", "value": {"zh": "卡其色", "en": "khaki"}} + ], + "skus": [ + {"sku_id": "white", "option1_value": "白色(无内衬)"}, + {"sku_id": "khaki", "option1_value": "卡其色棉"}, + ], + }, + } + ] + decisions = selector.prepare_hits(hits, parsed_query) + # 只有 khaki 自有值匹配;taxonomy 不会把 white 顶出来。 + assert decisions["spu-1"].selected_sku_id == "khaki" + assert decisions["spu-1"].final_source == "option" + + +def test_taxonomy_fills_in_only_when_sku_self_value_is_empty(): + """混合场景:SKU 1 无 option1_value → taxonomy 接管;SKU 2 自带白色 → 不匹配。""" + selector = StyleSkuSelector(_color_registry()) + parsed_query = SimpleNamespace( + style_intent_profile=StyleIntentProfile(intents=(_khaki_intent(),)) + ) + hits = [ + { + "_id": "spu-1", + "_source": { + "option1_name": "颜色", + "enriched_taxonomy_attributes": [ + {"name": "Color", "value": {"zh": "卡其色"}} + ], + "skus": [ + {"sku_id": "no-value", "option1_value": ""}, + {"sku_id": "white", "option1_value": "白色"}, + ], + }, + } + ] + decisions = selector.prepare_hits(hits, parsed_query) + d = decisions["spu-1"] + assert d.selected_sku_id == "no-value" + assert d.final_source == "taxonomy" + assert d.selected_text == "卡其色" + + +def test_image_pick_serves_as_visual_tiebreak_within_text_matched(): + selector = StyleSkuSelector(_color_registry()) + parsed_query = SimpleNamespace( + style_intent_profile=StyleIntentProfile(intents=(_khaki_intent(),)) + ) + hits = [ + { + "_id": "spu-1", + "_source": { + "option1_name": 
"颜色", + "skus": [ + { + "sku_id": "khaki-cotton", + "option1_value": "卡其色棉", + "image_src": "https://cdn/x/khaki-cotton.jpg", + }, + { + "sku_id": "khaki-linen", + "option1_value": "卡其色麻", + "image_src": "https://cdn/x/khaki-linen.jpg", + }, + ], + }, + "inner_hits": { + "exact_image_knn_query_hits": { + "hits": { + "hits": [ + { + "_score": 0.87, + "_source": {"url": "https://cdn/x/khaki-linen.jpg"}, + } + ] + } + } + }, + } + ] + decisions = selector.prepare_hits(hits, parsed_query) + decision = decisions["spu-1"] + assert decision.selected_sku_id == "khaki-linen" + assert decision.final_source == "option" + assert decision.image_pick_sku_id == "khaki-linen" + assert decision.image_pick_score == 0.87 + + +def test_image_only_selection_when_no_style_intent(): + """无款式意图:仅凭 image_embedding 最近邻 SKU,直接把该 SKU 置顶。""" + selector = StyleSkuSelector(_color_registry()) + parsed_query = SimpleNamespace(style_intent_profile=None) + hits = [ + { + "_id": "spu-1", + "_source": { + "option1_name": "Color", + "image_url": "https://cdn/x/default.jpg", + "skus": [ + { + "sku_id": "red", + "option1_value": "Red", + "image_src": "https://cdn/x/red.jpg", + }, + { + "sku_id": "blue", + "option1_value": "Blue", + "image_src": "https://cdn/x/blue.jpg", + }, + ], + }, + "inner_hits": { + "image_knn_query_hits": { + "hits": { + "hits": [ + {"_score": 0.74, "_source": {"url": "https://cdn/x/blue.jpg"}} + ] + } + } + }, + } + ] + decisions = selector.prepare_hits(hits, parsed_query) + decision = decisions["spu-1"] + assert decision.selected_sku_id == "blue" + assert decision.final_source == "image" + + selector.apply_precomputed_decisions(hits, decisions) + source = hits[0]["_source"] + assert source["skus"][0]["sku_id"] == "blue" + assert source["image_url"] == "https://cdn/x/blue.jpg" + + +def test_image_pick_ignored_when_text_matches_but_visual_url_not_in_text_set(): + """文本命中优先:image-pick 若落在非文本命中 SKU,则不接管。""" + selector = StyleSkuSelector(_color_registry()) + parsed_query = 
SimpleNamespace( + style_intent_profile=StyleIntentProfile(intents=(_khaki_intent(),)) + ) + hits = [ + { + "_id": "spu-1", + "_source": { + "option1_name": "颜色", + "skus": [ + { + "sku_id": "khaki", + "option1_value": "卡其色", + "image_src": "https://cdn/x/khaki.jpg", + }, + { + "sku_id": "black", + "option1_value": "黑色", + "image_src": "https://cdn/x/black.jpg", + }, + ], + }, + "inner_hits": { + "exact_image_knn_query_hits": { + "hits": { + "hits": [ + {"_score": 0.9, "_source": {"url": "https://cdn/x/black.jpg"}} + ] + } + } + }, + } + ] + decisions = selector.prepare_hits(hits, parsed_query) + decision = decisions["spu-1"] + # Hard text-first: khaki stays, though image pointed at black. + assert decision.selected_sku_id == "khaki" + assert decision.final_source == "option" + assert decision.image_pick_sku_id == "black" -- libgit2 0.21.2