From 2efad04b21072915abc94097289cee2cb1287888 Mon Sep 17 00:00:00 2001 From: tangwang Date: Tue, 24 Mar 2026 15:42:46 +0800 Subject: [PATCH] 意图匹配的性能优化: 上面一版实现,性能上完全无法接受。因此进行了一轮策略简化 --- context/request_context.py | 2 -- docs/TODO-意图判断.md | 46 ++++++++++++++++++++++++++++++++++++++++++++++ frontend/static/js/app.js | 44 +++++++++++++++++++++++++++++++++++++++++++- search/searcher.py | 7 ++----- search/sku_intent_selector.py | 245 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------------------------------------------------------------------------------------- tests/test_search_rerank_window.py | 133 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 6 files changed, 360 insertions(+), 117 deletions(-) diff --git a/context/request_context.py b/context/request_context.py index 22d6875..4539ad4 100644 --- a/context/request_context.py +++ b/context/request_context.py @@ -42,7 +42,6 @@ class QueryAnalysisResult: query_vector: Optional[List[float]] = None boolean_ast: Optional[str] = None is_simple_query: bool = True - domain: str = "default" @dataclass @@ -281,7 +280,6 @@ class RequestContext: 'query_normalized': self.query_analysis.query_normalized, 'rewritten_query': self.query_analysis.rewritten_query, 'detected_language': self.query_analysis.detected_language, - 'domain': self.query_analysis.domain, 'is_simple_query': self.query_analysis.is_simple_query }, 'performance': { diff --git a/docs/TODO-意图判断.md b/docs/TODO-意图判断.md index 4686d4d..ce70c55 100644 --- a/docs/TODO-意图判断.md +++ b/docs/TODO-意图判断.md @@ -51,3 +51,49 @@ QueryParser 负责产出统一的“款式意图分析结果”,包含:命 Searcher 只编排:解析 query -> 首次 ES 召回 -> 若有款式意图则对 rerank 窗口内全部 hits 做 SKU 预筛选并补充 rerank doc 文本 -> run_rerank -> 分页 -> page fill -> ResultFormatter。这样旧的页内 SKU 
置顶逻辑可以下沉或删除,避免前后两个阶段重复做同一件事。我再补看一下你最近写的意图草稿文档,确认有没有现成词表示例或边界条件,避免我这边把词表格式和你预期做偏。 + + + +意图匹配的性能优化: +上面一版实现,性能上完全无法接受。因此进行了一轮策略简化 + +prompt +style_sku_prepare_hits阶段耗时太长。请根据需求,思考优化的方法,给出性能优化的方案。 +1. _select_by_embedding,有缓存吗,option_value的值是有限的,之前已经算过的,就不用再算了。不仅仅是embedding相似的结果,整个option_value的匹配结果,是有包含、还是没包含,相似度多少,都不用重新计算。比如之前已经有一个sku的某个属性值叫做“卡其色”,已经算出来是否文本匹配了,那么不需要再去做文本匹配。如果已经算出来向量的相似度,那么不需要再去取向量以及计算相似度。 +2. 匹配可以适当的优化: +匹配流程简化: +1)找到第一个文本匹配的,如果有直接匹配成功。不需要考虑匹配多个的情况。 +2)如果全部都没有匹配,那么进行embedding筛选。 + +匹配规则: +option_name的匹配,直接看规范化后的option_name是不是意图维度的泛化词之一(比如颜色、color、colour),如果没有匹配的,现在应该是把所有维度都算上,这样匹配成本和比较成本太高了,去掉这些逻辑,这种情况不需要加后缀、不需要选择sku。 +option_value的匹配。意图检测的时候,有匹配的query中的命中的词,这个词被包含在属性值中,那么就算匹配。属性值被包含在query(包括翻译文本)中,也算匹配。提高匹配的覆盖率。 + +3. 这一阶段得到sku选择的结果即可(选中的sku的id,也可以为空值表示没找到匹配成功的,这种情况不需要拼接title后缀给重排输入),但是不用着急做image_url的替换和sku的置顶。等最后填充的时候判断有选中sku的时候直接做替换和置顶即可。 +请你思考如何进行设计,提高性能的时候不带来复杂度的提升,可以适当的重构以降低修改后的代码行数。 +@search/sku_intent_selector.py @query/style_intent.py + + +修改结果: +核心变化在 `search/sku_intent_selector.py` 和 `search/searcher.py`。 + +现在的策略是: +- `option_name` 只做规范化后 alias 精确匹配;任何意图维度没 resolve 到字段,就直接不选 SKU、不加 suffix。 +- `option_value` 文本匹配改成顺序扫描 SKU,找到第一个命中的就直接返回,不再收集多个候选再比较。 +- 文本匹配规则支持两种: + - query 命中的意图词被属性值包含 + - 属性值被 query/translation 包含 +- 只有当所有 SKU 都没有文本命中时,才做 embedding 选择。 +- `prepare_hits()` 现在只产出决策和 `_style_rerank_suffix`,不再提前改 `_source`;真正的 `image_url` 替换和 SKU 置顶统一后移到 `apply_precomputed_decisions()`。 +- 加了请求级缓存,避免同一个 `option_value`/`selection_text` 在一次请求里重复做文本判断、取向量和算相似度。 +- 顺手删掉了旧的 generalized match / fallback_text 路径,代码比之前更短也更直。 + +另外补了回归测试,覆盖了: +- 翻译文本命中后正确选 SKU +- 多个文本命中时取第一个 +- `option_name` 不命中 alias 时不做 SKU 选择 +- 无文本命中时走 embedding fallback + +验证过: +- `pytest tests/test_search_rerank_window.py -q` 通过 +- 变更文件 lint 无报错 diff --git a/frontend/static/js/app.js b/frontend/static/js/app.js index a8a0630..23363ce 100644 --- a/frontend/static/js/app.js +++ b/frontend/static/js/app.js @@ -885,6 +885,40 @@ function goToPage(page) { window.scrollTo({ top: 0,
behavior: 'smooth' }); } +/** Query-analysis intent block: dimensions, matched surface form, canonical value, source query variant. */ +function formatIntentDetectionHtml(intent) { + const profile = intent || null; + let block = '
intent_detection:
'; + if (!profile || typeof profile !== 'object') { + block += '
(no intent payload — style intent may be disabled or context missing)
'; + return block; + } + const active = !!profile.active; + block += `
active: ${active ? 'yes' : 'no'}
`; + const intents = Array.isArray(profile.intents) ? profile.intents : []; + if (!intents.length) { + block += '
intents: (none — no vocabulary match on query variants)
'; + return block; + } + block += '
intents:
'; + if (Array.isArray(profile.query_variants) && profile.query_variants.length > 0) { + block += '
query_variants:
'; + block += `
${escapeHtml(customStringify(profile.query_variants))}
`; + } + return block; +} + // Display debug info function displayDebugInfo(data) { const debugInfoDiv = document.getElementById('debugInfo'); @@ -916,7 +950,6 @@ function displayDebugInfo(data) { html += `
detected_language: ${escapeHtml(debugInfo.query_analysis.detected_language || 'N/A')}
`; html += `
index_languages: ${escapeHtml((debugInfo.query_analysis.index_languages || []).join(', ') || 'N/A')}
`; html += `
query_tokens: ${escapeHtml((debugInfo.query_analysis.query_tokens || []).join(', ') || 'N/A')}
`; - html += `
domain: ${escapeHtml(debugInfo.query_analysis.domain || 'default')}
`; html += `
is_simple_query: ${debugInfo.query_analysis.is_simple_query ? 'yes' : 'no'}
`; if (debugInfo.query_analysis.translations && Object.keys(debugInfo.query_analysis.translations).length > 0) { @@ -932,6 +965,9 @@ function displayDebugInfo(data) { if (debugInfo.query_analysis.boolean_ast) { html += `
boolean_ast: ${escapeHtml(debugInfo.query_analysis.boolean_ast)}
`; } + + const intentPayload = debugInfo.query_analysis.intent_detection ?? debugInfo.query_analysis.style_intent_profile; + html += formatIntentDetectionHtml(intentPayload); html += ''; } @@ -942,6 +978,12 @@ function displayDebugInfo(data) { html += `
translation_enabled: ${debugInfo.feature_flags.translation_enabled ? 'enabled' : 'disabled'}
`; html += `
embedding_enabled: ${debugInfo.feature_flags.embedding_enabled ? 'enabled' : 'disabled'}
`; html += `
rerank_enabled: ${debugInfo.feature_flags.rerank_enabled ? 'enabled' : 'disabled'}
`; + if (debugInfo.feature_flags.style_intent_enabled !== undefined) { + html += `
style_intent_enabled: ${debugInfo.feature_flags.style_intent_enabled ? 'enabled' : 'disabled'}
`; + } + if (debugInfo.feature_flags.style_intent_active !== undefined) { + html += `
style_intent_active: ${debugInfo.feature_flags.style_intent_active ? 'yes' : 'no'}
`; + } html += ''; } diff --git a/search/searcher.py b/search/searcher.py index e3e7138..d7bccea 100644 --- a/search/searcher.py +++ b/search/searcher.py @@ -119,7 +119,6 @@ class Searcher: self.style_sku_selector = StyleSkuSelector( self.style_intent_registry, text_encoder_getter=lambda: getattr(self.query_parser, "text_encoder", None), - tokenizer_getter=lambda: getattr(self.query_parser, "_tokenizer", None), ) # Query builder - simplified single-layer architecture @@ -397,7 +396,6 @@ class Searcher: detected_language=parsed_query.detected_language, translations=parsed_query.translations, query_vector=parsed_query.query_vector.tolist() if parsed_query.query_vector is not None else None, - domain="default", is_simple_query=True ) context.metadata["feature_flags"]["style_intent_active"] = self._has_style_intent(parsed_query) @@ -732,7 +730,7 @@ class Searcher: rerank_debug_by_doc[str(doc_id)] = item if self._has_style_intent(parsed_query): - if in_rerank_window and style_intent_decisions: + if style_intent_decisions: self.style_sku_selector.apply_precomputed_decisions( es_hits, style_intent_decisions, @@ -867,8 +865,7 @@ class Searcher: "has_vector": context.query_analysis.query_vector is not None, "query_tokens": getattr(parsed_query, "query_tokens", []), "is_simple_query": context.query_analysis.is_simple_query, - "domain": context.query_analysis.domain, - "style_intent_profile": context.get_intermediate_result("style_intent_profile"), + "intent_detection": context.get_intermediate_result("style_intent_profile"), }, "es_query": context.get_intermediate_result('es_query', {}), "es_query_context": { diff --git a/search/sku_intent_selector.py b/search/sku_intent_selector.py index c832573..4f9216a 100644 --- a/search/sku_intent_selector.py +++ b/search/sku_intent_selector.py @@ -5,7 +5,7 @@ SKU selection for style-intent-aware search results. 
from __future__ import annotations from dataclasses import dataclass, field -from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Tuple +from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple import numpy as np @@ -39,7 +39,18 @@ class _SkuCandidate: sku_id: str sku: Dict[str, Any] selection_text: str - intent_texts: Dict[str, str] + normalized_selection_text: str + intent_values: Dict[str, str] + + +@dataclass +class _SelectionContext: + query_texts: Tuple[str, ...] + matched_terms_by_intent: Dict[str, Tuple[str, ...]] + query_vector: Optional[np.ndarray] + text_match_cache: Dict[Tuple[str, str], bool] = field(default_factory=dict) + selection_vector_cache: Dict[str, Optional[np.ndarray]] = field(default_factory=dict) + similarity_cache: Dict[str, Optional[float]] = field(default_factory=dict) class StyleSkuSelector: @@ -50,11 +61,9 @@ class StyleSkuSelector: registry: StyleIntentRegistry, *, text_encoder_getter: Optional[Callable[[], Any]] = None, - tokenizer_getter: Optional[Callable[[], Any]] = None, ) -> None: self.registry = registry self._text_encoder_getter = text_encoder_getter - self._tokenizer_getter = tokenizer_getter def prepare_hits( self, @@ -66,9 +75,7 @@ class StyleSkuSelector: if not isinstance(style_profile, StyleIntentProfile) or not style_profile.is_active: return decisions - query_texts = self._build_query_texts(parsed_query, style_profile) - query_vector = self._get_query_vector(parsed_query) - tokenizer = self._get_tokenizer() + selection_context = self._build_selection_context(parsed_query, style_profile) for hit in es_hits: source = hit.get("_source") @@ -78,16 +85,15 @@ class StyleSkuSelector: decision = self._select_for_source( source, style_profile=style_profile, - query_texts=query_texts, - query_vector=query_vector, - tokenizer=tokenizer, + selection_context=selection_context, ) if decision is None: continue - self._apply_decision_to_source(source, decision) if decision.rerank_suffix: 
hit["_style_rerank_suffix"] = decision.rerank_suffix + else: + hit.pop("_style_rerank_suffix", None) doc_id = hit.get("_id") if doc_id is not None: @@ -116,6 +122,8 @@ class StyleSkuSelector: self._apply_decision_to_source(source, decision) if decision.rerank_suffix: hit["_style_rerank_suffix"] = decision.rerank_suffix + else: + hit.pop("_style_rerank_suffix", None) def _build_query_texts( self, @@ -165,25 +173,34 @@ class StyleSkuSelector: return None return np.asarray(vectors[0], dtype=np.float32) + def _build_selection_context( + self, + parsed_query: Any, + style_profile: StyleIntentProfile, + ) -> _SelectionContext: + matched_terms_by_intent: Dict[str, List[str]] = {} + for intent in style_profile.intents: + normalized_term = normalize_query_text(intent.matched_term) + if not normalized_term: + continue + matched_terms = matched_terms_by_intent.setdefault(intent.intent_type, []) + if normalized_term not in matched_terms: + matched_terms.append(normalized_term) + + return _SelectionContext( + query_texts=tuple(self._build_query_texts(parsed_query, style_profile)), + matched_terms_by_intent={ + intent_type: tuple(terms) + for intent_type, terms in matched_terms_by_intent.items() + }, + query_vector=self._get_query_vector(parsed_query), + ) + def _get_text_encoder(self) -> Any: if self._text_encoder_getter is None: return None return self._text_encoder_getter() - def _get_tokenizer(self) -> Any: - if self._tokenizer_getter is None: - return None - return self._tokenizer_getter() - - @staticmethod - def _fallback_sku_text(sku: Dict[str, Any]) -> str: - parts = [] - for field_name in ("option1_value", "option2_value", "option3_value"): - value = str(sku.get(field_name) or "").strip() - if value: - parts.append(value) - return " ".join(parts) - def _resolve_dimensions( self, source: Dict[str, Any], @@ -212,157 +229,171 @@ class StyleSkuSelector: skus: List[Dict[str, Any]], resolved_dimensions: Dict[str, Optional[str]], ) -> List[_SkuCandidate]: + if not 
resolved_dimensions or any(not field_name for field_name in resolved_dimensions.values()): + return [] + candidates: List[_SkuCandidate] = [] for index, sku in enumerate(skus): - fallback_text = self._fallback_sku_text(sku) - intent_texts: Dict[str, str] = {} + intent_values: Dict[str, str] = {} for intent_type, field_name in resolved_dimensions.items(): - if field_name: - value = str(sku.get(field_name) or "").strip() - intent_texts[intent_type] = value or fallback_text - else: - intent_texts[intent_type] = fallback_text + if not field_name: + continue + intent_values[intent_type] = str(sku.get(field_name) or "").strip() selection_parts: List[str] = [] seen = set() - for value in intent_texts.values(): + for value in intent_values.values(): normalized = normalize_query_text(value) if not normalized or normalized in seen: continue seen.add(normalized) - selection_parts.append(str(value).strip()) + selection_parts.append(value) - selection_text = " ".join(selection_parts).strip() or fallback_text + selection_text = " ".join(selection_parts).strip() candidates.append( _SkuCandidate( index=index, sku_id=str(sku.get("sku_id") or ""), sku=sku, selection_text=selection_text, - intent_texts=intent_texts, + normalized_selection_text=normalize_query_text(selection_text), + intent_values=intent_values, ) ) return candidates @staticmethod - def _is_direct_match( - candidate: _SkuCandidate, - query_texts: Sequence[str], - ) -> bool: - if not candidate.intent_texts or not query_texts: - return False - for value in candidate.intent_texts.values(): - normalized_value = normalize_query_text(value) - if not normalized_value: - return False - if not any(normalized_value in query_text for query_text in query_texts): - return False - return True - - def _is_generalized_match( + def _empty_decision( + resolved_dimensions: Dict[str, Optional[str]], + matched_stage: str, + ) -> SkuSelectionDecision: + return SkuSelectionDecision( + selected_sku_id=None, + rerank_suffix="", + 
selected_text="", + matched_stage=matched_stage, + resolved_dimensions=dict(resolved_dimensions), + ) + + def _is_text_match( self, - candidate: _SkuCandidate, - style_profile: StyleIntentProfile, - tokenizer: Any, + intent_type: str, + value: str, + selection_context: _SelectionContext, ) -> bool: - if not candidate.intent_texts: + normalized_value = normalize_query_text(value) + if not normalized_value: return False - for intent_type, value in candidate.intent_texts.items(): - definition = self.registry.get_definition(intent_type) - if definition is None: - return False - matched_canonicals = definition.match_text(value, tokenizer=tokenizer) - if not matched_canonicals.intersection(style_profile.get_canonical_values(intent_type)): - return False - return True + cache_key = (intent_type, normalized_value) + cached = selection_context.text_match_cache.get(cache_key) + if cached is not None: + return cached + + matched_terms = selection_context.matched_terms_by_intent.get(intent_type, ()) + has_term_match = any(term in normalized_value for term in matched_terms if term) + query_contains_value = any( + normalized_value in query_text + for query_text in selection_context.query_texts + ) + matched = bool(has_term_match or query_contains_value) + selection_context.text_match_cache[cache_key] = matched + return matched + + def _find_first_text_match( + self, + candidates: Sequence[_SkuCandidate], + selection_context: _SelectionContext, + ) -> Optional[_SkuCandidate]: + for candidate in candidates: + if candidate.intent_values and all( + self._is_text_match(intent_type, value, selection_context) + for intent_type, value in candidate.intent_values.items() + ): + return candidate + return None def _select_by_embedding( self, candidates: Sequence[_SkuCandidate], - query_vector: Optional[np.ndarray], + selection_context: _SelectionContext, ) -> Tuple[Optional[_SkuCandidate], Optional[float]]: if not candidates: return None, None text_encoder = self._get_text_encoder() - if 
query_vector is None or text_encoder is None: - return candidates[0], None + if selection_context.query_vector is None or text_encoder is None: + return None, None unique_texts = list( dict.fromkeys( - normalize_query_text(candidate.selection_text) + candidate.normalized_selection_text for candidate in candidates - if normalize_query_text(candidate.selection_text) + if candidate.normalized_selection_text + and candidate.normalized_selection_text not in selection_context.selection_vector_cache ) ) - if not unique_texts: - return candidates[0], None - - vectors = text_encoder.encode(unique_texts, priority=1) - vector_map: Dict[str, np.ndarray] = {} - for key, vector in zip(unique_texts, vectors): - if vector is None: - continue - vector_map[key] = np.asarray(vector, dtype=np.float32) + if unique_texts: + vectors = text_encoder.encode(unique_texts, priority=1) + for key, vector in zip(unique_texts, vectors): + selection_context.selection_vector_cache[key] = ( + np.asarray(vector, dtype=np.float32) if vector is not None else None + ) best_candidate: Optional[_SkuCandidate] = None best_score: Optional[float] = None - query_vector_array = np.asarray(query_vector, dtype=np.float32) + query_vector_array = np.asarray(selection_context.query_vector, dtype=np.float32) for candidate in candidates: - normalized_text = normalize_query_text(candidate.selection_text) - candidate_vector = vector_map.get(normalized_text) - if candidate_vector is None: + normalized_text = candidate.normalized_selection_text + if not normalized_text: + continue + + score = selection_context.similarity_cache.get(normalized_text) + if score is None: + candidate_vector = selection_context.selection_vector_cache.get(normalized_text) + if candidate_vector is None: + selection_context.similarity_cache[normalized_text] = None + continue + score = float(np.inner(query_vector_array, candidate_vector)) + selection_context.similarity_cache[normalized_text] = score + + if score is None: continue - score = 
float(np.inner(query_vector_array, candidate_vector)) if best_score is None or score > best_score: best_candidate = candidate best_score = score - return best_candidate or candidates[0], best_score + return best_candidate, best_score def _select_for_source( self, source: Dict[str, Any], *, style_profile: StyleIntentProfile, - query_texts: Sequence[str], - query_vector: Optional[np.ndarray], - tokenizer: Any, + selection_context: _SelectionContext, ) -> Optional[SkuSelectionDecision]: skus = source.get("skus") if not isinstance(skus, list) or not skus: return None resolved_dimensions = self._resolve_dimensions(source, style_profile) + if not resolved_dimensions or any(not field_name for field_name in resolved_dimensions.values()): + return self._empty_decision(resolved_dimensions, matched_stage="unresolved") + candidates = self._build_candidates(skus, resolved_dimensions) if not candidates: - return None + return self._empty_decision(resolved_dimensions, matched_stage="no_candidates") - direct_matches = [candidate for candidate in candidates if self._is_direct_match(candidate, query_texts)] - if len(direct_matches) == 1: - chosen = direct_matches[0] - return self._build_decision(chosen, resolved_dimensions, matched_stage="direct") + text_match = self._find_first_text_match(candidates, selection_context) + if text_match is not None: + return self._build_decision(text_match, resolved_dimensions, matched_stage="text") - generalized_matches: List[_SkuCandidate] = [] - if not direct_matches: - generalized_matches = [ - candidate - for candidate in candidates - if self._is_generalized_match(candidate, style_profile, tokenizer) - ] - if len(generalized_matches) == 1: - chosen = generalized_matches[0] - return self._build_decision(chosen, resolved_dimensions, matched_stage="generalized") - - embedding_pool = direct_matches or generalized_matches or candidates - chosen, similarity_score = self._select_by_embedding(embedding_pool, query_vector) + chosen, similarity_score = 
self._select_by_embedding(candidates, selection_context) if chosen is None: - return None - stage = "embedding_from_matches" if direct_matches or generalized_matches else "embedding_from_all" + return self._empty_decision(resolved_dimensions, matched_stage="no_match") return self._build_decision( chosen, resolved_dimensions, - matched_stage=stage, + matched_stage="embedding", similarity_score=similarity_score, ) diff --git a/tests/test_search_rerank_window.py b/tests/test_search_rerank_window.py index cfb798e..2ca38f0 100644 --- a/tests/test_search_rerank_window.py +++ b/tests/test_search_rerank_window.py @@ -30,7 +30,6 @@ class _FakeParsedQuery: detected_language: str = "en" translations: Dict[str, str] = None query_vector: Any = None - domain: str = "default" style_intent_profile: Any = None def to_dict(self) -> Dict[str, Any]: @@ -40,7 +39,6 @@ class _FakeParsedQuery: "rewritten_query": self.rewritten_query, "detected_language": self.detected_language, "translations": self.translations or {}, - "domain": self.domain, "style_intent_profile": ( self.style_intent_profile.to_dict() if self.style_intent_profile is not None else None ), @@ -542,6 +540,137 @@ def test_searcher_promotes_sku_when_option1_matches_translated_query(monkeypatch assert result.results[0].image_url == "https://img/black.jpg" +def test_searcher_uses_first_text_match_without_comparing_all_matches(monkeypatch): + es_client = _FakeESClient(total_hits=1) + searcher = _build_searcher(_build_search_config(rerank_enabled=False), es_client) + context = create_request_context(reqid="sku-first-text", uid="u-sku-first-text") + + monkeypatch.setattr( + "search.searcher.get_tenant_config_loader", + lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en"]}), + ) + + class _TextMatchQueryParser: + text_encoder = None + + def parse( + self, + query: str, + tenant_id: str, + generate_vector: bool, + context: Any, + target_languages: Any = None, + ): + return _FakeParsedQuery( + 
original_query=query, + query_normalized=query, + rewritten_query=query, + translations={}, + style_intent_profile=_build_style_intent_profile( + "color", "black", "color", "colors", "颜色" + ), + ) + + searcher.query_parser = _TextMatchQueryParser() + + def _full_source_with_multiple_text_matches(doc_id: str) -> Dict[str, Any]: + return { + "spu_id": doc_id, + "title": {"en": f"product-{doc_id}"}, + "brief": {"en": f"brief-{doc_id}"}, + "vendor": {"en": f"vendor-{doc_id}"}, + "option1_name": "Color", + "image_url": "https://img/default.jpg", + "skus": [ + {"sku_id": "sku-red", "option1_value": "Red", "image_src": "https://img/red.jpg"}, + { + "sku_id": "sku-gloss-black", + "option1_value": "Gloss Black", + "image_src": "https://img/gloss-black.jpg", + }, + {"sku_id": "sku-black", "option1_value": "Black", "image_src": "https://img/black.jpg"}, + ], + } + + monkeypatch.setattr(_FakeESClient, "_full_source", staticmethod(_full_source_with_multiple_text_matches)) + + result = searcher.search( + query="black dress", + tenant_id="162", + from_=0, + size=1, + context=context, + enable_rerank=False, + ) + + assert len(result.results) == 1 + assert result.results[0].skus[0].sku_id == "sku-gloss-black" + assert result.results[0].image_url == "https://img/gloss-black.jpg" + + +def test_searcher_skips_sku_selection_when_option_name_does_not_match_dimension_alias(monkeypatch): + es_client = _FakeESClient(total_hits=1) + searcher = _build_searcher(_build_search_config(rerank_enabled=False), es_client) + context = create_request_context(reqid="sku-unresolved-dimension", uid="u-sku-unresolved-dimension") + + monkeypatch.setattr( + "search.searcher.get_tenant_config_loader", + lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en", "zh"]}), + ) + + class _UnresolvedDimensionQueryParser: + text_encoder = None + + def parse( + self, + query: str, + tenant_id: str, + generate_vector: bool, + context: Any, + target_languages: Any = None, + ): + return 
_FakeParsedQuery( + original_query=query, + query_normalized=query, + rewritten_query=query, + translations={"en": "black dress"}, + style_intent_profile=_build_style_intent_profile( + "color", "black", "color", "colors", "颜色" + ), + ) + + searcher.query_parser = _UnresolvedDimensionQueryParser() + + def _full_source_with_unmatched_option_name(doc_id: str) -> Dict[str, Any]: + return { + "spu_id": doc_id, + "title": {"en": f"product-{doc_id}"}, + "brief": {"en": f"brief-{doc_id}"}, + "vendor": {"en": f"vendor-{doc_id}"}, + "option1_name": "Tone", + "image_url": "https://img/default.jpg", + "skus": [ + {"sku_id": "sku-red", "option1_value": "Red", "image_src": "https://img/red.jpg"}, + {"sku_id": "sku-black", "option1_value": "Black", "image_src": "https://img/black.jpg"}, + ], + } + + monkeypatch.setattr(_FakeESClient, "_full_source", staticmethod(_full_source_with_unmatched_option_name)) + + result = searcher.search( + query="黑色 连衣裙", + tenant_id="162", + from_=0, + size=1, + context=context, + enable_rerank=False, + ) + + assert len(result.results) == 1 + assert result.results[0].skus[0].sku_id == "sku-red" + assert result.results[0].image_url == "https://img/default.jpg" + + def test_searcher_promotes_sku_by_embedding_when_query_has_no_direct_option_match(monkeypatch): es_client = _FakeESClient(total_hits=1) searcher = _build_searcher(_build_search_config(rerank_enabled=False), es_client) -- libgit2 0.21.2