Commit 2efad04b21072915abc94097289cee2cb1287888
1 parent
6075aa91
意图匹配的性能优化:
上面一版实现,性能上完全无法接受。因此进行了一轮策略简化。 style_sku_prepare_hits阶段耗时太长。请根据需求,思考优化的方法,给出性能优化的方案。 1. _select_by_embedding,有缓存吗,option_value的值是有限的,之前已经算过的,就不用再算了。不仅仅是embedding相似的结果,整个option_value的匹配结果,是有包含、还是没包含,相似度多少,都不用重新计算。比如之前已经有一个sku的某个属性值叫做“卡其色”,已经算出来是否文本匹配了,那么不需要再去做文本匹配。如果已经算出来向量的相似度,那么不需要再去取向量以及计算相似度。 2. 匹配可以适当的优化: 匹配流程简化: 1)找到第一个文本匹配的,如果有直接匹配成功。不需要考虑匹配多个的情况。 2)如果全部都没有匹配,那么进行embedding筛选。 匹配规则: option_name的匹配,直接看规范化后的option_name是不是意图维度的泛化词之一(比如颜色、color、colour),如果没有匹配的,现在应该是把所有维度都算上,这样匹配成本和比较成本太高了,去掉这些逻辑,这种情况不需要加后缀、不需要选择sku。 option_value的匹配。意图检测的时候,有匹配的query中的命中的词,这个词被包含在属性值中,那么就算匹配。属性值被包含在query(包括翻译文本)中,也算匹配。提高匹配的覆盖率。 3. 这一阶段得到sku选择的结果即可(选中的sku的id,也可以为空值表示没找到匹配成功的,这种情况不需要拼接title后缀给重排输入),但是不用着急做image_url的替换和sku的置顶。等最后填充的时候判断有选中sku的时候直接做替换和置顶即可。 请你思考如何进行设计,提高性能的时候不带来复杂度的提升,可以适当的重构以降低修改后的代码行数。 @search/sku_intent_selector.py @query/style_intent.py
Showing
6 changed files
with
360 additions
and
117 deletions
Show diff stats
context/request_context.py
| @@ -42,7 +42,6 @@ class QueryAnalysisResult: | @@ -42,7 +42,6 @@ class QueryAnalysisResult: | ||
| 42 | query_vector: Optional[List[float]] = None | 42 | query_vector: Optional[List[float]] = None |
| 43 | boolean_ast: Optional[str] = None | 43 | boolean_ast: Optional[str] = None |
| 44 | is_simple_query: bool = True | 44 | is_simple_query: bool = True |
| 45 | - domain: str = "default" | ||
| 46 | 45 | ||
| 47 | 46 | ||
| 48 | @dataclass | 47 | @dataclass |
| @@ -281,7 +280,6 @@ class RequestContext: | @@ -281,7 +280,6 @@ class RequestContext: | ||
| 281 | 'query_normalized': self.query_analysis.query_normalized, | 280 | 'query_normalized': self.query_analysis.query_normalized, |
| 282 | 'rewritten_query': self.query_analysis.rewritten_query, | 281 | 'rewritten_query': self.query_analysis.rewritten_query, |
| 283 | 'detected_language': self.query_analysis.detected_language, | 282 | 'detected_language': self.query_analysis.detected_language, |
| 284 | - 'domain': self.query_analysis.domain, | ||
| 285 | 'is_simple_query': self.query_analysis.is_simple_query | 283 | 'is_simple_query': self.query_analysis.is_simple_query |
| 286 | }, | 284 | }, |
| 287 | 'performance': { | 285 | 'performance': { |
docs/TODO-意图判断.md
| @@ -51,3 +51,49 @@ QueryParser 负责产出统一的“款式意图分析结果”,包含:命中 | @@ -51,3 +51,49 @@ QueryParser 负责产出统一的“款式意图分析结果”,包含:命中 | ||
| 51 | Searcher 只编排:解析 query -> 首次 ES 召回 -> 若有款式意图则对 rerank 窗口内全部 hits 做 SKU 预筛选并补充 rerank doc 文本 -> run_rerank -> 分页 -> page fill -> ResultFormatter。这样旧的页内 SKU 置顶逻辑可以下沉或删除,避免前后两个阶段重复做同一件事。我再补看一下你最近写的意图草稿文档,确认有没有现成词表示例或边界条件,避免我这边把词表格式和你预期做偏。 | 51 | Searcher 只编排:解析 query -> 首次 ES 召回 -> 若有款式意图则对 rerank 窗口内全部 hits 做 SKU 预筛选并补充 rerank doc 文本 -> run_rerank -> 分页 -> page fill -> ResultFormatter。这样旧的页内 SKU 置顶逻辑可以下沉或删除,避免前后两个阶段重复做同一件事。我再补看一下你最近写的意图草稿文档,确认有没有现成词表示例或边界条件,避免我这边把词表格式和你预期做偏。 |
| 52 | 52 | ||
| 53 | 53 | ||
| 54 | + | ||
| 55 | + | ||
| 56 | + | ||
| 57 | +意图匹配的性能优化 | ||
| 58 | +上面一版实现,性能上完全无法接受。因此进行了一轮策略简化 | ||
| 59 | + | ||
| 60 | +prompt | ||
| 61 | +style_sku_prepare_hits阶段耗时太长。请根据需求,思考优化的方法,给出性能优化的方案。 | ||
| 62 | +1. _select_by_embedding,有缓存吗,option_value的值是有限的,之前已经算过的,就不用再算了。不仅仅是embedding相似的结果,整个option_value的匹配结果,是有包含、还是没包含,相似度多少,都不用重新计算。比如之前已经有一个sku的某个属性值叫做“卡其色”,已经算出来是否文本匹配了,那么不需要再去做文本匹配。如果已经算出来向量的相似度,那么不需要再去取向量以及计算相似度。 | ||
| 63 | +2. 匹配可以适当的优化 | ||
| 64 | +匹配流程简化: | ||
| 65 | +1)找到第一个文本匹配的,如果有直接匹配成功。不需要考虑匹配多个的情况。 | ||
| 66 | +2)如果全部都没有匹配,那么进行embedding筛选。 | ||
| 67 | + | ||
| 68 | +匹配规则 | ||
| 69 | +option_name的匹配,直接看规范化后的option_name是不是意图维度的泛化词之一(比如颜色、color、colour),如果没有匹配的,现在应该是把所有维度都算上,这样匹配成本和比较成本太高了,去掉这些逻辑,这种情况不需要加后缀、不需要选择sku。 | ||
| 70 | +option_value的匹配。意图检测的时候,有匹配的query中的命中的词,这个词被包含在属性值中,那么就算匹配。属性值被包含在query(包括翻译文本)中,也算匹配。提高匹配的覆盖率 | ||
| 71 | + | ||
| 72 | +3. 这一阶段得到sku选择的结果即可(选中的sku的id,也可以为空值表示没找到匹配成功的,这种情况不需要拼接title后缀给重排输入),但是不用着急做image_url的替换和sku的置顶。等最后填充的时候判断有选中sku的时候直接做替换和置顶即可。 | ||
| 73 | +请你思考如何进行设计,提高性能的时候不带来复杂度的提升,可以适当的重构以降低修改后的代码行数。 | ||
| 74 | +@search/sku_intent_selector.py @query/style_intent.py | ||
| 75 | + | ||
| 76 | + | ||
| 77 | +修改结果 | ||
| 78 | +核心变化在 `search/sku_intent_selector.py` 和 `search/searcher.py`。 | ||
| 79 | + | ||
| 80 | +现在的策略是: | ||
| 81 | +- `option_name` 只做规范化后 alias 精确匹配;任何意图维度没 resolve 到字段,就直接不选 SKU、不加 suffix。 | ||
| 82 | +- `option_value` 文本匹配改成顺序扫描 SKU,找到第一个命中的就直接返回,不再收集多个候选再比较。 | ||
| 83 | +- 文本匹配规则支持两种: | ||
| 84 | + - query 命中的意图词被属性值包含 | ||
| 85 | + - 属性值被 query/translation 包含 | ||
| 86 | +- 只有当所有 SKU 都没有文本命中时,才做 embedding 选择。 | ||
| 87 | +- `prepare_hits()` 现在只产出决策和 `_style_rerank_suffix`,不再提前改 `_source`;真正的 `image_url` 替换和 SKU 置顶统一后移到 `apply_precomputed_decisions()`。 | ||
| 88 | +- 加了请求级缓存,避免同一个 `option_value`/`selection_text` 在一次请求里重复做文本判断、取向量和算相似度。 | ||
| 89 | +- 顺手删掉了旧的 generalized match / fallback_text 路径,代码比之前更短也更直。 | ||
| 90 | + | ||
| 91 | +另外补了回归测试,覆盖了: | ||
| 92 | +- 翻译文本命中后正确选 SKU | ||
| 93 | +- 多个文本命中时取第一个 | ||
| 94 | +- `option_name` 不命中 alias 时不做 SKU 选择 | ||
| 95 | +- 无文本命中时走 embedding fallback | ||
| 96 | + | ||
| 97 | +验证过: | ||
| 98 | +- `pytest tests/test_search_rerank_window.py -q` 通过 | ||
| 99 | +- 变更文件 lint 无报错 | ||
frontend/static/js/app.js
| @@ -885,6 +885,40 @@ function goToPage(page) { | @@ -885,6 +885,40 @@ function goToPage(page) { | ||
| 885 | window.scrollTo({ top: 0, behavior: 'smooth' }); | 885 | window.scrollTo({ top: 0, behavior: 'smooth' }); |
| 886 | } | 886 | } |
| 887 | 887 | ||
| 888 | +/** Query-analysis intent block: dimensions, matched surface form, canonical value, source query variant. */ | ||
| 889 | +function formatIntentDetectionHtml(intent) { | ||
| 890 | + const profile = intent || null; | ||
| 891 | + let block = '<div style="margin-top: 10px;"><strong style="font-size: 13px;">intent_detection:</strong></div>'; | ||
| 892 | + if (!profile || typeof profile !== 'object') { | ||
| 893 | + block += '<div>(no intent payload — style intent may be disabled or context missing)</div>'; | ||
| 894 | + return block; | ||
| 895 | + } | ||
| 896 | + const active = !!profile.active; | ||
| 897 | + block += `<div>active: ${active ? 'yes' : 'no'}</div>`; | ||
| 898 | + const intents = Array.isArray(profile.intents) ? profile.intents : []; | ||
| 899 | + if (!intents.length) { | ||
| 900 | + block += '<div>intents: (none — no vocabulary match on query variants)</div>'; | ||
| 901 | + return block; | ||
| 902 | + } | ||
| 903 | + block += '<div style="margin-top: 4px;">intents:</div><ul style="margin: 4px 0 8px 20px; padding: 0;">'; | ||
| 904 | + for (const it of intents) { | ||
| 905 | + const aliases = Array.isArray(it.dimension_aliases) ? it.dimension_aliases.join(', ') : ''; | ||
| 906 | + block += '<li style="margin-bottom: 6px;">'; | ||
| 907 | + block += `<div><strong>intent_type</strong>: ${escapeHtml(it.intent_type || '')}</div>`; | ||
| 908 | + block += `<div><strong>dimension_aliases</strong>: ${escapeHtml(aliases || 'N/A')}</div>`; | ||
| 909 | + block += `<div><strong>matched_term</strong>: ${escapeHtml(it.matched_term || '')}</div>`; | ||
| 910 | + block += `<div><strong>canonical_value</strong>: ${escapeHtml(it.canonical_value || '')}</div>`; | ||
| 911 | + block += `<div><strong>matched_query_text</strong>: ${escapeHtml(it.matched_query_text || '')}</div>`; | ||
| 912 | + block += '</li>'; | ||
| 913 | + } | ||
| 914 | + block += '</ul>'; | ||
| 915 | + if (Array.isArray(profile.query_variants) && profile.query_variants.length > 0) { | ||
| 916 | + block += '<div style="margin-top: 6px;"><strong>query_variants</strong>:</div>'; | ||
| 917 | + block += `<pre style="background: #f5f5f5; padding: 8px; overflow: auto; max-height: 200px; margin-top: 4px;">${escapeHtml(customStringify(profile.query_variants))}</pre>`; | ||
| 918 | + } | ||
| 919 | + return block; | ||
| 920 | +} | ||
| 921 | + | ||
| 888 | // Display debug info | 922 | // Display debug info |
| 889 | function displayDebugInfo(data) { | 923 | function displayDebugInfo(data) { |
| 890 | const debugInfoDiv = document.getElementById('debugInfo'); | 924 | const debugInfoDiv = document.getElementById('debugInfo'); |
| @@ -916,7 +950,6 @@ function displayDebugInfo(data) { | @@ -916,7 +950,6 @@ function displayDebugInfo(data) { | ||
| 916 | html += `<div>detected_language: ${escapeHtml(debugInfo.query_analysis.detected_language || 'N/A')}</div>`; | 950 | html += `<div>detected_language: ${escapeHtml(debugInfo.query_analysis.detected_language || 'N/A')}</div>`; |
| 917 | html += `<div>index_languages: ${escapeHtml((debugInfo.query_analysis.index_languages || []).join(', ') || 'N/A')}</div>`; | 951 | html += `<div>index_languages: ${escapeHtml((debugInfo.query_analysis.index_languages || []).join(', ') || 'N/A')}</div>`; |
| 918 | html += `<div>query_tokens: ${escapeHtml((debugInfo.query_analysis.query_tokens || []).join(', ') || 'N/A')}</div>`; | 952 | html += `<div>query_tokens: ${escapeHtml((debugInfo.query_analysis.query_tokens || []).join(', ') || 'N/A')}</div>`; |
| 919 | - html += `<div>domain: ${escapeHtml(debugInfo.query_analysis.domain || 'default')}</div>`; | ||
| 920 | html += `<div>is_simple_query: ${debugInfo.query_analysis.is_simple_query ? 'yes' : 'no'}</div>`; | 953 | html += `<div>is_simple_query: ${debugInfo.query_analysis.is_simple_query ? 'yes' : 'no'}</div>`; |
| 921 | 954 | ||
| 922 | if (debugInfo.query_analysis.translations && Object.keys(debugInfo.query_analysis.translations).length > 0) { | 955 | if (debugInfo.query_analysis.translations && Object.keys(debugInfo.query_analysis.translations).length > 0) { |
| @@ -932,6 +965,9 @@ function displayDebugInfo(data) { | @@ -932,6 +965,9 @@ function displayDebugInfo(data) { | ||
| 932 | if (debugInfo.query_analysis.boolean_ast) { | 965 | if (debugInfo.query_analysis.boolean_ast) { |
| 933 | html += `<div>boolean_ast: ${escapeHtml(debugInfo.query_analysis.boolean_ast)}</div>`; | 966 | html += `<div>boolean_ast: ${escapeHtml(debugInfo.query_analysis.boolean_ast)}</div>`; |
| 934 | } | 967 | } |
| 968 | + | ||
| 969 | + const intentPayload = debugInfo.query_analysis.intent_detection ?? debugInfo.query_analysis.style_intent_profile; | ||
| 970 | + html += formatIntentDetectionHtml(intentPayload); | ||
| 935 | 971 | ||
| 936 | html += '</div>'; | 972 | html += '</div>'; |
| 937 | } | 973 | } |
| @@ -942,6 +978,12 @@ function displayDebugInfo(data) { | @@ -942,6 +978,12 @@ function displayDebugInfo(data) { | ||
| 942 | html += `<div>translation_enabled: ${debugInfo.feature_flags.translation_enabled ? 'enabled' : 'disabled'}</div>`; | 978 | html += `<div>translation_enabled: ${debugInfo.feature_flags.translation_enabled ? 'enabled' : 'disabled'}</div>`; |
| 943 | html += `<div>embedding_enabled: ${debugInfo.feature_flags.embedding_enabled ? 'enabled' : 'disabled'}</div>`; | 979 | html += `<div>embedding_enabled: ${debugInfo.feature_flags.embedding_enabled ? 'enabled' : 'disabled'}</div>`; |
| 944 | html += `<div>rerank_enabled: ${debugInfo.feature_flags.rerank_enabled ? 'enabled' : 'disabled'}</div>`; | 980 | html += `<div>rerank_enabled: ${debugInfo.feature_flags.rerank_enabled ? 'enabled' : 'disabled'}</div>`; |
| 981 | + if (debugInfo.feature_flags.style_intent_enabled !== undefined) { | ||
| 982 | + html += `<div>style_intent_enabled: ${debugInfo.feature_flags.style_intent_enabled ? 'enabled' : 'disabled'}</div>`; | ||
| 983 | + } | ||
| 984 | + if (debugInfo.feature_flags.style_intent_active !== undefined) { | ||
| 985 | + html += `<div>style_intent_active: ${debugInfo.feature_flags.style_intent_active ? 'yes' : 'no'}</div>`; | ||
| 986 | + } | ||
| 945 | html += '</div>'; | 987 | html += '</div>'; |
| 946 | } | 988 | } |
| 947 | 989 |
search/searcher.py
| @@ -119,7 +119,6 @@ class Searcher: | @@ -119,7 +119,6 @@ class Searcher: | ||
| 119 | self.style_sku_selector = StyleSkuSelector( | 119 | self.style_sku_selector = StyleSkuSelector( |
| 120 | self.style_intent_registry, | 120 | self.style_intent_registry, |
| 121 | text_encoder_getter=lambda: getattr(self.query_parser, "text_encoder", None), | 121 | text_encoder_getter=lambda: getattr(self.query_parser, "text_encoder", None), |
| 122 | - tokenizer_getter=lambda: getattr(self.query_parser, "_tokenizer", None), | ||
| 123 | ) | 122 | ) |
| 124 | 123 | ||
| 125 | # Query builder - simplified single-layer architecture | 124 | # Query builder - simplified single-layer architecture |
| @@ -397,7 +396,6 @@ class Searcher: | @@ -397,7 +396,6 @@ class Searcher: | ||
| 397 | detected_language=parsed_query.detected_language, | 396 | detected_language=parsed_query.detected_language, |
| 398 | translations=parsed_query.translations, | 397 | translations=parsed_query.translations, |
| 399 | query_vector=parsed_query.query_vector.tolist() if parsed_query.query_vector is not None else None, | 398 | query_vector=parsed_query.query_vector.tolist() if parsed_query.query_vector is not None else None, |
| 400 | - domain="default", | ||
| 401 | is_simple_query=True | 399 | is_simple_query=True |
| 402 | ) | 400 | ) |
| 403 | context.metadata["feature_flags"]["style_intent_active"] = self._has_style_intent(parsed_query) | 401 | context.metadata["feature_flags"]["style_intent_active"] = self._has_style_intent(parsed_query) |
| @@ -732,7 +730,7 @@ class Searcher: | @@ -732,7 +730,7 @@ class Searcher: | ||
| 732 | rerank_debug_by_doc[str(doc_id)] = item | 730 | rerank_debug_by_doc[str(doc_id)] = item |
| 733 | 731 | ||
| 734 | if self._has_style_intent(parsed_query): | 732 | if self._has_style_intent(parsed_query): |
| 735 | - if in_rerank_window and style_intent_decisions: | 733 | + if style_intent_decisions: |
| 736 | self.style_sku_selector.apply_precomputed_decisions( | 734 | self.style_sku_selector.apply_precomputed_decisions( |
| 737 | es_hits, | 735 | es_hits, |
| 738 | style_intent_decisions, | 736 | style_intent_decisions, |
| @@ -867,8 +865,7 @@ class Searcher: | @@ -867,8 +865,7 @@ class Searcher: | ||
| 867 | "has_vector": context.query_analysis.query_vector is not None, | 865 | "has_vector": context.query_analysis.query_vector is not None, |
| 868 | "query_tokens": getattr(parsed_query, "query_tokens", []), | 866 | "query_tokens": getattr(parsed_query, "query_tokens", []), |
| 869 | "is_simple_query": context.query_analysis.is_simple_query, | 867 | "is_simple_query": context.query_analysis.is_simple_query, |
| 870 | - "domain": context.query_analysis.domain, | ||
| 871 | - "style_intent_profile": context.get_intermediate_result("style_intent_profile"), | 868 | + "intent_detection": context.get_intermediate_result("style_intent_profile"), |
| 872 | }, | 869 | }, |
| 873 | "es_query": context.get_intermediate_result('es_query', {}), | 870 | "es_query": context.get_intermediate_result('es_query', {}), |
| 874 | "es_query_context": { | 871 | "es_query_context": { |
search/sku_intent_selector.py
| @@ -5,7 +5,7 @@ SKU selection for style-intent-aware search results. | @@ -5,7 +5,7 @@ SKU selection for style-intent-aware search results. | ||
| 5 | from __future__ import annotations | 5 | from __future__ import annotations |
| 6 | 6 | ||
| 7 | from dataclasses import dataclass, field | 7 | from dataclasses import dataclass, field |
| 8 | -from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Tuple | 8 | +from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple |
| 9 | 9 | ||
| 10 | import numpy as np | 10 | import numpy as np |
| 11 | 11 | ||
| @@ -39,7 +39,18 @@ class _SkuCandidate: | @@ -39,7 +39,18 @@ class _SkuCandidate: | ||
| 39 | sku_id: str | 39 | sku_id: str |
| 40 | sku: Dict[str, Any] | 40 | sku: Dict[str, Any] |
| 41 | selection_text: str | 41 | selection_text: str |
| 42 | - intent_texts: Dict[str, str] | 42 | + normalized_selection_text: str |
| 43 | + intent_values: Dict[str, str] | ||
| 44 | + | ||
| 45 | + | ||
| 46 | +@dataclass | ||
| 47 | +class _SelectionContext: | ||
| 48 | + query_texts: Tuple[str, ...] | ||
| 49 | + matched_terms_by_intent: Dict[str, Tuple[str, ...]] | ||
| 50 | + query_vector: Optional[np.ndarray] | ||
| 51 | + text_match_cache: Dict[Tuple[str, str], bool] = field(default_factory=dict) | ||
| 52 | + selection_vector_cache: Dict[str, Optional[np.ndarray]] = field(default_factory=dict) | ||
| 53 | + similarity_cache: Dict[str, Optional[float]] = field(default_factory=dict) | ||
| 43 | 54 | ||
| 44 | 55 | ||
| 45 | class StyleSkuSelector: | 56 | class StyleSkuSelector: |
| @@ -50,11 +61,9 @@ class StyleSkuSelector: | @@ -50,11 +61,9 @@ class StyleSkuSelector: | ||
| 50 | registry: StyleIntentRegistry, | 61 | registry: StyleIntentRegistry, |
| 51 | *, | 62 | *, |
| 52 | text_encoder_getter: Optional[Callable[[], Any]] = None, | 63 | text_encoder_getter: Optional[Callable[[], Any]] = None, |
| 53 | - tokenizer_getter: Optional[Callable[[], Any]] = None, | ||
| 54 | ) -> None: | 64 | ) -> None: |
| 55 | self.registry = registry | 65 | self.registry = registry |
| 56 | self._text_encoder_getter = text_encoder_getter | 66 | self._text_encoder_getter = text_encoder_getter |
| 57 | - self._tokenizer_getter = tokenizer_getter | ||
| 58 | 67 | ||
| 59 | def prepare_hits( | 68 | def prepare_hits( |
| 60 | self, | 69 | self, |
| @@ -66,9 +75,7 @@ class StyleSkuSelector: | @@ -66,9 +75,7 @@ class StyleSkuSelector: | ||
| 66 | if not isinstance(style_profile, StyleIntentProfile) or not style_profile.is_active: | 75 | if not isinstance(style_profile, StyleIntentProfile) or not style_profile.is_active: |
| 67 | return decisions | 76 | return decisions |
| 68 | 77 | ||
| 69 | - query_texts = self._build_query_texts(parsed_query, style_profile) | ||
| 70 | - query_vector = self._get_query_vector(parsed_query) | ||
| 71 | - tokenizer = self._get_tokenizer() | 78 | + selection_context = self._build_selection_context(parsed_query, style_profile) |
| 72 | 79 | ||
| 73 | for hit in es_hits: | 80 | for hit in es_hits: |
| 74 | source = hit.get("_source") | 81 | source = hit.get("_source") |
| @@ -78,16 +85,15 @@ class StyleSkuSelector: | @@ -78,16 +85,15 @@ class StyleSkuSelector: | ||
| 78 | decision = self._select_for_source( | 85 | decision = self._select_for_source( |
| 79 | source, | 86 | source, |
| 80 | style_profile=style_profile, | 87 | style_profile=style_profile, |
| 81 | - query_texts=query_texts, | ||
| 82 | - query_vector=query_vector, | ||
| 83 | - tokenizer=tokenizer, | 88 | + selection_context=selection_context, |
| 84 | ) | 89 | ) |
| 85 | if decision is None: | 90 | if decision is None: |
| 86 | continue | 91 | continue |
| 87 | 92 | ||
| 88 | - self._apply_decision_to_source(source, decision) | ||
| 89 | if decision.rerank_suffix: | 93 | if decision.rerank_suffix: |
| 90 | hit["_style_rerank_suffix"] = decision.rerank_suffix | 94 | hit["_style_rerank_suffix"] = decision.rerank_suffix |
| 95 | + else: | ||
| 96 | + hit.pop("_style_rerank_suffix", None) | ||
| 91 | 97 | ||
| 92 | doc_id = hit.get("_id") | 98 | doc_id = hit.get("_id") |
| 93 | if doc_id is not None: | 99 | if doc_id is not None: |
| @@ -116,6 +122,8 @@ class StyleSkuSelector: | @@ -116,6 +122,8 @@ class StyleSkuSelector: | ||
| 116 | self._apply_decision_to_source(source, decision) | 122 | self._apply_decision_to_source(source, decision) |
| 117 | if decision.rerank_suffix: | 123 | if decision.rerank_suffix: |
| 118 | hit["_style_rerank_suffix"] = decision.rerank_suffix | 124 | hit["_style_rerank_suffix"] = decision.rerank_suffix |
| 125 | + else: | ||
| 126 | + hit.pop("_style_rerank_suffix", None) | ||
| 119 | 127 | ||
| 120 | def _build_query_texts( | 128 | def _build_query_texts( |
| 121 | self, | 129 | self, |
| @@ -165,25 +173,34 @@ class StyleSkuSelector: | @@ -165,25 +173,34 @@ class StyleSkuSelector: | ||
| 165 | return None | 173 | return None |
| 166 | return np.asarray(vectors[0], dtype=np.float32) | 174 | return np.asarray(vectors[0], dtype=np.float32) |
| 167 | 175 | ||
| 176 | + def _build_selection_context( | ||
| 177 | + self, | ||
| 178 | + parsed_query: Any, | ||
| 179 | + style_profile: StyleIntentProfile, | ||
| 180 | + ) -> _SelectionContext: | ||
| 181 | + matched_terms_by_intent: Dict[str, List[str]] = {} | ||
| 182 | + for intent in style_profile.intents: | ||
| 183 | + normalized_term = normalize_query_text(intent.matched_term) | ||
| 184 | + if not normalized_term: | ||
| 185 | + continue | ||
| 186 | + matched_terms = matched_terms_by_intent.setdefault(intent.intent_type, []) | ||
| 187 | + if normalized_term not in matched_terms: | ||
| 188 | + matched_terms.append(normalized_term) | ||
| 189 | + | ||
| 190 | + return _SelectionContext( | ||
| 191 | + query_texts=tuple(self._build_query_texts(parsed_query, style_profile)), | ||
| 192 | + matched_terms_by_intent={ | ||
| 193 | + intent_type: tuple(terms) | ||
| 194 | + for intent_type, terms in matched_terms_by_intent.items() | ||
| 195 | + }, | ||
| 196 | + query_vector=self._get_query_vector(parsed_query), | ||
| 197 | + ) | ||
| 198 | + | ||
| 168 | def _get_text_encoder(self) -> Any: | 199 | def _get_text_encoder(self) -> Any: |
| 169 | if self._text_encoder_getter is None: | 200 | if self._text_encoder_getter is None: |
| 170 | return None | 201 | return None |
| 171 | return self._text_encoder_getter() | 202 | return self._text_encoder_getter() |
| 172 | 203 | ||
| 173 | - def _get_tokenizer(self) -> Any: | ||
| 174 | - if self._tokenizer_getter is None: | ||
| 175 | - return None | ||
| 176 | - return self._tokenizer_getter() | ||
| 177 | - | ||
| 178 | - @staticmethod | ||
| 179 | - def _fallback_sku_text(sku: Dict[str, Any]) -> str: | ||
| 180 | - parts = [] | ||
| 181 | - for field_name in ("option1_value", "option2_value", "option3_value"): | ||
| 182 | - value = str(sku.get(field_name) or "").strip() | ||
| 183 | - if value: | ||
| 184 | - parts.append(value) | ||
| 185 | - return " ".join(parts) | ||
| 186 | - | ||
| 187 | def _resolve_dimensions( | 204 | def _resolve_dimensions( |
| 188 | self, | 205 | self, |
| 189 | source: Dict[str, Any], | 206 | source: Dict[str, Any], |
| @@ -212,157 +229,171 @@ class StyleSkuSelector: | @@ -212,157 +229,171 @@ class StyleSkuSelector: | ||
| 212 | skus: List[Dict[str, Any]], | 229 | skus: List[Dict[str, Any]], |
| 213 | resolved_dimensions: Dict[str, Optional[str]], | 230 | resolved_dimensions: Dict[str, Optional[str]], |
| 214 | ) -> List[_SkuCandidate]: | 231 | ) -> List[_SkuCandidate]: |
| 232 | + if not resolved_dimensions or any(not field_name for field_name in resolved_dimensions.values()): | ||
| 233 | + return [] | ||
| 234 | + | ||
| 215 | candidates: List[_SkuCandidate] = [] | 235 | candidates: List[_SkuCandidate] = [] |
| 216 | for index, sku in enumerate(skus): | 236 | for index, sku in enumerate(skus): |
| 217 | - fallback_text = self._fallback_sku_text(sku) | ||
| 218 | - intent_texts: Dict[str, str] = {} | 237 | + intent_values: Dict[str, str] = {} |
| 219 | for intent_type, field_name in resolved_dimensions.items(): | 238 | for intent_type, field_name in resolved_dimensions.items(): |
| 220 | - if field_name: | ||
| 221 | - value = str(sku.get(field_name) or "").strip() | ||
| 222 | - intent_texts[intent_type] = value or fallback_text | ||
| 223 | - else: | ||
| 224 | - intent_texts[intent_type] = fallback_text | 239 | + if not field_name: |
| 240 | + continue | ||
| 241 | + intent_values[intent_type] = str(sku.get(field_name) or "").strip() | ||
| 225 | 242 | ||
| 226 | selection_parts: List[str] = [] | 243 | selection_parts: List[str] = [] |
| 227 | seen = set() | 244 | seen = set() |
| 228 | - for value in intent_texts.values(): | 245 | + for value in intent_values.values(): |
| 229 | normalized = normalize_query_text(value) | 246 | normalized = normalize_query_text(value) |
| 230 | if not normalized or normalized in seen: | 247 | if not normalized or normalized in seen: |
| 231 | continue | 248 | continue |
| 232 | seen.add(normalized) | 249 | seen.add(normalized) |
| 233 | - selection_parts.append(str(value).strip()) | 250 | + selection_parts.append(value) |
| 234 | 251 | ||
| 235 | - selection_text = " ".join(selection_parts).strip() or fallback_text | 252 | + selection_text = " ".join(selection_parts).strip() |
| 236 | candidates.append( | 253 | candidates.append( |
| 237 | _SkuCandidate( | 254 | _SkuCandidate( |
| 238 | index=index, | 255 | index=index, |
| 239 | sku_id=str(sku.get("sku_id") or ""), | 256 | sku_id=str(sku.get("sku_id") or ""), |
| 240 | sku=sku, | 257 | sku=sku, |
| 241 | selection_text=selection_text, | 258 | selection_text=selection_text, |
| 242 | - intent_texts=intent_texts, | 259 | + normalized_selection_text=normalize_query_text(selection_text), |
| 260 | + intent_values=intent_values, | ||
| 243 | ) | 261 | ) |
| 244 | ) | 262 | ) |
| 245 | return candidates | 263 | return candidates |
| 246 | 264 | ||
| 247 | @staticmethod | 265 | @staticmethod |
| 248 | - def _is_direct_match( | ||
| 249 | - candidate: _SkuCandidate, | ||
| 250 | - query_texts: Sequence[str], | ||
| 251 | - ) -> bool: | ||
| 252 | - if not candidate.intent_texts or not query_texts: | ||
| 253 | - return False | ||
| 254 | - for value in candidate.intent_texts.values(): | ||
| 255 | - normalized_value = normalize_query_text(value) | ||
| 256 | - if not normalized_value: | ||
| 257 | - return False | ||
| 258 | - if not any(normalized_value in query_text for query_text in query_texts): | ||
| 259 | - return False | ||
| 260 | - return True | ||
| 261 | - | ||
| 262 | - def _is_generalized_match( | 266 | + def _empty_decision( |
| 267 | + resolved_dimensions: Dict[str, Optional[str]], | ||
| 268 | + matched_stage: str, | ||
| 269 | + ) -> SkuSelectionDecision: | ||
| 270 | + return SkuSelectionDecision( | ||
| 271 | + selected_sku_id=None, | ||
| 272 | + rerank_suffix="", | ||
| 273 | + selected_text="", | ||
| 274 | + matched_stage=matched_stage, | ||
| 275 | + resolved_dimensions=dict(resolved_dimensions), | ||
| 276 | + ) | ||
| 277 | + | ||
| 278 | + def _is_text_match( | ||
| 263 | self, | 279 | self, |
| 264 | - candidate: _SkuCandidate, | ||
| 265 | - style_profile: StyleIntentProfile, | ||
| 266 | - tokenizer: Any, | 280 | + intent_type: str, |
| 281 | + value: str, | ||
| 282 | + selection_context: _SelectionContext, | ||
| 267 | ) -> bool: | 283 | ) -> bool: |
| 268 | - if not candidate.intent_texts: | 284 | + normalized_value = normalize_query_text(value) |
| 285 | + if not normalized_value: | ||
| 269 | return False | 286 | return False |
| 270 | 287 | ||
| 271 | - for intent_type, value in candidate.intent_texts.items(): | ||
| 272 | - definition = self.registry.get_definition(intent_type) | ||
| 273 | - if definition is None: | ||
| 274 | - return False | ||
| 275 | - matched_canonicals = definition.match_text(value, tokenizer=tokenizer) | ||
| 276 | - if not matched_canonicals.intersection(style_profile.get_canonical_values(intent_type)): | ||
| 277 | - return False | ||
| 278 | - return True | 288 | + cache_key = (intent_type, normalized_value) |
| 289 | + cached = selection_context.text_match_cache.get(cache_key) | ||
| 290 | + if cached is not None: | ||
| 291 | + return cached | ||
| 292 | + | ||
| 293 | + matched_terms = selection_context.matched_terms_by_intent.get(intent_type, ()) | ||
| 294 | + has_term_match = any(term in normalized_value for term in matched_terms if term) | ||
| 295 | + query_contains_value = any( | ||
| 296 | + normalized_value in query_text | ||
| 297 | + for query_text in selection_context.query_texts | ||
| 298 | + ) | ||
| 299 | + matched = bool(has_term_match or query_contains_value) | ||
| 300 | + selection_context.text_match_cache[cache_key] = matched | ||
| 301 | + return matched | ||
| 302 | + | ||
| 303 | + def _find_first_text_match( | ||
| 304 | + self, | ||
| 305 | + candidates: Sequence[_SkuCandidate], | ||
| 306 | + selection_context: _SelectionContext, | ||
| 307 | + ) -> Optional[_SkuCandidate]: | ||
| 308 | + for candidate in candidates: | ||
| 309 | + if candidate.intent_values and all( | ||
| 310 | + self._is_text_match(intent_type, value, selection_context) | ||
| 311 | + for intent_type, value in candidate.intent_values.items() | ||
| 312 | + ): | ||
| 313 | + return candidate | ||
| 314 | + return None | ||
| 279 | 315 | ||
| 280 | def _select_by_embedding( | 316 | def _select_by_embedding( |
| 281 | self, | 317 | self, |
| 282 | candidates: Sequence[_SkuCandidate], | 318 | candidates: Sequence[_SkuCandidate], |
| 283 | - query_vector: Optional[np.ndarray], | 319 | + selection_context: _SelectionContext, |
| 284 | ) -> Tuple[Optional[_SkuCandidate], Optional[float]]: | 320 | ) -> Tuple[Optional[_SkuCandidate], Optional[float]]: |
| 285 | if not candidates: | 321 | if not candidates: |
| 286 | return None, None | 322 | return None, None |
| 287 | text_encoder = self._get_text_encoder() | 323 | text_encoder = self._get_text_encoder() |
| 288 | - if query_vector is None or text_encoder is None: | ||
| 289 | - return candidates[0], None | 324 | + if selection_context.query_vector is None or text_encoder is None: |
| 325 | + return None, None | ||
| 290 | 326 | ||
| 291 | unique_texts = list( | 327 | unique_texts = list( |
| 292 | dict.fromkeys( | 328 | dict.fromkeys( |
| 293 | - normalize_query_text(candidate.selection_text) | 329 | + candidate.normalized_selection_text |
| 294 | for candidate in candidates | 330 | for candidate in candidates |
| 295 | - if normalize_query_text(candidate.selection_text) | 331 | + if candidate.normalized_selection_text |
| 332 | + and candidate.normalized_selection_text not in selection_context.selection_vector_cache | ||
| 296 | ) | 333 | ) |
| 297 | ) | 334 | ) |
| 298 | - if not unique_texts: | ||
| 299 | - return candidates[0], None | ||
| 300 | - | ||
| 301 | - vectors = text_encoder.encode(unique_texts, priority=1) | ||
| 302 | - vector_map: Dict[str, np.ndarray] = {} | ||
| 303 | - for key, vector in zip(unique_texts, vectors): | ||
| 304 | - if vector is None: | ||
| 305 | - continue | ||
| 306 | - vector_map[key] = np.asarray(vector, dtype=np.float32) | 335 | + if unique_texts: |
| 336 | + vectors = text_encoder.encode(unique_texts, priority=1) | ||
| 337 | + for key, vector in zip(unique_texts, vectors): | ||
| 338 | + selection_context.selection_vector_cache[key] = ( | ||
| 339 | + np.asarray(vector, dtype=np.float32) if vector is not None else None | ||
| 340 | + ) | ||
| 307 | 341 | ||
| 308 | best_candidate: Optional[_SkuCandidate] = None | 342 | best_candidate: Optional[_SkuCandidate] = None |
| 309 | best_score: Optional[float] = None | 343 | best_score: Optional[float] = None |
| 310 | - query_vector_array = np.asarray(query_vector, dtype=np.float32) | 344 | + query_vector_array = np.asarray(selection_context.query_vector, dtype=np.float32) |
| 311 | for candidate in candidates: | 345 | for candidate in candidates: |
| 312 | - normalized_text = normalize_query_text(candidate.selection_text) | ||
| 313 | - candidate_vector = vector_map.get(normalized_text) | ||
| 314 | - if candidate_vector is None: | 346 | + normalized_text = candidate.normalized_selection_text |
| 347 | + if not normalized_text: | ||
| 348 | + continue | ||
| 349 | + | ||
| 350 | + score = selection_context.similarity_cache.get(normalized_text) | ||
| 351 | + if score is None: | ||
| 352 | + candidate_vector = selection_context.selection_vector_cache.get(normalized_text) | ||
| 353 | + if candidate_vector is None: | ||
| 354 | + selection_context.similarity_cache[normalized_text] = None | ||
| 355 | + continue | ||
| 356 | + score = float(np.inner(query_vector_array, candidate_vector)) | ||
| 357 | + selection_context.similarity_cache[normalized_text] = score | ||
| 358 | + | ||
| 359 | + if score is None: | ||
| 315 | continue | 360 | continue |
| 316 | - score = float(np.inner(query_vector_array, candidate_vector)) | ||
| 317 | if best_score is None or score > best_score: | 361 | if best_score is None or score > best_score: |
| 318 | best_candidate = candidate | 362 | best_candidate = candidate |
| 319 | best_score = score | 363 | best_score = score |
| 320 | 364 | ||
| 321 | - return best_candidate or candidates[0], best_score | 365 | + return best_candidate, best_score |
| 322 | 366 | ||
| 323 | def _select_for_source( | 367 | def _select_for_source( |
| 324 | self, | 368 | self, |
| 325 | source: Dict[str, Any], | 369 | source: Dict[str, Any], |
| 326 | *, | 370 | *, |
| 327 | style_profile: StyleIntentProfile, | 371 | style_profile: StyleIntentProfile, |
| 328 | - query_texts: Sequence[str], | ||
| 329 | - query_vector: Optional[np.ndarray], | ||
| 330 | - tokenizer: Any, | 372 | + selection_context: _SelectionContext, |
| 331 | ) -> Optional[SkuSelectionDecision]: | 373 | ) -> Optional[SkuSelectionDecision]: |
| 332 | skus = source.get("skus") | 374 | skus = source.get("skus") |
| 333 | if not isinstance(skus, list) or not skus: | 375 | if not isinstance(skus, list) or not skus: |
| 334 | return None | 376 | return None |
| 335 | 377 | ||
| 336 | resolved_dimensions = self._resolve_dimensions(source, style_profile) | 378 | resolved_dimensions = self._resolve_dimensions(source, style_profile) |
| 379 | + if not resolved_dimensions or any(not field_name for field_name in resolved_dimensions.values()): | ||
| 380 | + return self._empty_decision(resolved_dimensions, matched_stage="unresolved") | ||
| 381 | + | ||
| 337 | candidates = self._build_candidates(skus, resolved_dimensions) | 382 | candidates = self._build_candidates(skus, resolved_dimensions) |
| 338 | if not candidates: | 383 | if not candidates: |
| 339 | - return None | 384 | + return self._empty_decision(resolved_dimensions, matched_stage="no_candidates") |
| 340 | 385 | ||
| 341 | - direct_matches = [candidate for candidate in candidates if self._is_direct_match(candidate, query_texts)] | ||
| 342 | - if len(direct_matches) == 1: | ||
| 343 | - chosen = direct_matches[0] | ||
| 344 | - return self._build_decision(chosen, resolved_dimensions, matched_stage="direct") | 386 | + text_match = self._find_first_text_match(candidates, selection_context) |
| 387 | + if text_match is not None: | ||
| 388 | + return self._build_decision(text_match, resolved_dimensions, matched_stage="text") | ||
| 345 | 389 | ||
| 346 | - generalized_matches: List[_SkuCandidate] = [] | ||
| 347 | - if not direct_matches: | ||
| 348 | - generalized_matches = [ | ||
| 349 | - candidate | ||
| 350 | - for candidate in candidates | ||
| 351 | - if self._is_generalized_match(candidate, style_profile, tokenizer) | ||
| 352 | - ] | ||
| 353 | - if len(generalized_matches) == 1: | ||
| 354 | - chosen = generalized_matches[0] | ||
| 355 | - return self._build_decision(chosen, resolved_dimensions, matched_stage="generalized") | ||
| 356 | - | ||
| 357 | - embedding_pool = direct_matches or generalized_matches or candidates | ||
| 358 | - chosen, similarity_score = self._select_by_embedding(embedding_pool, query_vector) | 390 | + chosen, similarity_score = self._select_by_embedding(candidates, selection_context) |
| 359 | if chosen is None: | 391 | if chosen is None: |
| 360 | - return None | ||
| 361 | - stage = "embedding_from_matches" if direct_matches or generalized_matches else "embedding_from_all" | 392 | + return self._empty_decision(resolved_dimensions, matched_stage="no_match") |
| 362 | return self._build_decision( | 393 | return self._build_decision( |
| 363 | chosen, | 394 | chosen, |
| 364 | resolved_dimensions, | 395 | resolved_dimensions, |
| 365 | - matched_stage=stage, | 396 | + matched_stage="embedding", |
| 366 | similarity_score=similarity_score, | 397 | similarity_score=similarity_score, |
| 367 | ) | 398 | ) |
| 368 | 399 |
tests/test_search_rerank_window.py
| @@ -30,7 +30,6 @@ class _FakeParsedQuery: | @@ -30,7 +30,6 @@ class _FakeParsedQuery: | ||
| 30 | detected_language: str = "en" | 30 | detected_language: str = "en" |
| 31 | translations: Dict[str, str] = None | 31 | translations: Dict[str, str] = None |
| 32 | query_vector: Any = None | 32 | query_vector: Any = None |
| 33 | - domain: str = "default" | ||
| 34 | style_intent_profile: Any = None | 33 | style_intent_profile: Any = None |
| 35 | 34 | ||
| 36 | def to_dict(self) -> Dict[str, Any]: | 35 | def to_dict(self) -> Dict[str, Any]: |
| @@ -40,7 +39,6 @@ class _FakeParsedQuery: | @@ -40,7 +39,6 @@ class _FakeParsedQuery: | ||
| 40 | "rewritten_query": self.rewritten_query, | 39 | "rewritten_query": self.rewritten_query, |
| 41 | "detected_language": self.detected_language, | 40 | "detected_language": self.detected_language, |
| 42 | "translations": self.translations or {}, | 41 | "translations": self.translations or {}, |
| 43 | - "domain": self.domain, | ||
| 44 | "style_intent_profile": ( | 42 | "style_intent_profile": ( |
| 45 | self.style_intent_profile.to_dict() if self.style_intent_profile is not None else None | 43 | self.style_intent_profile.to_dict() if self.style_intent_profile is not None else None |
| 46 | ), | 44 | ), |
| @@ -542,6 +540,137 @@ def test_searcher_promotes_sku_when_option1_matches_translated_query(monkeypatch | @@ -542,6 +540,137 @@ def test_searcher_promotes_sku_when_option1_matches_translated_query(monkeypatch | ||
| 542 | assert result.results[0].image_url == "https://img/black.jpg" | 540 | assert result.results[0].image_url == "https://img/black.jpg" |
| 543 | 541 | ||
| 544 | 542 | ||
| 543 | +def test_searcher_uses_first_text_match_without_comparing_all_matches(monkeypatch): | ||
| 544 | + es_client = _FakeESClient(total_hits=1) | ||
| 545 | + searcher = _build_searcher(_build_search_config(rerank_enabled=False), es_client) | ||
| 546 | + context = create_request_context(reqid="sku-first-text", uid="u-sku-first-text") | ||
| 547 | + | ||
| 548 | + monkeypatch.setattr( | ||
| 549 | + "search.searcher.get_tenant_config_loader", | ||
| 550 | + lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en"]}), | ||
| 551 | + ) | ||
| 552 | + | ||
| 553 | + class _TextMatchQueryParser: | ||
| 554 | + text_encoder = None | ||
| 555 | + | ||
| 556 | + def parse( | ||
| 557 | + self, | ||
| 558 | + query: str, | ||
| 559 | + tenant_id: str, | ||
| 560 | + generate_vector: bool, | ||
| 561 | + context: Any, | ||
| 562 | + target_languages: Any = None, | ||
| 563 | + ): | ||
| 564 | + return _FakeParsedQuery( | ||
| 565 | + original_query=query, | ||
| 566 | + query_normalized=query, | ||
| 567 | + rewritten_query=query, | ||
| 568 | + translations={}, | ||
| 569 | + style_intent_profile=_build_style_intent_profile( | ||
| 570 | + "color", "black", "color", "colors", "颜色" | ||
| 571 | + ), | ||
| 572 | + ) | ||
| 573 | + | ||
| 574 | + searcher.query_parser = _TextMatchQueryParser() | ||
| 575 | + | ||
| 576 | + def _full_source_with_multiple_text_matches(doc_id: str) -> Dict[str, Any]: | ||
| 577 | + return { | ||
| 578 | + "spu_id": doc_id, | ||
| 579 | + "title": {"en": f"product-{doc_id}"}, | ||
| 580 | + "brief": {"en": f"brief-{doc_id}"}, | ||
| 581 | + "vendor": {"en": f"vendor-{doc_id}"}, | ||
| 582 | + "option1_name": "Color", | ||
| 583 | + "image_url": "https://img/default.jpg", | ||
| 584 | + "skus": [ | ||
| 585 | + {"sku_id": "sku-red", "option1_value": "Red", "image_src": "https://img/red.jpg"}, | ||
| 586 | + { | ||
| 587 | + "sku_id": "sku-gloss-black", | ||
| 588 | + "option1_value": "Gloss Black", | ||
| 589 | + "image_src": "https://img/gloss-black.jpg", | ||
| 590 | + }, | ||
| 591 | + {"sku_id": "sku-black", "option1_value": "Black", "image_src": "https://img/black.jpg"}, | ||
| 592 | + ], | ||
| 593 | + } | ||
| 594 | + | ||
| 595 | + monkeypatch.setattr(_FakeESClient, "_full_source", staticmethod(_full_source_with_multiple_text_matches)) | ||
| 596 | + | ||
| 597 | + result = searcher.search( | ||
| 598 | + query="black dress", | ||
| 599 | + tenant_id="162", | ||
| 600 | + from_=0, | ||
| 601 | + size=1, | ||
| 602 | + context=context, | ||
| 603 | + enable_rerank=False, | ||
| 604 | + ) | ||
| 605 | + | ||
| 606 | + assert len(result.results) == 1 | ||
| 607 | + assert result.results[0].skus[0].sku_id == "sku-gloss-black" | ||
| 608 | + assert result.results[0].image_url == "https://img/gloss-black.jpg" | ||
| 609 | + | ||
| 610 | + | ||
| 611 | +def test_searcher_skips_sku_selection_when_option_name_does_not_match_dimension_alias(monkeypatch): | ||
| 612 | + es_client = _FakeESClient(total_hits=1) | ||
| 613 | + searcher = _build_searcher(_build_search_config(rerank_enabled=False), es_client) | ||
| 614 | + context = create_request_context(reqid="sku-unresolved-dimension", uid="u-sku-unresolved-dimension") | ||
| 615 | + | ||
| 616 | + monkeypatch.setattr( | ||
| 617 | + "search.searcher.get_tenant_config_loader", | ||
| 618 | + lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en", "zh"]}), | ||
| 619 | + ) | ||
| 620 | + | ||
| 621 | + class _UnresolvedDimensionQueryParser: | ||
| 622 | + text_encoder = None | ||
| 623 | + | ||
| 624 | + def parse( | ||
| 625 | + self, | ||
| 626 | + query: str, | ||
| 627 | + tenant_id: str, | ||
| 628 | + generate_vector: bool, | ||
| 629 | + context: Any, | ||
| 630 | + target_languages: Any = None, | ||
| 631 | + ): | ||
| 632 | + return _FakeParsedQuery( | ||
| 633 | + original_query=query, | ||
| 634 | + query_normalized=query, | ||
| 635 | + rewritten_query=query, | ||
| 636 | + translations={"en": "black dress"}, | ||
| 637 | + style_intent_profile=_build_style_intent_profile( | ||
| 638 | + "color", "black", "color", "colors", "颜色" | ||
| 639 | + ), | ||
| 640 | + ) | ||
| 641 | + | ||
| 642 | + searcher.query_parser = _UnresolvedDimensionQueryParser() | ||
| 643 | + | ||
| 644 | + def _full_source_with_unmatched_option_name(doc_id: str) -> Dict[str, Any]: | ||
| 645 | + return { | ||
| 646 | + "spu_id": doc_id, | ||
| 647 | + "title": {"en": f"product-{doc_id}"}, | ||
| 648 | + "brief": {"en": f"brief-{doc_id}"}, | ||
| 649 | + "vendor": {"en": f"vendor-{doc_id}"}, | ||
| 650 | + "option1_name": "Tone", | ||
| 651 | + "image_url": "https://img/default.jpg", | ||
| 652 | + "skus": [ | ||
| 653 | + {"sku_id": "sku-red", "option1_value": "Red", "image_src": "https://img/red.jpg"}, | ||
| 654 | + {"sku_id": "sku-black", "option1_value": "Black", "image_src": "https://img/black.jpg"}, | ||
| 655 | + ], | ||
| 656 | + } | ||
| 657 | + | ||
| 658 | + monkeypatch.setattr(_FakeESClient, "_full_source", staticmethod(_full_source_with_unmatched_option_name)) | ||
| 659 | + | ||
| 660 | + result = searcher.search( | ||
| 661 | + query="黑色 连衣裙", | ||
| 662 | + tenant_id="162", | ||
| 663 | + from_=0, | ||
| 664 | + size=1, | ||
| 665 | + context=context, | ||
| 666 | + enable_rerank=False, | ||
| 667 | + ) | ||
| 668 | + | ||
| 669 | + assert len(result.results) == 1 | ||
| 670 | + assert result.results[0].skus[0].sku_id == "sku-red" | ||
| 671 | + assert result.results[0].image_url == "https://img/default.jpg" | ||
| 672 | + | ||
| 673 | + | ||
| 545 | def test_searcher_promotes_sku_by_embedding_when_query_has_no_direct_option_match(monkeypatch): | 674 | def test_searcher_promotes_sku_by_embedding_when_query_has_no_direct_option_match(monkeypatch): |
| 546 | es_client = _FakeESClient(total_hits=1) | 675 | es_client = _FakeESClient(total_hits=1) |
| 547 | searcher = _build_searcher(_build_search_config(rerank_enabled=False), es_client) | 676 | searcher = _build_searcher(_build_search_config(rerank_enabled=False), es_client) |