Commit cda1cd6231ec713689f779d3a0f464b582f47110

Authored by tangwang
1 parent dad3c867

意图分析&应用 baseline

config/config.yaml
@@ -17,9 +17,9 @@ runtime: @@ -17,9 +17,9 @@ runtime:
17 embedding_port: 6005 17 embedding_port: 6005
18 embedding_text_port: 6005 18 embedding_text_port: 6005
19 embedding_image_port: 6008 19 embedding_image_port: 6008
20 - translator_host: "127.0.0.1" 20 + translator_host: "0.0.0.0"
21 translator_port: 6006 21 translator_port: 6006
22 - reranker_host: "127.0.0.1" 22 + reranker_host: "0.0.0.0"
23 reranker_port: 6007 23 reranker_port: 6007
24 24
25 # 基础设施连接(敏感项优先读环境变量:ES_*、REDIS_*、DB_*、DASHSCOPE_API_KEY、DEEPL_AUTH_KEY) 25 # 基础设施连接(敏感项优先读环境变量:ES_*、REDIS_*、DB_*、DASHSCOPE_API_KEY、DEEPL_AUTH_KEY)
@@ -116,6 +116,14 @@ query_config: @@ -116,6 +116,14 @@ query_config:
116 translation_embedding_wait_budget_ms_source_in_index: 500 # 80 116 translation_embedding_wait_budget_ms_source_in_index: 500 # 80
117 translation_embedding_wait_budget_ms_source_not_in_index: 500 #200 117 translation_embedding_wait_budget_ms_source_not_in_index: 500 #200
118 118
  119 + style_intent:
  120 + enabled: true
  121 + color_dictionary_path: "config/dictionaries/style_intent_color.csv"
  122 + size_dictionary_path: "config/dictionaries/style_intent_size.csv"
  123 + dimension_aliases:
  124 + color: ["color", "colors", "colour", "colours", "颜色", "色", "色系"]
  125 + size: ["size", "sizes", "sizing", "尺码", "尺寸", "码数", "号码", "码"]
  126 +
119 # 动态多语言检索字段配置 127 # 动态多语言检索字段配置
120 # multilingual_fields 会被拼成 title.{lang}/brief.{lang}/... 形式; 128 # multilingual_fields 会被拼成 title.{lang}/brief.{lang}/... 形式;
121 # shared_fields 为无语言后缀字段。 129 # shared_fields 为无语言后缀字段。
config/dictionaries/style_intent_color.csv 0 → 100644
@@ -0,0 +1,15 @@ @@ -0,0 +1,15 @@
  1 +black,black,blk,黑,黑色
  2 +white,white,wht,白,白色
  3 +red,red,reddish,红,红色
  4 +blue,blue,blu,蓝,蓝色
  5 +green,green,grn,绿,绿色
  6 +yellow,yellow,ylw,黄,黄色
  7 +pink,pink,粉,粉色
  8 +purple,purple,violet,紫,紫色
  9 +gray,gray,grey,灰,灰色
  10 +brown,brown,棕,棕色,咖啡色
  11 +beige,beige,khaki,米色,卡其色
  12 +navy,navy,navy blue,藏青,藏蓝,深蓝
  13 +silver,silver,银,银色
  14 +gold,gold,金,金色
  15 +orange,orange,橙,橙色
config/dictionaries/style_intent_size.csv 0 → 100644
@@ -0,0 +1,8 @@ @@ -0,0 +1,8 @@
  1 +xs,xs,extra small,x-small,加小码
  2 +s,s,small,小码,小号
  3 +m,m,medium,中码,中号
  4 +l,l,large,大码,大号
  5 +xl,xl,x-large,extra large,加大码
  6 +xxl,xxl,2xl,xx-large,双加大码
  7 +xxxl,xxxl,3xl,xxx-large,三加大码
  8 +one size,one size,onesize,free size,均码
@@ -95,6 +95,29 @@ def _read_rewrite_dictionary(path: Path) -> Dict[str, str]: @@ -95,6 +95,29 @@ def _read_rewrite_dictionary(path: Path) -> Dict[str, str]:
95 return rewrite_dict 95 return rewrite_dict
96 96
97 97
  98 +def _read_synonym_csv_dictionary(path: Path) -> List[List[str]]:
  99 + rows: List[List[str]] = []
  100 + if not path.exists():
  101 + return rows
  102 +
  103 + with open(path, "r", encoding="utf-8") as handle:
  104 + for raw_line in handle:
  105 + line = raw_line.strip()
  106 + if not line or line.startswith("#"):
  107 + continue
  108 + parts = [segment.strip() for segment in line.split(",")]
  109 + normalized = [segment for segment in parts if segment]
  110 + if normalized:
  111 + rows.append(normalized)
  112 + return rows
  113 +
  114 +
  115 +_DEFAULT_STYLE_INTENT_DIMENSION_ALIASES: Dict[str, List[str]] = {
  116 + "color": ["color", "colors", "colour", "colours", "颜色", "色", "色系"],
  117 + "size": ["size", "sizes", "sizing", "尺码", "尺寸", "码数", "号码", "码"],
  118 +}
  119 +
  120 +
98 class AppConfigLoader: 121 class AppConfigLoader:
99 """Load the unified application configuration.""" 122 """Load the unified application configuration."""
100 123
@@ -253,6 +276,45 @@ class AppConfigLoader: @@ -253,6 +276,45 @@ class AppConfigLoader:
253 if isinstance(query_cfg.get("text_query_strategy"), dict) 276 if isinstance(query_cfg.get("text_query_strategy"), dict)
254 else {} 277 else {}
255 ) 278 )
  279 + style_intent_cfg = (
  280 + query_cfg.get("style_intent")
  281 + if isinstance(query_cfg.get("style_intent"), dict)
  282 + else {}
  283 + )
  284 +
  285 + def _resolve_project_path(value: Any, default_path: Path) -> Path:
  286 + if value in (None, ""):
  287 + return default_path
  288 + candidate = Path(str(value))
  289 + if candidate.is_absolute():
  290 + return candidate
  291 + return self.project_root / candidate
  292 +
  293 + style_color_path = _resolve_project_path(
  294 + style_intent_cfg.get("color_dictionary_path"),
  295 + self.config_dir / "dictionaries" / "style_intent_color.csv",
  296 + )
  297 + style_size_path = _resolve_project_path(
  298 + style_intent_cfg.get("size_dictionary_path"),
  299 + self.config_dir / "dictionaries" / "style_intent_size.csv",
  300 + )
  301 + configured_dimension_aliases = (
  302 + style_intent_cfg.get("dimension_aliases")
  303 + if isinstance(style_intent_cfg.get("dimension_aliases"), dict)
  304 + else {}
  305 + )
  306 + style_dimension_aliases: Dict[str, List[str]] = {}
  307 + for intent_type, default_aliases in _DEFAULT_STYLE_INTENT_DIMENSION_ALIASES.items():
  308 + aliases = configured_dimension_aliases.get(intent_type)
  309 + if isinstance(aliases, list) and aliases:
  310 + style_dimension_aliases[intent_type] = [str(alias) for alias in aliases if str(alias).strip()]
  311 + else:
  312 + style_dimension_aliases[intent_type] = list(default_aliases)
  313 +
  314 + style_intent_terms = {
  315 + "color": _read_synonym_csv_dictionary(style_color_path),
  316 + "size": _read_synonym_csv_dictionary(style_size_path),
  317 + }
256 query_config = QueryConfig( 318 query_config = QueryConfig(
257 supported_languages=list(query_cfg.get("supported_languages") or ["zh", "en"]), 319 supported_languages=list(query_cfg.get("supported_languages") or ["zh", "en"]),
258 default_language=str(query_cfg.get("default_language") or "en"), 320 default_language=str(query_cfg.get("default_language") or "en"),
@@ -324,6 +386,9 @@ class AppConfigLoader: @@ -324,6 +386,9 @@ class AppConfigLoader:
324 translation_embedding_wait_budget_ms_source_not_in_index=int( 386 translation_embedding_wait_budget_ms_source_not_in_index=int(
325 query_cfg.get("translation_embedding_wait_budget_ms_source_not_in_index", 200) 387 query_cfg.get("translation_embedding_wait_budget_ms_source_not_in_index", 200)
326 ), 388 ),
  389 + style_intent_enabled=bool(style_intent_cfg.get("enabled", True)),
  390 + style_intent_terms=style_intent_terms,
  391 + style_intent_dimension_aliases=style_dimension_aliases,
327 ) 392 )
328 393
329 function_score_cfg = raw.get("function_score") if isinstance(raw.get("function_score"), dict) else {} 394 function_score_cfg = raw.get("function_score") if isinstance(raw.get("function_score"), dict) else {}
@@ -64,6 +64,9 @@ class QueryConfig: @@ -64,6 +64,9 @@ class QueryConfig:
64 # 检测语言不在 index_languages 内:翻译对召回更关键,预算较长。 64 # 检测语言不在 index_languages 内:翻译对召回更关键,预算较长。
65 translation_embedding_wait_budget_ms_source_in_index: int = 80 65 translation_embedding_wait_budget_ms_source_in_index: int = 80
66 translation_embedding_wait_budget_ms_source_not_in_index: int = 200 66 translation_embedding_wait_budget_ms_source_not_in_index: int = 200
  67 + style_intent_enabled: bool = True
  68 + style_intent_terms: Dict[str, List[List[str]]] = field(default_factory=dict)
  69 + style_intent_dimension_aliases: Dict[str, List[str]] = field(default_factory=dict)
67 70
68 71
69 @dataclass(frozen=True) 72 @dataclass(frozen=True)
docs/TODO-意图判断.md
@@ -39,3 +39,15 @@ intent 考虑由 QueryParser 编排、具体实现拆成独立模块,主义好 @@ -39,3 +39,15 @@ intent 考虑由 QueryParser 编排、具体实现拆成独立模块,主义好
39 39
40 5. TODO: 搜索接口里,results[].skus 不是全量子 SKU:由 sku_filter_dimension 控制在应用层按维度分组折叠,每个「维度取值组合」只保留一条 SKU(组内第一条)。请求未传该字段时,Pydantic 默认是 ["option1"],等价于只按 option1_value 去重;服务端不会读取店铺主题的「主展示维」,需调用方与装修配置对齐并传入正确维度。因此当用户有款式等更细粒度意图、而款式落在 option2/option3(或对应 option*_name)时,若仍用默认只按 option1(常见为颜色)折叠,同一颜色下多种款式只会出现一条代表 SKU,无法从返回的 skus 里拿到该颜色下的全部款式行。(若业务需要全量子款,需传包含对应维度的 sku_filter_dimension,或传 null/[] 跳过折叠——以当前 ResultFormatter 实现为准。) 40 5. TODO: 搜索接口里,results[].skus 不是全量子 SKU:由 sku_filter_dimension 控制在应用层按维度分组折叠,每个「维度取值组合」只保留一条 SKU(组内第一条)。请求未传该字段时,Pydantic 默认是 ["option1"],等价于只按 option1_value 去重;服务端不会读取店铺主题的「主展示维」,需调用方与装修配置对齐并传入正确维度。因此当用户有款式等更细粒度意图、而款式落在 option2/option3(或对应 option*_name)时,若仍用默认只按 option1(常见为颜色)折叠,同一颜色下多种款式只会出现一条代表 SKU,无法从返回的 skus 里拿到该颜色下的全部款式行。(若业务需要全量子款,需传包含对应维度的 sku_filter_dimension,或传 null/[] 跳过折叠——以当前 ResultFormatter 实现为准。)
41 41
  42 +
  43 +
  44 +
  45 +
  46 +GPT5.4 主要思路:
  47 +我准备按这条线改,尽量把逻辑收束成两个新模块而不是继续堆在 Searcher 里:
  48 +
  49 +QueryParser 负责产出统一的“款式意图分析结果”,包含:命中的意图类型、标准化值、命中的 query 变体、分词结果、维度候选词。
  50 +独立的 SKU 选择模块负责:维度解析、候选值抽取、三阶段匹配、选中 SKU 后的主图替换,以及生成 rerank 用的属性后缀。
  51 +Searcher 只编排:解析 query -> 首次 ES 召回 -> 若有款式意图则对 rerank 窗口内全部 hits 做 SKU 预筛选并补充 rerank doc 文本 -> run_rerank -> 分页 -> page fill -> ResultFormatter。这样旧的页内 SKU 置顶逻辑可以下沉或删除,避免前后两个阶段重复做同一件事。我再补看一下你最近写的意图草稿文档,确认有没有现成词表示例或边界条件,避免我这边把词表格式和你预期做偏。
  52 +
  53 +
docs/数据统计/options名称和取值统计.md 0 → 100644
@@ -0,0 +1,118 @@ @@ -0,0 +1,118 @@
  1 +
  2 +## prompt
  3 +参考 @docs/常用查询 - ES.md 帮我查询 tenant_Id=163的@search_products.json (1314-1332) 这些字段的统计分布,也就是这里每个字段的所有的值及其对应的文档数
  4 +
  5 +
  6 +已按 `docs/常用查询 - ES.md` 的写法,在索引 **`search_products_tenant_163`** 上做了 `terms` 聚合(租户已按索引拆分,用 `match_all` 即可,等价于只查 tenant 163)。集群在本机 `localhost:9200` 上执行成功。
  7 +
  8 +## 结果概要
  9 +
  10 +- **总文档数**:8977
  11 +- 六个字段的 **`sum_other_doc_count` 均为 0**,在 `size: 20000` 下没有截断,即:**当前数据里每个字段的取值都已全部列出**。
  12 +- **`__missing__`**:来自聚合里的 `"missing": "__missing__"`,表示该字段缺失或为空的文档数(便于和「有值」一起看)。
  13 +- **`option1_values` / `option2_values`** 多为多值数组:同一篇文档可落入多个桶,因此 **各桶 `doc_count` 之和会大于 8977**(这是 ES `terms` 的正常语义,表示「含该取值的文档数」,不是互斥分区)。
  14 +- **`option1_name` / `option2_name` / `option3_name`** 及 **`option3_values`** 的桶计数之和与文档关系更接近单值/稀疏(见下方校验)。
  15 +
  16 +### `option1_name`(13 个取值)
  17 +
  18 +| 值 | 文档数 |
  19 +|---:|---:|
  20 +| __missing__ | 4538 |
  21 +| Size | 2916 |
  22 +| Color | 1174 |
  23 +| 颜色 | 244 |
  24 +| COLOR | 56 |
  25 +| color | 16 |
  26 +| Colour | 15 |
  27 +| Variant | 9 |
  28 +| Style | 3 |
  29 +| Colors | 2 |
  30 +| Scent | 2 |
  31 +| Cup Size | 1 |
  32 +| Pattern Name | 1 |
  33 +
  34 +### `option2_name`(13 个取值)
  35 +
  36 +| 值 | 文档数 |
  37 +|---:|---:|
  38 +| __missing__ | 4666 |
  39 +| Color | 2879 |
  40 +| Size | 1134 |
  41 +| 尺码 | 244 |
  42 +| SIZE | 17 |
  43 +| size | 16 |
  44 +| Style | 12 |
  45 +| Cup Size | 4 |
  46 +| Item Package Quantity | 1 |
  47 +| Number of Items | 1 |
  48 +| Ring Size | 1 |
  49 +| Scent | 1 |
  50 +| Team Name | 1 |
  51 +
  52 +### `option3_name`(11 个取值)
  53 +
  54 +| 值 | 文档数 |
  55 +|---:|---:|
  56 +| __missing__ | 8831 |
  57 +| Fit Type | 52 |
  58 +| Special Size | 31 |
  59 +| Number of Items | 17 |
  60 +| Special Size Type | 16 |
  61 +| Team Name | 15 |
  62 +| Material Type | 8 |
  63 +| Style | 4 |
  64 +| Configuration | 1 |
  65 +| Hand Orientation | 1 |
  66 +| Pattern Name | 1 |
  67 +
  68 +### `option1_values` / `option2_values`
  69 +
  70 +- **桶数量**:`option1_values` **3463**,`option2_values` **3605**(完整列表过长,不宜贴在对话里)。
  71 +- **文档中出现最多的取值(前 10)**
  72 + - **option1_values**:`__missing__` 4538,其后如 Medium 749、Black 653、Large 635、8/9/White/10/7/Small…
  73 + - **option2_values**:`__missing__` 4666,其后如 L 1161、M 1158、S 1064、XL 1036、Black 876…
  74 +
  75 +### `option3_values`(94 个桶,已全部枚举)
  76 +
  77 +除 `__missing__` 8831 外,另有 93 个具体取值(从 Standard 到 `standard` 等);完整枚举见文末的完整 JSON 文件。
  78 +
  79 +---
  80 +
  81 +## 可复用的查询(与文档一致)
  82 +
  83 +```bash
  84 +curl -sS -u "saas:${ES_PASSWORD}" -X GET 'http://localhost:9200/search_products_tenant_163/_search?pretty' \
  85 + -H 'Content-Type: application/json' -d '{
  86 + "query": { "match_all": {} },
  87 + "size": 0,
  88 + "track_total_hits": true,
  89 + "aggs": {
  90 + "option1_name_dist": {
  91 + "terms": { "field": "option1_name", "size": 20000, "order": { "_count": "desc" }, "missing": "__missing__" }
  92 + },
  93 + "option2_name_dist": {
  94 + "terms": { "field": "option2_name", "size": 20000, "order": { "_count": "desc" }, "missing": "__missing__" }
  95 + },
  96 + "option3_name_dist": {
  97 + "terms": { "field": "option3_name", "size": 20000, "order": { "_count": "desc" }, "missing": "__missing__" }
  98 + },
  99 + "option1_values_dist": {
  100 + "terms": { "field": "option1_values", "size": 20000, "order": { "_count": "desc" }, "missing": "__missing__" }
  101 + },
  102 + "option2_values_dist": {
  103 + "terms": { "field": "option2_values", "size": 20000, "order": { "_count": "desc" }, "missing": "__missing__" }
  104 + },
  105 + "option3_values_dist": {
  106 + "terms": { "field": "option3_values", "size": 20000, "order": { "_count": "desc" }, "missing": "__missing__" }
  107 + }
  108 + }
  109 +}'
  110 +```
  111 +
  112 +若某租户取值超过 2 万,需把 `size` 再加大或用 **composite** 分页扫完全部桶。
  113 +
  114 +## 完整 JSON(含 3463 + 3605 条 value 分布)
  115 +
  116 +已整理为:`/tmp/tenant163_option_fields_distribution.json`(约 620KB),结构为按字段分组的 `values: [{ "value", "doc_count" }, ...]`,便于你用脚本或表格工具打开。
  117 +
  118 +如需去掉 `__missing__` 桶,从请求里删掉各聚合中的 `"missing": "__missing__"` 即可。
0 \ No newline at end of file 119 \ No newline at end of file
query/query_parser.py
@@ -12,7 +12,6 @@ from dataclasses import dataclass, field @@ -12,7 +12,6 @@ from dataclasses import dataclass, field
12 from typing import Any, Callable, Dict, List, Optional, Tuple 12 from typing import Any, Callable, Dict, List, Optional, Tuple
13 import numpy as np 13 import numpy as np
14 import logging 14 import logging
15 -import re  
16 from concurrent.futures import ThreadPoolExecutor, wait 15 from concurrent.futures import ThreadPoolExecutor, wait
17 16
18 from embeddings.text_encoder import TextEmbeddingEncoder 17 from embeddings.text_encoder import TextEmbeddingEncoder
@@ -20,25 +19,14 @@ from config import SearchConfig @@ -20,25 +19,14 @@ from config import SearchConfig
20 from translation import create_translation_client 19 from translation import create_translation_client
21 from .language_detector import LanguageDetector 20 from .language_detector import LanguageDetector
22 from .query_rewriter import QueryRewriter, QueryNormalizer 21 from .query_rewriter import QueryRewriter, QueryNormalizer
  22 +from .style_intent import StyleIntentDetector, StyleIntentProfile, StyleIntentRegistry
  23 +from .tokenization import extract_token_strings, simple_tokenize_query
23 24
24 logger = logging.getLogger(__name__) 25 logger = logging.getLogger(__name__)
25 26
26 import hanlp # type: ignore 27 import hanlp # type: ignore
27 28
28 29
29 -def simple_tokenize_query(text: str) -> List[str]:  
30 - """  
31 - Lightweight tokenizer for suggestion-side heuristics only.  
32 -  
33 - - Consecutive CJK characters form one token  
34 - - Latin / digit runs (with internal hyphens) form tokens  
35 - """  
36 - if not text:  
37 - return []  
38 - pattern = re.compile(r"[\u4e00-\u9fff]+|[A-Za-z0-9_]+(?:-[A-Za-z0-9_]+)*")  
39 - return pattern.findall(text)  
40 -  
41 -  
42 @dataclass(slots=True) 30 @dataclass(slots=True)
43 class ParsedQuery: 31 class ParsedQuery:
44 """Container for query parser facts.""" 32 """Container for query parser facts."""
@@ -50,6 +38,7 @@ class ParsedQuery: @@ -50,6 +38,7 @@ class ParsedQuery:
50 translations: Dict[str, str] = field(default_factory=dict) 38 translations: Dict[str, str] = field(default_factory=dict)
51 query_vector: Optional[np.ndarray] = None 39 query_vector: Optional[np.ndarray] = None
52 query_tokens: List[str] = field(default_factory=list) 40 query_tokens: List[str] = field(default_factory=list)
  41 + style_intent_profile: Optional[StyleIntentProfile] = None
53 42
54 def to_dict(self) -> Dict[str, Any]: 43 def to_dict(self) -> Dict[str, Any]:
55 """Convert to dictionary representation.""" 44 """Convert to dictionary representation."""
@@ -60,6 +49,9 @@ class ParsedQuery: @@ -60,6 +49,9 @@ class ParsedQuery:
60 "detected_language": self.detected_language, 49 "detected_language": self.detected_language,
61 "translations": self.translations, 50 "translations": self.translations,
62 "query_tokens": self.query_tokens, 51 "query_tokens": self.query_tokens,
  52 + "style_intent_profile": (
  53 + self.style_intent_profile.to_dict() if self.style_intent_profile is not None else None
  54 + ),
63 } 55 }
64 56
65 57
@@ -97,6 +89,11 @@ class QueryParser: @@ -97,6 +89,11 @@ class QueryParser:
97 self.language_detector = LanguageDetector() 89 self.language_detector = LanguageDetector()
98 self.rewriter = QueryRewriter(config.query_config.rewrite_dictionary) 90 self.rewriter = QueryRewriter(config.query_config.rewrite_dictionary)
99 self._tokenizer = tokenizer or self._build_tokenizer() 91 self._tokenizer = tokenizer or self._build_tokenizer()
  92 + self.style_intent_registry = StyleIntentRegistry.from_query_config(config.query_config)
  93 + self.style_intent_detector = StyleIntentDetector(
  94 + self.style_intent_registry,
  95 + tokenizer=self._tokenizer,
  96 + )
100 97
101 # Eager initialization (startup-time failure visibility, no lazy init in request path) 98 # Eager initialization (startup-time failure visibility, no lazy init in request path)
102 if self.config.query_config.enable_text_embedding and self._text_encoder is None: 99 if self.config.query_config.enable_text_embedding and self._text_encoder is None:
@@ -172,28 +169,7 @@ class QueryParser: @@ -172,28 +169,7 @@ class QueryParser:
172 @staticmethod 169 @staticmethod
173 def _extract_tokens(tokenizer_result: Any) -> List[str]: 170 def _extract_tokens(tokenizer_result: Any) -> List[str]:
174 """Normalize tokenizer output into a flat token string list.""" 171 """Normalize tokenizer output into a flat token string list."""
175 - if not tokenizer_result:  
176 - return []  
177 - if isinstance(tokenizer_result, str):  
178 - token = tokenizer_result.strip()  
179 - return [token] if token else []  
180 -  
181 - tokens: List[str] = []  
182 - for item in tokenizer_result:  
183 - token: Optional[str] = None  
184 - if isinstance(item, str):  
185 - token = item  
186 - elif isinstance(item, (list, tuple)) and item:  
187 - token = str(item[0])  
188 - elif item is not None:  
189 - token = str(item)  
190 -  
191 - if token is None:  
192 - continue  
193 - token = token.strip()  
194 - if token:  
195 - tokens.append(token)  
196 - return tokens 172 + return extract_token_strings(tokenizer_result)
197 173
198 def _get_query_tokens(self, query: str) -> List[str]: 174 def _get_query_tokens(self, query: str) -> List[str]:
199 return self._extract_tokens(self._tokenizer(query)) 175 return self._extract_tokens(self._tokenizer(query))
@@ -425,6 +401,22 @@ class QueryParser: @@ -425,6 +401,22 @@ class QueryParser:
425 context.store_intermediate_result("translations", translations) 401 context.store_intermediate_result("translations", translations)
426 402
427 # Build result 403 # Build result
  404 + base_result = ParsedQuery(
  405 + original_query=query,
  406 + query_normalized=normalized,
  407 + rewritten_query=query_text,
  408 + detected_language=detected_lang,
  409 + translations=translations,
  410 + query_vector=query_vector,
  411 + query_tokens=query_tokens,
  412 + )
  413 + style_intent_profile = self.style_intent_detector.detect(base_result)
  414 + if context:
  415 + context.store_intermediate_result(
  416 + "style_intent_profile",
  417 + style_intent_profile.to_dict(),
  418 + )
  419 +
428 result = ParsedQuery( 420 result = ParsedQuery(
429 original_query=query, 421 original_query=query,
430 query_normalized=normalized, 422 query_normalized=normalized,
@@ -433,6 +425,7 @@ class QueryParser: @@ -433,6 +425,7 @@ class QueryParser:
433 translations=translations, 425 translations=translations,
434 query_vector=query_vector, 426 query_vector=query_vector,
435 query_tokens=query_tokens, 427 query_tokens=query_tokens,
  428 + style_intent_profile=style_intent_profile,
436 ) 429 )
437 430
438 if context and hasattr(context, 'logger'): 431 if context and hasattr(context, 'logger'):
query/style_intent.py 0 → 100644
@@ -0,0 +1,261 @@ @@ -0,0 +1,261 @@
  1 +"""
  2 +Style intent detection for query understanding.
  3 +"""
  4 +
  5 +from __future__ import annotations
  6 +
  7 +from dataclasses import dataclass, field
  8 +from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Set, Tuple
  9 +
  10 +from .tokenization import TokenizedText, normalize_query_text, tokenize_text
  11 +
  12 +
  13 +@dataclass(frozen=True)
  14 +class StyleIntentDefinition:
  15 + intent_type: str
  16 + term_groups: Tuple[Tuple[str, ...], ...]
  17 + dimension_aliases: Tuple[str, ...]
  18 + synonym_to_canonical: Dict[str, str]
  19 + max_term_ngram: int = 3
  20 +
  21 + @classmethod
  22 + def from_rows(
  23 + cls,
  24 + intent_type: str,
  25 + rows: Sequence[Sequence[str]],
  26 + dimension_aliases: Sequence[str],
  27 + ) -> "StyleIntentDefinition":
  28 + term_groups: List[Tuple[str, ...]] = []
  29 + synonym_to_canonical: Dict[str, str] = {}
  30 + max_ngram = 1
  31 +
  32 + for row in rows:
  33 + normalized_terms: List[str] = []
  34 + for raw_term in row:
  35 + term = normalize_query_text(raw_term)
  36 + if not term or term in normalized_terms:
  37 + continue
  38 + normalized_terms.append(term)
  39 + if not normalized_terms:
  40 + continue
  41 +
  42 + canonical = normalized_terms[0]
  43 + term_groups.append(tuple(normalized_terms))
  44 + for term in normalized_terms:
  45 + synonym_to_canonical[term] = canonical
  46 + max_ngram = max(max_ngram, len(term.split()))
  47 +
  48 + aliases = tuple(
  49 + dict.fromkeys(
  50 + term
  51 + for term in (
  52 + normalize_query_text(alias)
  53 + for alias in dimension_aliases
  54 + )
  55 + if term
  56 + )
  57 + )
  58 +
  59 + return cls(
  60 + intent_type=intent_type,
  61 + term_groups=tuple(term_groups),
  62 + dimension_aliases=aliases,
  63 + synonym_to_canonical=synonym_to_canonical,
  64 + max_term_ngram=max_ngram,
  65 + )
  66 +
  67 + def match_candidates(self, candidates: Iterable[str]) -> Set[str]:
  68 + matched: Set[str] = set()
  69 + for candidate in candidates:
  70 + canonical = self.synonym_to_canonical.get(normalize_query_text(candidate))
  71 + if canonical:
  72 + matched.add(canonical)
  73 + return matched
  74 +
  75 + def match_text(
  76 + self,
  77 + text: str,
  78 + *,
  79 + tokenizer: Optional[Callable[[str], Any]] = None,
  80 + ) -> Set[str]:
  81 + bundle = tokenize_text(text, tokenizer=tokenizer, max_ngram=self.max_term_ngram)
  82 + return self.match_candidates(bundle.candidates)
  83 +
  84 +
  85 +@dataclass(frozen=True)
  86 +class DetectedStyleIntent:
  87 + intent_type: str
  88 + canonical_value: str
  89 + matched_term: str
  90 + matched_query_text: str
  91 + dimension_aliases: Tuple[str, ...]
  92 +
  93 + def to_dict(self) -> Dict[str, Any]:
  94 + return {
  95 + "intent_type": self.intent_type,
  96 + "canonical_value": self.canonical_value,
  97 + "matched_term": self.matched_term,
  98 + "matched_query_text": self.matched_query_text,
  99 + "dimension_aliases": list(self.dimension_aliases),
  100 + }
  101 +
  102 +
  103 +@dataclass(frozen=True)
  104 +class StyleIntentProfile:
  105 + query_variants: Tuple[TokenizedText, ...] = field(default_factory=tuple)
  106 + intents: Tuple[DetectedStyleIntent, ...] = field(default_factory=tuple)
  107 +
  108 + @property
  109 + def is_active(self) -> bool:
  110 + return bool(self.intents)
  111 +
  112 + def get_intents(self, intent_type: Optional[str] = None) -> List[DetectedStyleIntent]:
  113 + if intent_type is None:
  114 + return list(self.intents)
  115 + normalized = normalize_query_text(intent_type)
  116 + return [intent for intent in self.intents if intent.intent_type == normalized]
  117 +
  118 + def get_canonical_values(self, intent_type: str) -> Set[str]:
  119 + return {intent.canonical_value for intent in self.get_intents(intent_type)}
  120 +
  121 + def to_dict(self) -> Dict[str, Any]:
  122 + return {
  123 + "active": self.is_active,
  124 + "intents": [intent.to_dict() for intent in self.intents],
  125 + "query_variants": [
  126 + {
  127 + "text": variant.text,
  128 + "normalized_text": variant.normalized_text,
  129 + "fine_tokens": list(variant.fine_tokens),
  130 + "coarse_tokens": list(variant.coarse_tokens),
  131 + "candidates": list(variant.candidates),
  132 + }
  133 + for variant in self.query_variants
  134 + ],
  135 + }
  136 +
  137 +
  138 +class StyleIntentRegistry:
  139 + """Holds style intent vocabularies and matching helpers."""
  140 +
  141 + def __init__(
  142 + self,
  143 + definitions: Dict[str, StyleIntentDefinition],
  144 + *,
  145 + enabled: bool = True,
  146 + ) -> None:
  147 + self.definitions = definitions
  148 + self.enabled = bool(enabled)
  149 +
  150 + @classmethod
  151 + def from_query_config(cls, query_config: Any) -> "StyleIntentRegistry":
  152 + style_terms = getattr(query_config, "style_intent_terms", {}) or {}
  153 + dimension_aliases = getattr(query_config, "style_intent_dimension_aliases", {}) or {}
  154 + definitions: Dict[str, StyleIntentDefinition] = {}
  155 +
  156 + for intent_type, rows in style_terms.items():
  157 + definition = StyleIntentDefinition.from_rows(
  158 + intent_type=normalize_query_text(intent_type),
  159 + rows=rows or [],
  160 + dimension_aliases=dimension_aliases.get(intent_type, []),
  161 + )
  162 + if definition.synonym_to_canonical:
  163 + definitions[definition.intent_type] = definition
  164 +
  165 + return cls(
  166 + definitions,
  167 + enabled=bool(getattr(query_config, "style_intent_enabled", True)),
  168 + )
  169 +
  170 + def get_definition(self, intent_type: str) -> Optional[StyleIntentDefinition]:
  171 + return self.definitions.get(normalize_query_text(intent_type))
  172 +
  173 + def get_dimension_aliases(self, intent_type: str) -> Tuple[str, ...]:
  174 + definition = self.get_definition(intent_type)
  175 + return definition.dimension_aliases if definition else tuple()
  176 +
  177 +
  178 +class StyleIntentDetector:
  179 + """Detects style intents from parsed query variants."""
  180 +
  181 + def __init__(
  182 + self,
  183 + registry: StyleIntentRegistry,
  184 + *,
  185 + tokenizer: Optional[Callable[[str], Any]] = None,
  186 + ) -> None:
  187 + self.registry = registry
  188 + self.tokenizer = tokenizer
  189 +
  190 + def _build_query_variants(self, parsed_query: Any) -> Tuple[TokenizedText, ...]:
  191 + seen = set()
  192 + variants: List[TokenizedText] = []
  193 + texts = [
  194 + getattr(parsed_query, "original_query", None),
  195 + getattr(parsed_query, "query_normalized", None),
  196 + getattr(parsed_query, "rewritten_query", None),
  197 + ]
  198 +
  199 + translations = getattr(parsed_query, "translations", {}) or {}
  200 + if isinstance(translations, dict):
  201 + texts.extend(translations.values())
  202 +
  203 + for raw_text in texts:
  204 + text = str(raw_text or "").strip()
  205 + if not text:
  206 + continue
  207 + normalized = normalize_query_text(text)
  208 + if not normalized or normalized in seen:
  209 + continue
  210 + seen.add(normalized)
  211 + variants.append(
  212 + tokenize_text(
  213 + text,
  214 + tokenizer=self.tokenizer,
  215 + max_ngram=max(
  216 + (definition.max_term_ngram for definition in self.registry.definitions.values()),
  217 + default=3,
  218 + ),
  219 + )
  220 + )
  221 +
  222 + return tuple(variants)
  223 +
  224 + def detect(self, parsed_query: Any) -> StyleIntentProfile:
  225 + if not self.registry.enabled or not self.registry.definitions:
  226 + return StyleIntentProfile()
  227 +
  228 + query_variants = self._build_query_variants(parsed_query)
  229 + detected: List[DetectedStyleIntent] = []
  230 + seen_pairs = set()
  231 +
  232 + for variant in query_variants:
  233 + for intent_type, definition in self.registry.definitions.items():
  234 + matched_canonicals = definition.match_candidates(variant.candidates)
  235 + if not matched_canonicals:
  236 + continue
  237 +
  238 + for candidate in variant.candidates:
  239 + normalized_candidate = normalize_query_text(candidate)
  240 + canonical = definition.synonym_to_canonical.get(normalized_candidate)
  241 + if not canonical or canonical not in matched_canonicals:
  242 + continue
  243 + pair = (intent_type, canonical)
  244 + if pair in seen_pairs:
  245 + continue
  246 + seen_pairs.add(pair)
  247 + detected.append(
  248 + DetectedStyleIntent(
  249 + intent_type=intent_type,
  250 + canonical_value=canonical,
  251 + matched_term=normalized_candidate,
  252 + matched_query_text=variant.text,
  253 + dimension_aliases=definition.dimension_aliases,
  254 + )
  255 + )
  256 + break
  257 +
  258 + return StyleIntentProfile(
  259 + query_variants=query_variants,
  260 + intents=tuple(detected),
  261 + )
query/tokenization.py 0 → 100644
@@ -0,0 +1,122 @@ @@ -0,0 +1,122 @@
  1 +"""
  2 +Shared tokenization helpers for query understanding.
  3 +"""
  4 +
  5 +from __future__ import annotations
  6 +
  7 +from dataclasses import dataclass
  8 +import re
  9 +from typing import Any, Callable, Iterable, List, Optional, Sequence, Tuple
  10 +
  11 +
  12 +_TOKEN_PATTERN = re.compile(r"[\u4e00-\u9fff]+|[A-Za-z0-9_]+(?:-[A-Za-z0-9_]+)*")
  13 +
  14 +
  15 +def normalize_query_text(text: Optional[str]) -> str:
  16 + if text is None:
  17 + return ""
  18 + return " ".join(str(text).strip().casefold().split())
  19 +
  20 +
  21 +def simple_tokenize_query(text: str) -> List[str]:
  22 + """
  23 + Lightweight tokenizer for coarse query matching.
  24 +
  25 + - Consecutive CJK characters form one token
  26 + - Latin / digit runs (with internal hyphens) form tokens
  27 + """
  28 + if not text:
  29 + return []
  30 + return _TOKEN_PATTERN.findall(text)
  31 +
  32 +
  33 +def extract_token_strings(tokenizer_result: Any) -> List[str]:
  34 + """Normalize tokenizer output into a flat token string list."""
  35 + if not tokenizer_result:
  36 + return []
  37 + if isinstance(tokenizer_result, str):
  38 + token = tokenizer_result.strip()
  39 + return [token] if token else []
  40 +
  41 + tokens: List[str] = []
  42 + for item in tokenizer_result:
  43 + token: Optional[str] = None
  44 + if isinstance(item, str):
  45 + token = item
  46 + elif isinstance(item, (list, tuple)) and item:
  47 + token = str(item[0])
  48 + elif item is not None:
  49 + token = str(item)
  50 +
  51 + if token is None:
  52 + continue
  53 + token = token.strip()
  54 + if token:
  55 + tokens.append(token)
  56 + return tokens
  57 +
  58 +
  59 +def _dedupe_preserve_order(values: Iterable[str]) -> List[str]:
  60 + result: List[str] = []
  61 + seen = set()
  62 + for value in values:
  63 + normalized = normalize_query_text(value)
  64 + if not normalized or normalized in seen:
  65 + continue
  66 + seen.add(normalized)
  67 + result.append(normalized)
  68 + return result
  69 +
  70 +
  71 +def _build_phrase_candidates(tokens: Sequence[str], max_ngram: int) -> List[str]:
  72 + if not tokens:
  73 + return []
  74 +
  75 + phrases: List[str] = []
  76 + upper = max(1, int(max_ngram))
  77 + for size in range(1, upper + 1):
  78 + if size > len(tokens):
  79 + break
  80 + for start in range(0, len(tokens) - size + 1):
  81 + phrase = " ".join(tokens[start:start + size]).strip()
  82 + if phrase:
  83 + phrases.append(phrase)
  84 + return phrases
  85 +
  86 +
  87 +@dataclass(frozen=True)
  88 +class TokenizedText:
  89 + text: str
  90 + normalized_text: str
  91 + fine_tokens: Tuple[str, ...]
  92 + coarse_tokens: Tuple[str, ...]
  93 + candidates: Tuple[str, ...]
  94 +
  95 +
  96 +def tokenize_text(
  97 + text: str,
  98 + *,
  99 + tokenizer: Optional[Callable[[str], Any]] = None,
  100 + max_ngram: int = 3,
  101 +) -> TokenizedText:
  102 + normalized_text = normalize_query_text(text)
  103 + coarse_tokens = _dedupe_preserve_order(simple_tokenize_query(text))
  104 +
  105 + fine_raw = extract_token_strings(tokenizer(text)) if tokenizer is not None and text else []
  106 + fine_tokens = _dedupe_preserve_order(fine_raw)
  107 +
  108 + candidates = _dedupe_preserve_order(
  109 + list(fine_tokens)
  110 + + list(coarse_tokens)
  111 + + _build_phrase_candidates(fine_tokens, max_ngram=max_ngram)
  112 + + _build_phrase_candidates(coarse_tokens, max_ngram=max_ngram)
  113 + + ([normalized_text] if normalized_text else [])
  114 + )
  115 +
  116 + return TokenizedText(
  117 + text=text,
  118 + normalized_text=normalized_text,
  119 + fine_tokens=tuple(fine_tokens),
  120 + coarse_tokens=tuple(coarse_tokens),
  121 + candidates=tuple(candidates),
  122 + )
search/rerank_client.py
@@ -62,11 +62,19 @@ def build_docs_from_hits( @@ -62,11 +62,19 @@ def build_docs_from_hits(
62 need_category_path = "{category_path}" in doc_template 62 need_category_path = "{category_path}" in doc_template
63 for hit in es_hits: 63 for hit in es_hits:
64 src = hit.get("_source") or {} 64 src = hit.get("_source") or {}
  65 + title_suffix = str(hit.get("_style_rerank_suffix") or "").strip()
65 if only_title: 66 if only_title:
66 - docs.append(pick_lang_text(src.get("title"))) 67 + title = pick_lang_text(src.get("title"))
  68 + if title_suffix:
  69 + title = f"{title} {title_suffix}".strip()
  70 + docs.append(title)
67 else: 71 else:
68 values = _SafeDict( 72 values = _SafeDict(
69 - title=pick_lang_text(src.get("title")), 73 + title=(
  74 + f"{pick_lang_text(src.get('title'))} {title_suffix}".strip()
  75 + if title_suffix
  76 + else pick_lang_text(src.get("title"))
  77 + ),
70 brief=pick_lang_text(src.get("brief")) if need_brief else "", 78 brief=pick_lang_text(src.get("brief")) if need_brief else "",
71 vendor=pick_lang_text(src.get("vendor")) if need_vendor else "", 79 vendor=pick_lang_text(src.get("vendor")) if need_vendor else "",
72 description=pick_lang_text(src.get("description")) if need_description else "", 80 description=pick_lang_text(src.get("description")) if need_description else "",
search/searcher.py
@@ -10,12 +10,13 @@ import time, json @@ -10,12 +10,13 @@ import time, json
10 import logging 10 import logging
11 import hashlib 11 import hashlib
12 from string import Formatter 12 from string import Formatter
13 -import numpy as np  
14 13
15 from utils.es_client import ESClient 14 from utils.es_client import ESClient
16 from query import QueryParser, ParsedQuery 15 from query import QueryParser, ParsedQuery
  16 +from query.style_intent import StyleIntentRegistry
17 from embeddings.image_encoder import CLIPImageEncoder 17 from embeddings.image_encoder import CLIPImageEncoder
18 from .es_query_builder import ESQueryBuilder 18 from .es_query_builder import ESQueryBuilder
  19 +from .sku_intent_selector import SkuSelectionDecision, StyleSkuSelector
19 from config import SearchConfig 20 from config import SearchConfig
20 from config.tenant_config_loader import get_tenant_config_loader 21 from config.tenant_config_loader import get_tenant_config_loader
21 from context.request_context import RequestContext, RequestContextStage 22 from context.request_context import RequestContext, RequestContextStage
@@ -115,6 +116,12 @@ class Searcher: @@ -115,6 +116,12 @@ class Searcher:
115 else: 116 else:
116 self.image_encoder = image_encoder 117 self.image_encoder = image_encoder
117 self.source_fields = config.query_config.source_fields 118 self.source_fields = config.query_config.source_fields
  119 + self.style_intent_registry = StyleIntentRegistry.from_query_config(self.config.query_config)
  120 + self.style_sku_selector = StyleSkuSelector(
  121 + self.style_intent_registry,
  122 + text_encoder_getter=lambda: getattr(self.query_parser, "text_encoder", None),
  123 + tokenizer_getter=lambda: getattr(self.query_parser, "_tokenizer", None),
  124 + )
118 125
119 # Query builder - simplified single-layer architecture 126 # Query builder - simplified single-layer architecture
120 self.query_builder = ESQueryBuilder( 127 self.query_builder = ESQueryBuilder(
@@ -155,7 +162,11 @@ class Searcher: @@ -155,7 +162,11 @@ class Searcher:
155 return 162 return
156 es_query["_source"] = {"includes": self.source_fields} 163 es_query["_source"] = {"includes": self.source_fields}
157 164
158 - def _resolve_rerank_source_filter(self, doc_template: str) -> Dict[str, Any]: 165 + def _resolve_rerank_source_filter(
  166 + self,
  167 + doc_template: str,
  168 + parsed_query: Optional[ParsedQuery] = None,
  169 + ) -> Dict[str, Any]:
159 """ 170 """
160 Build a lightweight _source filter for rerank prefetch. 171 Build a lightweight _source filter for rerank prefetch.
161 172
@@ -182,6 +193,16 @@ class Searcher: @@ -182,6 +193,16 @@ class Searcher:
182 if not includes: 193 if not includes:
183 includes.add("title") 194 includes.add("title")
184 195
  196 + if self._has_style_intent(parsed_query):
  197 + includes.update(
  198 + {
  199 + "skus",
  200 + "option1_name",
  201 + "option2_name",
  202 + "option3_name",
  203 + }
  204 + )
  205 +
185 return {"includes": sorted(includes)} 206 return {"includes": sorted(includes)}
186 207
187 def _fetch_hits_by_ids( 208 def _fetch_hits_by_ids(
@@ -225,256 +246,23 @@ class Searcher: @@ -225,256 +246,23 @@ class Searcher:
225 return hits_by_id, int(resp.get("took", 0) or 0) 246 return hits_by_id, int(resp.get("took", 0) or 0)
226 247
227 @staticmethod 248 @staticmethod
228 - def _normalize_sku_match_text(value: Optional[str]) -> str:  
229 - """Normalize free text for lightweight SKU option matching."""  
230 - if value is None:  
231 - return ""  
232 - return " ".join(str(value).strip().casefold().split())  
233 -  
234 - @staticmethod  
235 - def _sku_option1_embedding_key(  
236 - sku: Dict[str, Any],  
237 - spu_option1_name: Optional[Any] = None,  
238 - ) -> Optional[str]:  
239 - """  
240 - Text sent to the embedding service for option1 must be "name:value"  
241 - (option name from SKU row or SPU-level option1_name).  
242 - """  
243 - value_raw = sku.get("option1_value")  
244 - if value_raw is None:  
245 - return None  
246 - value = str(value_raw).strip()  
247 - if not value:  
248 - return None  
249 - name = sku.get("option1_name")  
250 - if name is None or not str(name).strip():  
251 - name = spu_option1_name  
252 - name_str = str(name).strip() if name is not None and str(name).strip() else ""  
253 - if name_str:  
254 - value = f"{name_str}:{value}"  
255 - return value.casefold()  
256 -  
257 - def _build_sku_query_texts(self, parsed_query: ParsedQuery) -> List[str]:  
258 - """Collect original and translated query texts for SKU option matching."""  
259 - candidates: List[str] = []  
260 - for text in (  
261 - getattr(parsed_query, "original_query", None),  
262 - getattr(parsed_query, "query_normalized", None),  
263 - getattr(parsed_query, "rewritten_query", None),  
264 - ):  
265 - normalized = self._normalize_sku_match_text(text)  
266 - if normalized:  
267 - candidates.append(normalized)  
268 -  
269 - translations = getattr(parsed_query, "translations", {}) or {}  
270 - if isinstance(translations, dict):  
271 - for text in translations.values():  
272 - normalized = self._normalize_sku_match_text(text)  
273 - if normalized:  
274 - candidates.append(normalized)  
275 -  
276 - deduped: List[str] = []  
277 - seen = set()  
278 - for text in candidates:  
279 - if text in seen:  
280 - continue  
281 - seen.add(text)  
282 - deduped.append(text)  
283 - return deduped  
284 -  
285 - def _find_query_matching_sku_index(  
286 - self,  
287 - skus: List[Dict[str, Any]],  
288 - query_texts: List[str],  
289 - spu_option1_name: Optional[Any] = None,  
290 - ) -> Optional[int]:  
291 - """Return the first SKU whose option1_value (or name:value) appears in query texts."""  
292 - if not skus or not query_texts:  
293 - return None  
294 -  
295 - for index, sku in enumerate(skus):  
296 - option1_value = self._normalize_sku_match_text(sku.get("option1_value"))  
297 - if not option1_value:  
298 - continue  
299 - if any(option1_value in query_text for query_text in query_texts):  
300 - return index  
301 - embed_key = self._sku_option1_embedding_key(sku, spu_option1_name)  
302 - if embed_key and embed_key != option1_value:  
303 - composite_norm = self._normalize_sku_match_text(embed_key.replace(":", " "))  
304 - if any(composite_norm in query_text for query_text in query_texts):  
305 - return index  
306 - if any(embed_key.casefold() in query_text for query_text in query_texts):  
307 - return index  
308 - return None  
309 -  
310 - def _encode_query_vector_for_sku_matching(  
311 - self,  
312 - parsed_query: ParsedQuery,  
313 - context: Optional[RequestContext] = None,  
314 - ) -> Optional[np.ndarray]:  
315 - """Best-effort fallback query embedding for final-page SKU matching."""  
316 - query_text = (  
317 - getattr(parsed_query, "rewritten_query", None)  
318 - or getattr(parsed_query, "query_normalized", None)  
319 - or getattr(parsed_query, "original_query", None)  
320 - )  
321 - if not query_text:  
322 - return None  
323 -  
324 - text_encoder = getattr(self.query_parser, "text_encoder", None)  
325 - if text_encoder is None:  
326 - return None  
327 -  
328 - try:  
329 - vectors = text_encoder.encode([query_text], priority=1)  
330 - except Exception as exc:  
331 - logger.warning("Failed to encode query vector for SKU matching: %s", exc, exc_info=True)  
332 - if context is not None:  
333 - context.add_warning(f"SKU query embedding failed: {exc}")  
334 - return None  
335 -  
336 - if vectors is None or len(vectors) == 0:  
337 - return None  
338 -  
339 - vector = vectors[0]  
340 - if vector is None:  
341 - return None  
342 - return np.asarray(vector, dtype=np.float32)  
343 -  
344 - def _select_sku_by_embedding(  
345 - self,  
346 - skus: List[Dict[str, Any]],  
347 - option1_vectors: Dict[str, np.ndarray],  
348 - query_vector: np.ndarray,  
349 - spu_option1_name: Optional[Any] = None,  
350 - ) -> Tuple[Optional[int], Optional[float]]:  
351 - """Select the SKU whose option1 embedding key (name:value) is most similar to the query."""  
352 - best_index: Optional[int] = None  
353 - best_score: Optional[float] = None  
354 -  
355 - for index, sku in enumerate(skus):  
356 - embed_key = self._sku_option1_embedding_key(sku, spu_option1_name)  
357 - if not embed_key:  
358 - continue  
359 - option_vector = option1_vectors.get(embed_key)  
360 - if option_vector is None:  
361 - continue  
362 - score = float(np.inner(query_vector, option_vector))  
363 - if best_score is None or score > best_score:  
364 - best_index = index  
365 - best_score = score  
366 -  
367 - return best_index, best_score  
368 -  
369 - @staticmethod  
370 - def _promote_matching_sku(source: Dict[str, Any], match_index: int) -> Optional[Dict[str, Any]]:  
371 - """Move the matched SKU to the front and swap the SPU image."""  
372 - skus = source.get("skus")  
373 - if not isinstance(skus, list) or match_index < 0 or match_index >= len(skus):  
374 - return None  
375 -  
376 - matched_sku = skus.pop(match_index)  
377 - skus.insert(0, matched_sku) 249 + def _has_style_intent(parsed_query: Optional[ParsedQuery]) -> bool:
  250 + profile = getattr(parsed_query, "style_intent_profile", None)
  251 + return bool(getattr(profile, "is_active", False))
378 252
379 - image_src = matched_sku.get("image_src") or matched_sku.get("imageSrc")  
380 - if image_src:  
381 - source["image_url"] = image_src  
382 - return matched_sku  
383 -  
384 - def _apply_sku_sorting_for_page_hits( 253 + def _apply_style_intent_to_hits(
385 self, 254 self,
386 es_hits: List[Dict[str, Any]], 255 es_hits: List[Dict[str, Any]],
387 parsed_query: ParsedQuery, 256 parsed_query: ParsedQuery,
388 context: Optional[RequestContext] = None, 257 context: Optional[RequestContext] = None,
389 - ) -> None:  
390 - """Sort each page hit's SKUs so the best-matching SKU is first."""  
391 - if not es_hits:  
392 - return  
393 -  
394 - query_texts = self._build_sku_query_texts(parsed_query)  
395 - unmatched_hits: List[Dict[str, Any]] = []  
396 - option1_values_to_encode: List[str] = []  
397 - seen_option1_values = set()  
398 - text_matched = 0  
399 - embedding_matched = 0  
400 -  
401 - for hit in es_hits:  
402 - source = hit.get("_source")  
403 - if not isinstance(source, dict):  
404 - continue  
405 - skus = source.get("skus")  
406 - if not isinstance(skus, list) or not skus:  
407 - continue  
408 -  
409 - spu_option1_name = source.get("option1_name")  
410 - match_index = self._find_query_matching_sku_index(  
411 - skus, query_texts, spu_option1_name=spu_option1_name  
412 - )  
413 - if match_index is not None:  
414 - self._promote_matching_sku(source, match_index)  
415 - text_matched += 1  
416 - continue  
417 -  
418 - unmatched_hits.append(hit)  
419 - for sku in skus:  
420 - embed_key = self._sku_option1_embedding_key(sku, spu_option1_name)  
421 - if not embed_key or embed_key in seen_option1_values:  
422 - continue  
423 - seen_option1_values.add(embed_key)  
424 - option1_values_to_encode.append(embed_key)  
425 -  
426 - if not unmatched_hits or not option1_values_to_encode:  
427 - return  
428 -  
429 - query_vector = getattr(parsed_query, "query_vector", None)  
430 - if query_vector is None:  
431 - query_vector = self._encode_query_vector_for_sku_matching(parsed_query, context=context)  
432 - if query_vector is None:  
433 - return  
434 -  
435 - text_encoder = getattr(self.query_parser, "text_encoder", None)  
436 - if text_encoder is None:  
437 - return  
438 -  
439 - try:  
440 - encoded_option_vectors = text_encoder.encode(option1_values_to_encode, priority=1)  
441 - except Exception as exc:  
442 - logger.warning("Failed to encode SKU option1 values for final-page sorting: %s", exc, exc_info=True)  
443 - if context is not None:  
444 - context.add_warning(f"SKU option embedding failed: {exc}")  
445 - return  
446 -  
447 - option1_vectors: Dict[str, np.ndarray] = {}  
448 - for option1_value, vector in zip(option1_values_to_encode, encoded_option_vectors):  
449 - if vector is None:  
450 - continue  
451 - option1_vectors[option1_value] = np.asarray(vector, dtype=np.float32)  
452 -  
453 - query_vector_array = np.asarray(query_vector, dtype=np.float32)  
454 - for hit in unmatched_hits:  
455 - source = hit.get("_source")  
456 - if not isinstance(source, dict):  
457 - continue  
458 - skus = source.get("skus")  
459 - if not isinstance(skus, list) or not skus:  
460 - continue  
461 - match_index, _ = self._select_sku_by_embedding(  
462 - skus,  
463 - option1_vectors,  
464 - query_vector_array,  
465 - spu_option1_name=source.get("option1_name"),  
466 - )  
467 - if match_index is None:  
468 - continue  
469 - self._promote_matching_sku(source, match_index)  
470 - embedding_matched += 1  
471 -  
472 - if text_matched or embedding_matched:  
473 - logger.info(  
474 - "Final-page SKU sorting completed | text_matched=%s | embedding_matched=%s",  
475 - text_matched,  
476 - embedding_matched, 258 + ) -> Dict[str, SkuSelectionDecision]:
  259 + decisions = self.style_sku_selector.prepare_hits(es_hits, parsed_query)
  260 + if decisions and context is not None:
  261 + context.store_intermediate_result(
  262 + "style_intent_sku_decisions",
  263 + {doc_id: decision.to_dict() for doc_id, decision in decisions.items()},
477 ) 264 )
  265 + return decisions
478 266
479 def search( 267 def search(
480 self, 268 self,
@@ -583,7 +371,8 @@ class Searcher: @@ -583,7 +371,8 @@ class Searcher:
583 context.metadata['feature_flags'] = { 371 context.metadata['feature_flags'] = {
584 'translation_enabled': enable_translation, 372 'translation_enabled': enable_translation,
585 'embedding_enabled': enable_embedding, 373 'embedding_enabled': enable_embedding,
586 - 'rerank_enabled': do_rerank 374 + 'rerank_enabled': do_rerank,
  375 + 'style_intent_enabled': bool(self.style_intent_registry.enabled),
587 } 376 }
588 377
589 # Step 1: Parse query 378 # Step 1: Parse query
@@ -607,6 +396,7 @@ class Searcher: @@ -607,6 +396,7 @@ class Searcher:
607 domain="default", 396 domain="default",
608 is_simple_query=True 397 is_simple_query=True
609 ) 398 )
  399 + context.metadata["feature_flags"]["style_intent_active"] = self._has_style_intent(parsed_query)
610 400
611 context.logger.info( 401 context.logger.info(
612 f"查询解析完成 | 原查询: '{parsed_query.original_query}' | " 402 f"查询解析完成 | 原查询: '{parsed_query.original_query}' | "
@@ -667,7 +457,10 @@ class Searcher: @@ -667,7 +457,10 @@ class Searcher:
667 es_query_for_fetch = es_query 457 es_query_for_fetch = es_query
668 rerank_prefetch_source = None 458 rerank_prefetch_source = None
669 if in_rerank_window: 459 if in_rerank_window:
670 - rerank_prefetch_source = self._resolve_rerank_source_filter(effective_doc_template) 460 + rerank_prefetch_source = self._resolve_rerank_source_filter(
  461 + effective_doc_template,
  462 + parsed_query=parsed_query,
  463 + )
671 es_query_for_fetch = dict(es_query) 464 es_query_for_fetch = dict(es_query)
672 es_query_for_fetch["_source"] = rerank_prefetch_source 465 es_query_for_fetch["_source"] = rerank_prefetch_source
673 466
@@ -751,6 +544,20 @@ class Searcher: @@ -751,6 +544,20 @@ class Searcher:
751 finally: 544 finally:
752 context.end_stage(RequestContextStage.ELASTICSEARCH_SEARCH_PRIMARY) 545 context.end_stage(RequestContextStage.ELASTICSEARCH_SEARCH_PRIMARY)
753 546
  547 + style_intent_decisions: Dict[str, SkuSelectionDecision] = {}
  548 + if self._has_style_intent(parsed_query) and in_rerank_window:
  549 + style_intent_decisions = self._apply_style_intent_to_hits(
  550 + es_response.get("hits", {}).get("hits") or [],
  551 + parsed_query,
  552 + context=context,
  553 + )
  554 + if style_intent_decisions:
  555 + context.logger.info(
  556 + "款式意图 SKU 预筛选完成 | hits=%s",
  557 + len(style_intent_decisions),
  558 + extra={'reqid': context.reqid, 'uid': context.uid}
  559 + )
  560 +
754 # Optional Step 4.5: AI reranking(仅当请求范围在重排窗口内时执行) 561 # Optional Step 4.5: AI reranking(仅当请求范围在重排窗口内时执行)
755 if do_rerank and in_rerank_window: 562 if do_rerank and in_rerank_window:
756 context.start_stage(RequestContextStage.RERANKING) 563 context.start_stage(RequestContextStage.RERANKING)
@@ -841,6 +648,11 @@ class Searcher: @@ -841,6 +648,11 @@ class Searcher:
841 if "_source" in detail_hit: 648 if "_source" in detail_hit:
842 hit["_source"] = detail_hit.get("_source") or {} 649 hit["_source"] = detail_hit.get("_source") or {}
843 filled += 1 650 filled += 1
  651 + if style_intent_decisions:
  652 + self.style_sku_selector.apply_precomputed_decisions(
  653 + sliced,
  654 + style_intent_decisions,
  655 + )
844 if fill_took: 656 if fill_took:
845 es_response["took"] = int((es_response.get("took", 0) or 0) + fill_took) 657 es_response["took"] = int((es_response.get("took", 0) or 0) + fill_took)
846 context.logger.info( 658 context.logger.info(
@@ -883,7 +695,18 @@ class Searcher: @@ -883,7 +695,18 @@ class Searcher:
883 continue 695 continue
884 rerank_debug_by_doc[str(doc_id)] = item 696 rerank_debug_by_doc[str(doc_id)] = item
885 697
886 - self._apply_sku_sorting_for_page_hits(es_hits, parsed_query, context=context) 698 + if self._has_style_intent(parsed_query):
  699 + if in_rerank_window and style_intent_decisions:
  700 + self.style_sku_selector.apply_precomputed_decisions(
  701 + es_hits,
  702 + style_intent_decisions,
  703 + )
  704 + elif not in_rerank_window:
  705 + style_intent_decisions = self._apply_style_intent_to_hits(
  706 + es_hits,
  707 + parsed_query,
  708 + context=context,
  709 + )
887 710
888 # Format results using ResultFormatter 711 # Format results using ResultFormatter
889 formatted_results = ResultFormatter.format_search_results( 712 formatted_results = ResultFormatter.format_search_results(
@@ -902,6 +725,11 @@ class Searcher: @@ -902,6 +725,11 @@ class Searcher:
902 rerank_debug = None 725 rerank_debug = None
903 if doc_id is not None: 726 if doc_id is not None:
904 rerank_debug = rerank_debug_by_doc.get(str(doc_id)) 727 rerank_debug = rerank_debug_by_doc.get(str(doc_id))
  728 + style_intent_debug = None
  729 + if doc_id is not None and style_intent_decisions:
  730 + decision = style_intent_decisions.get(str(doc_id))
  731 + if decision is not None:
  732 + style_intent_debug = decision.to_dict()
905 733
906 raw_score = hit.get("_score") 734 raw_score = hit.get("_score")
907 try: 735 try:
@@ -940,6 +768,9 @@ class Searcher: @@ -940,6 +768,9 @@ class Searcher:
940 debug_entry["fused_score"] = rerank_debug.get("fused_score") 768 debug_entry["fused_score"] = rerank_debug.get("fused_score")
941 debug_entry["matched_queries"] = rerank_debug.get("matched_queries") 769 debug_entry["matched_queries"] = rerank_debug.get("matched_queries")
942 770
  771 + if style_intent_debug:
  772 + debug_entry["style_intent_sku"] = style_intent_debug
  773 +
943 per_result_debug.append(debug_entry) 774 per_result_debug.append(debug_entry)
944 775
945 # Format facets 776 # Format facets
@@ -987,7 +818,8 @@ class Searcher: @@ -987,7 +818,8 @@ class Searcher:
987 "translations": context.query_analysis.translations, 818 "translations": context.query_analysis.translations,
988 "has_vector": context.query_analysis.query_vector is not None, 819 "has_vector": context.query_analysis.query_vector is not None,
989 "is_simple_query": context.query_analysis.is_simple_query, 820 "is_simple_query": context.query_analysis.is_simple_query,
990 - "domain": context.query_analysis.domain 821 + "domain": context.query_analysis.domain,
  822 + "style_intent_profile": context.get_intermediate_result("style_intent_profile"),
991 }, 823 },
992 "es_query": context.get_intermediate_result('es_query', {}), 824 "es_query": context.get_intermediate_result('es_query', {}),
993 "es_response": { 825 "es_response": {
search/sku_intent_selector.py 0 → 100644
@@ -0,0 +1,405 @@ @@ -0,0 +1,405 @@
  1 +"""
  2 +SKU selection for style-intent-aware search results.
  3 +"""
  4 +
  5 +from __future__ import annotations
  6 +
  7 +from dataclasses import dataclass, field
  8 +from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Tuple
  9 +
  10 +import numpy as np
  11 +
  12 +from query.style_intent import StyleIntentProfile, StyleIntentRegistry
  13 +from query.tokenization import normalize_query_text
  14 +
  15 +
  16 +@dataclass(frozen=True)
  17 +class SkuSelectionDecision:
  18 + selected_sku_id: Optional[str]
  19 + rerank_suffix: str
  20 + selected_text: str
  21 + matched_stage: str
  22 + similarity_score: Optional[float] = None
  23 + resolved_dimensions: Dict[str, Optional[str]] = field(default_factory=dict)
  24 +
  25 + def to_dict(self) -> Dict[str, Any]:
  26 + return {
  27 + "selected_sku_id": self.selected_sku_id,
  28 + "rerank_suffix": self.rerank_suffix,
  29 + "selected_text": self.selected_text,
  30 + "matched_stage": self.matched_stage,
  31 + "similarity_score": self.similarity_score,
  32 + "resolved_dimensions": dict(self.resolved_dimensions),
  33 + }
  34 +
  35 +
  36 +@dataclass
  37 +class _SkuCandidate:
  38 + index: int
  39 + sku_id: str
  40 + sku: Dict[str, Any]
  41 + selection_text: str
  42 + intent_texts: Dict[str, str]
  43 +
  44 +
  45 +class StyleSkuSelector:
  46 + """Selects the best SKU for an SPU based on detected style intent."""
  47 +
  48 + def __init__(
  49 + self,
  50 + registry: StyleIntentRegistry,
  51 + *,
  52 + text_encoder_getter: Optional[Callable[[], Any]] = None,
  53 + tokenizer_getter: Optional[Callable[[], Any]] = None,
  54 + ) -> None:
  55 + self.registry = registry
  56 + self._text_encoder_getter = text_encoder_getter
  57 + self._tokenizer_getter = tokenizer_getter
  58 +
  59 + def prepare_hits(
  60 + self,
  61 + es_hits: List[Dict[str, Any]],
  62 + parsed_query: Any,
  63 + ) -> Dict[str, SkuSelectionDecision]:
  64 + decisions: Dict[str, SkuSelectionDecision] = {}
  65 + style_profile = getattr(parsed_query, "style_intent_profile", None)
  66 + if not isinstance(style_profile, StyleIntentProfile) or not style_profile.is_active:
  67 + return decisions
  68 +
  69 + query_texts = self._build_query_texts(parsed_query, style_profile)
  70 + query_vector = self._get_query_vector(parsed_query)
  71 + tokenizer = self._get_tokenizer()
  72 +
  73 + for hit in es_hits:
  74 + source = hit.get("_source")
  75 + if not isinstance(source, dict):
  76 + continue
  77 +
  78 + decision = self._select_for_source(
  79 + source,
  80 + style_profile=style_profile,
  81 + query_texts=query_texts,
  82 + query_vector=query_vector,
  83 + tokenizer=tokenizer,
  84 + )
  85 + if decision is None:
  86 + continue
  87 +
  88 + self._apply_decision_to_source(source, decision)
  89 + if decision.rerank_suffix:
  90 + hit["_style_rerank_suffix"] = decision.rerank_suffix
  91 +
  92 + doc_id = hit.get("_id")
  93 + if doc_id is not None:
  94 + decisions[str(doc_id)] = decision
  95 +
  96 + return decisions
  97 +
  98 + def apply_precomputed_decisions(
  99 + self,
  100 + es_hits: List[Dict[str, Any]],
  101 + decisions: Dict[str, SkuSelectionDecision],
  102 + ) -> None:
  103 + if not es_hits or not decisions:
  104 + return
  105 +
  106 + for hit in es_hits:
  107 + doc_id = hit.get("_id")
  108 + if doc_id is None:
  109 + continue
  110 + decision = decisions.get(str(doc_id))
  111 + if decision is None:
  112 + continue
  113 + source = hit.get("_source")
  114 + if not isinstance(source, dict):
  115 + continue
  116 + self._apply_decision_to_source(source, decision)
  117 + if decision.rerank_suffix:
  118 + hit["_style_rerank_suffix"] = decision.rerank_suffix
  119 +
  120 + def _build_query_texts(
  121 + self,
  122 + parsed_query: Any,
  123 + style_profile: StyleIntentProfile,
  124 + ) -> List[str]:
  125 + texts = [variant.normalized_text for variant in style_profile.query_variants if variant.normalized_text]
  126 + if texts:
  127 + return list(dict.fromkeys(texts))
  128 +
  129 + fallbacks: List[str] = []
  130 + for value in (
  131 + getattr(parsed_query, "original_query", None),
  132 + getattr(parsed_query, "query_normalized", None),
  133 + getattr(parsed_query, "rewritten_query", None),
  134 + ):
  135 + normalized = normalize_query_text(value)
  136 + if normalized:
  137 + fallbacks.append(normalized)
  138 + translations = getattr(parsed_query, "translations", {}) or {}
  139 + if isinstance(translations, dict):
  140 + for value in translations.values():
  141 + normalized = normalize_query_text(value)
  142 + if normalized:
  143 + fallbacks.append(normalized)
  144 + return list(dict.fromkeys(fallbacks))
  145 +
  146 + def _get_query_vector(self, parsed_query: Any) -> Optional[np.ndarray]:
  147 + query_vector = getattr(parsed_query, "query_vector", None)
  148 + if query_vector is not None:
  149 + return np.asarray(query_vector, dtype=np.float32)
  150 +
  151 + text_encoder = self._get_text_encoder()
  152 + if text_encoder is None:
  153 + return None
  154 +
  155 + query_text = (
  156 + getattr(parsed_query, "rewritten_query", None)
  157 + or getattr(parsed_query, "query_normalized", None)
  158 + or getattr(parsed_query, "original_query", None)
  159 + )
  160 + if not query_text:
  161 + return None
  162 +
  163 + vectors = text_encoder.encode([query_text], priority=1)
  164 + if vectors is None or len(vectors) == 0 or vectors[0] is None:
  165 + return None
  166 + return np.asarray(vectors[0], dtype=np.float32)
  167 +
  168 + def _get_text_encoder(self) -> Any:
  169 + if self._text_encoder_getter is None:
  170 + return None
  171 + return self._text_encoder_getter()
  172 +
  173 + def _get_tokenizer(self) -> Any:
  174 + if self._tokenizer_getter is None:
  175 + return None
  176 + return self._tokenizer_getter()
  177 +
  178 + @staticmethod
  179 + def _fallback_sku_text(sku: Dict[str, Any]) -> str:
  180 + parts = []
  181 + for field_name in ("option1_value", "option2_value", "option3_value"):
  182 + value = str(sku.get(field_name) or "").strip()
  183 + if value:
  184 + parts.append(value)
  185 + return " ".join(parts)
  186 +
  187 + def _resolve_dimensions(
  188 + self,
  189 + source: Dict[str, Any],
  190 + style_profile: StyleIntentProfile,
  191 + ) -> Dict[str, Optional[str]]:
  192 + option_names = {
  193 + "option1_value": normalize_query_text(source.get("option1_name")),
  194 + "option2_value": normalize_query_text(source.get("option2_name")),
  195 + "option3_value": normalize_query_text(source.get("option3_name")),
  196 + }
  197 + resolved: Dict[str, Optional[str]] = {}
  198 + for intent in style_profile.intents:
  199 + if intent.intent_type in resolved:
  200 + continue
  201 + aliases = set(intent.dimension_aliases or self.registry.get_dimension_aliases(intent.intent_type))
  202 + matched_field = None
  203 + for field_name, option_name in option_names.items():
  204 + if option_name and option_name in aliases:
  205 + matched_field = field_name
  206 + break
  207 + resolved[intent.intent_type] = matched_field
  208 + return resolved
  209 +
  210 + def _build_candidates(
  211 + self,
  212 + skus: List[Dict[str, Any]],
  213 + resolved_dimensions: Dict[str, Optional[str]],
  214 + ) -> List[_SkuCandidate]:
  215 + candidates: List[_SkuCandidate] = []
  216 + for index, sku in enumerate(skus):
  217 + fallback_text = self._fallback_sku_text(sku)
  218 + intent_texts: Dict[str, str] = {}
  219 + for intent_type, field_name in resolved_dimensions.items():
  220 + if field_name:
  221 + value = str(sku.get(field_name) or "").strip()
  222 + intent_texts[intent_type] = value or fallback_text
  223 + else:
  224 + intent_texts[intent_type] = fallback_text
  225 +
  226 + selection_parts: List[str] = []
  227 + seen = set()
  228 + for value in intent_texts.values():
  229 + normalized = normalize_query_text(value)
  230 + if not normalized or normalized in seen:
  231 + continue
  232 + seen.add(normalized)
  233 + selection_parts.append(str(value).strip())
  234 +
  235 + selection_text = " ".join(selection_parts).strip() or fallback_text
  236 + candidates.append(
  237 + _SkuCandidate(
  238 + index=index,
  239 + sku_id=str(sku.get("sku_id") or ""),
  240 + sku=sku,
  241 + selection_text=selection_text,
  242 + intent_texts=intent_texts,
  243 + )
  244 + )
  245 + return candidates
  246 +
  247 + @staticmethod
  248 + def _is_direct_match(
  249 + candidate: _SkuCandidate,
  250 + query_texts: Sequence[str],
  251 + ) -> bool:
  252 + if not candidate.intent_texts or not query_texts:
  253 + return False
  254 + for value in candidate.intent_texts.values():
  255 + normalized_value = normalize_query_text(value)
  256 + if not normalized_value:
  257 + return False
  258 + if not any(normalized_value in query_text for query_text in query_texts):
  259 + return False
  260 + return True
  261 +
  262 + def _is_generalized_match(
  263 + self,
  264 + candidate: _SkuCandidate,
  265 + style_profile: StyleIntentProfile,
  266 + tokenizer: Any,
  267 + ) -> bool:
  268 + if not candidate.intent_texts:
  269 + return False
  270 +
  271 + for intent_type, value in candidate.intent_texts.items():
  272 + definition = self.registry.get_definition(intent_type)
  273 + if definition is None:
  274 + return False
  275 + matched_canonicals = definition.match_text(value, tokenizer=tokenizer)
  276 + if not matched_canonicals.intersection(style_profile.get_canonical_values(intent_type)):
  277 + return False
  278 + return True
  279 +
  280 + def _select_by_embedding(
  281 + self,
  282 + candidates: Sequence[_SkuCandidate],
  283 + query_vector: Optional[np.ndarray],
  284 + ) -> Tuple[Optional[_SkuCandidate], Optional[float]]:
  285 + if not candidates:
  286 + return None, None
  287 + text_encoder = self._get_text_encoder()
  288 + if query_vector is None or text_encoder is None:
  289 + return candidates[0], None
  290 +
  291 + unique_texts = list(
  292 + dict.fromkeys(
  293 + normalize_query_text(candidate.selection_text)
  294 + for candidate in candidates
  295 + if normalize_query_text(candidate.selection_text)
  296 + )
  297 + )
  298 + if not unique_texts:
  299 + return candidates[0], None
  300 +
  301 + vectors = text_encoder.encode(unique_texts, priority=1)
  302 + vector_map: Dict[str, np.ndarray] = {}
  303 + for key, vector in zip(unique_texts, vectors):
  304 + if vector is None:
  305 + continue
  306 + vector_map[key] = np.asarray(vector, dtype=np.float32)
  307 +
  308 + best_candidate: Optional[_SkuCandidate] = None
  309 + best_score: Optional[float] = None
  310 + query_vector_array = np.asarray(query_vector, dtype=np.float32)
  311 + for candidate in candidates:
  312 + normalized_text = normalize_query_text(candidate.selection_text)
  313 + candidate_vector = vector_map.get(normalized_text)
  314 + if candidate_vector is None:
  315 + continue
  316 + score = float(np.inner(query_vector_array, candidate_vector))
  317 + if best_score is None or score > best_score:
  318 + best_candidate = candidate
  319 + best_score = score
  320 +
  321 + return best_candidate or candidates[0], best_score
  322 +
  323 + def _select_for_source(
  324 + self,
  325 + source: Dict[str, Any],
  326 + *,
  327 + style_profile: StyleIntentProfile,
  328 + query_texts: Sequence[str],
  329 + query_vector: Optional[np.ndarray],
  330 + tokenizer: Any,
  331 + ) -> Optional[SkuSelectionDecision]:
  332 + skus = source.get("skus")
  333 + if not isinstance(skus, list) or not skus:
  334 + return None
  335 +
  336 + resolved_dimensions = self._resolve_dimensions(source, style_profile)
  337 + candidates = self._build_candidates(skus, resolved_dimensions)
  338 + if not candidates:
  339 + return None
  340 +
  341 + direct_matches = [candidate for candidate in candidates if self._is_direct_match(candidate, query_texts)]
  342 + if len(direct_matches) == 1:
  343 + chosen = direct_matches[0]
  344 + return self._build_decision(chosen, resolved_dimensions, matched_stage="direct")
  345 +
  346 + generalized_matches: List[_SkuCandidate] = []
  347 + if not direct_matches:
  348 + generalized_matches = [
  349 + candidate
  350 + for candidate in candidates
  351 + if self._is_generalized_match(candidate, style_profile, tokenizer)
  352 + ]
  353 + if len(generalized_matches) == 1:
  354 + chosen = generalized_matches[0]
  355 + return self._build_decision(chosen, resolved_dimensions, matched_stage="generalized")
  356 +
  357 + embedding_pool = direct_matches or generalized_matches or candidates
  358 + chosen, similarity_score = self._select_by_embedding(embedding_pool, query_vector)
  359 + if chosen is None:
  360 + return None
  361 + stage = "embedding_from_matches" if direct_matches or generalized_matches else "embedding_from_all"
  362 + return self._build_decision(
  363 + chosen,
  364 + resolved_dimensions,
  365 + matched_stage=stage,
  366 + similarity_score=similarity_score,
  367 + )
  368 +
  369 + @staticmethod
  370 + def _build_decision(
  371 + candidate: _SkuCandidate,
  372 + resolved_dimensions: Dict[str, Optional[str]],
  373 + *,
  374 + matched_stage: str,
  375 + similarity_score: Optional[float] = None,
  376 + ) -> SkuSelectionDecision:
  377 + return SkuSelectionDecision(
  378 + selected_sku_id=candidate.sku_id or None,
  379 + rerank_suffix=str(candidate.selection_text or "").strip(),
  380 + selected_text=str(candidate.selection_text or "").strip(),
  381 + matched_stage=matched_stage,
  382 + similarity_score=similarity_score,
  383 + resolved_dimensions=dict(resolved_dimensions),
  384 + )
  385 +
  386 + @staticmethod
  387 + def _apply_decision_to_source(source: Dict[str, Any], decision: SkuSelectionDecision) -> None:
  388 + skus = source.get("skus")
  389 + if not isinstance(skus, list) or not skus or not decision.selected_sku_id:
  390 + return
  391 +
  392 + selected_index = None
  393 + for index, sku in enumerate(skus):
  394 + if str(sku.get("sku_id") or "") == decision.selected_sku_id:
  395 + selected_index = index
  396 + break
  397 + if selected_index is None:
  398 + return
  399 +
  400 + selected_sku = skus.pop(selected_index)
  401 + skus.insert(0, selected_sku)
  402 +
  403 + image_src = selected_sku.get("image_src") or selected_sku.get("imageSrc")
  404 + if image_src:
  405 + source["image_url"] = image_src
tests/test_search_rerank_window.py
@@ -18,6 +18,7 @@ from config import ( @@ -18,6 +18,7 @@ from config import (
18 SearchConfig, 18 SearchConfig,
19 ) 19 )
20 from context import create_request_context 20 from context import create_request_context
  21 +from query.style_intent import DetectedStyleIntent, StyleIntentProfile
21 from search.searcher import Searcher 22 from search.searcher import Searcher
22 23
23 24
@@ -30,6 +31,7 @@ class _FakeParsedQuery: @@ -30,6 +31,7 @@ class _FakeParsedQuery:
30 translations: Dict[str, str] = None 31 translations: Dict[str, str] = None
31 query_vector: Any = None 32 query_vector: Any = None
32 domain: str = "default" 33 domain: str = "default"
  34 + style_intent_profile: Any = None
33 35
34 def to_dict(self) -> Dict[str, Any]: 36 def to_dict(self) -> Dict[str, Any]:
35 return { 37 return {
@@ -39,9 +41,27 @@ class _FakeParsedQuery: @@ -39,9 +41,27 @@ class _FakeParsedQuery:
39 "detected_language": self.detected_language, 41 "detected_language": self.detected_language,
40 "translations": self.translations or {}, 42 "translations": self.translations or {},
41 "domain": self.domain, 43 "domain": self.domain,
  44 + "style_intent_profile": (
  45 + self.style_intent_profile.to_dict() if self.style_intent_profile is not None else None
  46 + ),
42 } 47 }
43 48
44 49
  50 +def _build_style_intent_profile(intent_type: str, canonical_value: str, *dimension_aliases: str) -> StyleIntentProfile:
  51 + aliases = dimension_aliases or (intent_type,)
  52 + return StyleIntentProfile(
  53 + intents=(
  54 + DetectedStyleIntent(
  55 + intent_type=intent_type,
  56 + canonical_value=canonical_value,
  57 + matched_term=canonical_value,
  58 + matched_query_text=canonical_value,
  59 + dimension_aliases=tuple(aliases),
  60 + ),
  61 + )
  62 + )
  63 +
  64 +
45 class _FakeQueryParser: 65 class _FakeQueryParser:
46 def parse( 66 def parse(
47 self, 67 self,
@@ -340,6 +360,57 @@ def test_searcher_rerank_prefetch_source_follows_doc_template(monkeypatch): @@ -340,6 +360,57 @@ def test_searcher_rerank_prefetch_source_follows_doc_template(monkeypatch):
340 assert es_client.calls[0]["body"]["_source"] == {"includes": ["brief", "title", "vendor"]} 360 assert es_client.calls[0]["body"]["_source"] == {"includes": ["brief", "title", "vendor"]}
341 361
342 362
  363 +def test_searcher_rerank_prefetch_source_includes_sku_fields_when_style_intent_active(monkeypatch):
  364 + es_client = _FakeESClient()
  365 + searcher = _build_searcher(_build_search_config(rerank_enabled=True), es_client)
  366 + context = create_request_context(reqid="t1c", uid="u1c")
  367 +
  368 + monkeypatch.setattr(
  369 + "search.searcher.get_tenant_config_loader",
  370 + lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en"]}),
  371 + )
  372 + monkeypatch.setattr(
  373 + "search.rerank_client.run_rerank",
  374 + lambda **kwargs: (kwargs["es_response"], None, []),
  375 + )
  376 +
  377 + class _IntentQueryParser:
  378 + text_encoder = None
  379 +
  380 + def parse(
  381 + self,
  382 + query: str,
  383 + tenant_id: str,
  384 + generate_vector: bool,
  385 + context: Any,
  386 + target_languages: Any = None,
  387 + ):
  388 + return _FakeParsedQuery(
  389 + original_query=query,
  390 + query_normalized=query,
  391 + rewritten_query=query,
  392 + translations={},
  393 + style_intent_profile=_build_style_intent_profile(
  394 + "color", "black", "color", "colors", "颜色"
  395 + ),
  396 + )
  397 +
  398 + searcher.query_parser = _IntentQueryParser()
  399 +
  400 + searcher.search(
  401 + query="black dress",
  402 + tenant_id="162",
  403 + from_=0,
  404 + size=5,
  405 + context=context,
  406 + enable_rerank=None,
  407 + )
  408 +
  409 + assert es_client.calls[0]["body"]["_source"] == {
  410 + "includes": ["option1_name", "option2_name", "option3_name", "skus", "title"]
  411 + }
  412 +
  413 +
343 def test_searcher_skips_rerank_when_request_explicitly_false(monkeypatch): 414 def test_searcher_skips_rerank_when_request_explicitly_false(monkeypatch):
344 es_client = _FakeESClient() 415 es_client = _FakeESClient()
345 searcher = _build_searcher(_build_search_config(rerank_enabled=True), es_client) 416 searcher = _build_searcher(_build_search_config(rerank_enabled=True), es_client)
@@ -434,6 +505,9 @@ def test_searcher_promotes_sku_when_option1_matches_translated_query(monkeypatch @@ -434,6 +505,9 @@ def test_searcher_promotes_sku_when_option1_matches_translated_query(monkeypatch
434 query_normalized=query, 505 query_normalized=query,
435 rewritten_query=query, 506 rewritten_query=query,
436 translations={"en": "black dress"}, 507 translations={"en": "black dress"},
  508 + style_intent_profile=_build_style_intent_profile(
  509 + "color", "black", "color", "colors", "颜色"
  510 + ),
437 ) 511 )
438 512
439 searcher.query_parser = _TranslatedQueryParser() 513 searcher.query_parser = _TranslatedQueryParser()
@@ -481,8 +555,8 @@ def test_searcher_promotes_sku_by_embedding_when_query_has_no_direct_option_matc @@ -481,8 +555,8 @@ def test_searcher_promotes_sku_by_embedding_when_query_has_no_direct_option_matc
481 encoder = _FakeTextEncoder( 555 encoder = _FakeTextEncoder(
482 { 556 {
483 "linen summer dress": [0.8, 0.2], 557 "linen summer dress": [0.8, 0.2],
484 - "color:red": [1.0, 0.0],  
485 - "color:blue": [0.0, 1.0], 558 + "red": [1.0, 0.0],
  559 + "blue": [0.0, 1.0],
486 } 560 }
487 ) 561 )
488 562
@@ -503,6 +577,9 @@ def test_searcher_promotes_sku_by_embedding_when_query_has_no_direct_option_matc @@ -503,6 +577,9 @@ def test_searcher_promotes_sku_by_embedding_when_query_has_no_direct_option_matc
503 rewritten_query=query, 577 rewritten_query=query,
504 translations={}, 578 translations={},
505 query_vector=np.array([0.0, 1.0], dtype=np.float32), 579 query_vector=np.array([0.0, 1.0], dtype=np.float32),
  580 + style_intent_profile=_build_style_intent_profile(
  581 + "color", "blue", "color", "colors", "颜色"
  582 + ),
506 ) 583 )
507 584
508 searcher.query_parser = _EmbeddingQueryParser() 585 searcher.query_parser = _EmbeddingQueryParser()
tests/test_style_intent.py 0 → 100644
@@ -0,0 +1,35 @@ @@ -0,0 +1,35 @@
  1 +from types import SimpleNamespace
  2 +
  3 +from config import QueryConfig
  4 +from query.style_intent import StyleIntentDetector, StyleIntentRegistry
  5 +
  6 +
  7 +def test_style_intent_detector_matches_original_and_translated_queries():
  8 + query_config = QueryConfig(
  9 + style_intent_terms={
  10 + "color": [["black", "黑色", "black"]],
  11 + "size": [["xl", "x-large", "加大码"]],
  12 + },
  13 + style_intent_dimension_aliases={
  14 + "color": ["color", "颜色"],
  15 + "size": ["size", "尺码"],
  16 + },
  17 + )
  18 + detector = StyleIntentDetector(
  19 + StyleIntentRegistry.from_query_config(query_config),
  20 + tokenizer=lambda text: text.split(),
  21 + )
  22 +
  23 + parsed_query = SimpleNamespace(
  24 + original_query="黑色 连衣裙",
  25 + query_normalized="黑色 连衣裙",
  26 + rewritten_query="黑色 连衣裙",
  27 + translations={"en": "black dress xl"},
  28 + )
  29 +
  30 + profile = detector.detect(parsed_query)
  31 +
  32 + assert profile.is_active is True
  33 + assert profile.get_canonical_values("color") == {"black"}
  34 + assert profile.get_canonical_values("size") == {"xl"}
  35 + assert len(profile.query_variants) == 2