Commit cda1cd6231ec713689f779d3a0f464b582f47110

Authored by tangwang
1 parent dad3c867

意图分析&应用 baseline

config/config.yaml
... ... @@ -17,9 +17,9 @@ runtime:
17 17 embedding_port: 6005
18 18 embedding_text_port: 6005
19 19 embedding_image_port: 6008
20   - translator_host: "127.0.0.1"
  20 + translator_host: "0.0.0.0"
21 21 translator_port: 6006
22   - reranker_host: "127.0.0.1"
  22 + reranker_host: "0.0.0.0"
23 23 reranker_port: 6007
24 24  
25 25 # 基础设施连接(敏感项优先读环境变量:ES_*、REDIS_*、DB_*、DASHSCOPE_API_KEY、DEEPL_AUTH_KEY)
... ... @@ -116,6 +116,14 @@ query_config:
116 116 translation_embedding_wait_budget_ms_source_in_index: 500 # 80
117 117 translation_embedding_wait_budget_ms_source_not_in_index: 500 #200
118 118  
  119 + style_intent:
  120 + enabled: true
  121 + color_dictionary_path: "config/dictionaries/style_intent_color.csv"
  122 + size_dictionary_path: "config/dictionaries/style_intent_size.csv"
  123 + dimension_aliases:
  124 + color: ["color", "colors", "colour", "colours", "颜色", "色", "色系"]
  125 + size: ["size", "sizes", "sizing", "尺码", "尺寸", "码数", "号码", "码"]
  126 +
119 127 # 动态多语言检索字段配置
120 128 # multilingual_fields 会被拼成 title.{lang}/brief.{lang}/... 形式;
121 129 # shared_fields 为无语言后缀字段。
... ...
config/dictionaries/style_intent_color.csv 0 → 100644
... ... @@ -0,0 +1,15 @@
  1 +black,black,blk,黑,黑色
  2 +white,white,wht,白,白色
  3 +red,red,reddish,红,红色
  4 +blue,blue,blu,蓝,蓝色
  5 +green,green,grn,绿,绿色
  6 +yellow,yellow,ylw,黄,黄色
  7 +pink,pink,粉,粉色
  8 +purple,purple,violet,紫,紫色
  9 +gray,gray,grey,灰,灰色
  10 +brown,brown,棕,棕色,咖啡色
  11 +beige,beige,khaki,米色,卡其色
  12 +navy,navy,navy blue,藏青,藏蓝,深蓝
  13 +silver,silver,银,银色
  14 +gold,gold,金,金色
  15 +orange,orange,橙,橙色
... ...
config/dictionaries/style_intent_size.csv 0 → 100644
... ... @@ -0,0 +1,8 @@
  1 +xs,xs,extra small,x-small,加小码
  2 +s,s,small,小码,小号
  3 +m,m,medium,中码,中号
  4 +l,l,large,大码,大号
  5 +xl,xl,x-large,extra large,加大码
  6 +xxl,xxl,2xl,xx-large,双加大码
  7 +xxxl,xxxl,3xl,xxx-large,三加大码
  8 +one size,one size,onesize,free size,均码
... ...
config/loader.py
... ... @@ -95,6 +95,29 @@ def _read_rewrite_dictionary(path: Path) -> Dict[str, str]:
95 95 return rewrite_dict
96 96  
97 97  
  98 +def _read_synonym_csv_dictionary(path: Path) -> List[List[str]]:
  99 + rows: List[List[str]] = []
  100 + if not path.exists():
  101 + return rows
  102 +
  103 + with open(path, "r", encoding="utf-8") as handle:
  104 + for raw_line in handle:
  105 + line = raw_line.strip()
  106 + if not line or line.startswith("#"):
  107 + continue
  108 + parts = [segment.strip() for segment in line.split(",")]
  109 + normalized = [segment for segment in parts if segment]
  110 + if normalized:
  111 + rows.append(normalized)
  112 + return rows
  113 +
  114 +
  115 +_DEFAULT_STYLE_INTENT_DIMENSION_ALIASES: Dict[str, List[str]] = {
  116 + "color": ["color", "colors", "colour", "colours", "颜色", "色", "色系"],
  117 + "size": ["size", "sizes", "sizing", "尺码", "尺寸", "码数", "号码", "码"],
  118 +}
  119 +
  120 +
98 121 class AppConfigLoader:
99 122 """Load the unified application configuration."""
100 123  
... ... @@ -253,6 +276,45 @@ class AppConfigLoader:
253 276 if isinstance(query_cfg.get("text_query_strategy"), dict)
254 277 else {}
255 278 )
  279 + style_intent_cfg = (
  280 + query_cfg.get("style_intent")
  281 + if isinstance(query_cfg.get("style_intent"), dict)
  282 + else {}
  283 + )
  284 +
  285 + def _resolve_project_path(value: Any, default_path: Path) -> Path:
  286 + if value in (None, ""):
  287 + return default_path
  288 + candidate = Path(str(value))
  289 + if candidate.is_absolute():
  290 + return candidate
  291 + return self.project_root / candidate
  292 +
  293 + style_color_path = _resolve_project_path(
  294 + style_intent_cfg.get("color_dictionary_path"),
  295 + self.config_dir / "dictionaries" / "style_intent_color.csv",
  296 + )
  297 + style_size_path = _resolve_project_path(
  298 + style_intent_cfg.get("size_dictionary_path"),
  299 + self.config_dir / "dictionaries" / "style_intent_size.csv",
  300 + )
  301 + configured_dimension_aliases = (
  302 + style_intent_cfg.get("dimension_aliases")
  303 + if isinstance(style_intent_cfg.get("dimension_aliases"), dict)
  304 + else {}
  305 + )
  306 + style_dimension_aliases: Dict[str, List[str]] = {}
  307 + for intent_type, default_aliases in _DEFAULT_STYLE_INTENT_DIMENSION_ALIASES.items():
  308 + aliases = configured_dimension_aliases.get(intent_type)
  309 + if isinstance(aliases, list) and aliases:
  310 + style_dimension_aliases[intent_type] = [str(alias) for alias in aliases if str(alias).strip()]
  311 + else:
  312 + style_dimension_aliases[intent_type] = list(default_aliases)
  313 +
  314 + style_intent_terms = {
  315 + "color": _read_synonym_csv_dictionary(style_color_path),
  316 + "size": _read_synonym_csv_dictionary(style_size_path),
  317 + }
256 318 query_config = QueryConfig(
257 319 supported_languages=list(query_cfg.get("supported_languages") or ["zh", "en"]),
258 320 default_language=str(query_cfg.get("default_language") or "en"),
... ... @@ -324,6 +386,9 @@ class AppConfigLoader:
324 386 translation_embedding_wait_budget_ms_source_not_in_index=int(
325 387 query_cfg.get("translation_embedding_wait_budget_ms_source_not_in_index", 200)
326 388 ),
  389 + style_intent_enabled=bool(style_intent_cfg.get("enabled", True)),
  390 + style_intent_terms=style_intent_terms,
  391 + style_intent_dimension_aliases=style_dimension_aliases,
327 392 )
328 393  
329 394 function_score_cfg = raw.get("function_score") if isinstance(raw.get("function_score"), dict) else {}
... ...
config/schema.py
... ... @@ -64,6 +64,9 @@ class QueryConfig:
64 64 # 检测语言不在 index_languages 内:翻译对召回更关键,预算较长。
65 65 translation_embedding_wait_budget_ms_source_in_index: int = 80
66 66 translation_embedding_wait_budget_ms_source_not_in_index: int = 200
  67 + style_intent_enabled: bool = True
  68 + style_intent_terms: Dict[str, List[List[str]]] = field(default_factory=dict)
  69 + style_intent_dimension_aliases: Dict[str, List[str]] = field(default_factory=dict)
67 70  
68 71  
69 72 @dataclass(frozen=True)
... ...
docs/TODO-意图判断.md
... ... @@ -39,3 +39,15 @@ intent 考虑由 QueryParser 编排、具体实现拆成独立模块,主义好
39 39  
40 40 5. TODO: 搜索接口里,results[].skus 不是全量子 SKU:由 sku_filter_dimension 控制在应用层按维度分组折叠,每个「维度取值组合」只保留一条 SKU(组内第一条)。请求未传该字段时,Pydantic 默认是 ["option1"],等价于只按 option1_value 去重;服务端不会读取店铺主题的「主展示维」,需调用方与装修配置对齐并传入正确维度。因此当用户有款式等更细粒度意图、而款式落在 option2/option3(或对应 option*_name)时,若仍用默认只按 option1(常见为颜色)折叠,同一颜色下多种款式只会出现一条代表 SKU,无法从返回的 skus 里拿到该颜色下的全部款式行。(若业务需要全量子款,需传包含对应维度的 sku_filter_dimension,或传 null/[] 跳过折叠——以当前 ResultFormatter 实现为准。)
41 41  
  42 +
  43 +
  44 +
  45 +
  46 +GPT5.4 主要思路:
  47 +我准备按这条线改,尽量把逻辑收束成两个新模块而不是继续堆在 Searcher 里:
  48 +
  49 +QueryParser 负责产出统一的“款式意图分析结果”,包含:命中的意图类型、标准化值、命中的 query 变体、分词结果、维度候选词。
  50 +独立的 SKU 选择模块负责:维度解析、候选值抽取、三阶段匹配、选中 SKU 后的主图替换,以及生成 rerank 用的属性后缀。
  51 +Searcher 只编排:解析 query -> 首次 ES 召回 -> 若有款式意图则对 rerank 窗口内全部 hits 做 SKU 预筛选并补充 rerank doc 文本 -> run_rerank -> 分页 -> page fill -> ResultFormatter。这样旧的页内 SKU 置顶逻辑可以下沉或删除,避免前后两个阶段重复做同一件事。我再补看一下你最近写的意图草稿文档,确认有没有现成词表示例或边界条件,避免我这边把词表格式和你预期做偏。
  52 +
  53 +
... ...
docs/数据统计/options名称和取值统计.md 0 → 100644
... ... @@ -0,0 +1,118 @@
  1 +
  2 +## prompt
  3 +参考 @docs/常用查询 - ES.md 帮我查询 tenant_Id=163的@search_products.json (1314-1332) 这些字段的统计分布,也就是这里每个字段的所有的值及其对应的文档数
  4 +
  5 +
  6 +已按 `docs/常用查询 - ES.md` 的写法,在索引 **`search_products_tenant_163`** 上做了 `terms` 聚合(租户已按索引拆分,用 `match_all` 即可,等价于只查 tenant 163)。集群在本机 `localhost:9200` 上执行成功。
  7 +
  8 +## 结果概要
  9 +
  10 +- **总文档数**:8977
  11 +- 六个字段的 **`sum_other_doc_count` 均为 0**,在 `size: 20000` 下没有截断,即:**当前数据里每个字段的取值都已全部列出**。
  12 +- **`__missing__`**:来自聚合里的 `"missing": "__missing__"`,表示该字段缺失或为空的文档数(便于和「有值」一起看)。
  13 +- **`option1_values` / `option2_values`** 多为多值数组:同一篇文档可落入多个桶,因此 **各桶 `doc_count` 之和会大于 8977**(这是 ES `terms` 的正常语义,表示「含该取值的文档数」,不是互斥分区)。
  14 +- **`option1_name` / `option2_name` / `option3_name`** 及 **`option3_values`** 的桶计数之和与文档数的关系更接近单值/稀疏(见下方校验)。
  15 +
  16 +### `option1_name`(13 个取值)
  17 +
  18 +| 值 | 文档数 |
  19 +|---:|---:|
  20 +| __missing__ | 4538 |
  21 +| Size | 2916 |
  22 +| Color | 1174 |
  23 +| 颜色 | 244 |
  24 +| COLOR | 56 |
  25 +| color | 16 |
  26 +| Colour | 15 |
  27 +| Variant | 9 |
  28 +| Style | 3 |
  29 +| Colors | 2 |
  30 +| Scent | 2 |
  31 +| Cup Size | 1 |
  32 +| Pattern Name | 1 |
  33 +
  34 +### `option2_name`(13 个取值)
  35 +
  36 +| 值 | 文档数 |
  37 +|---:|---:|
  38 +| __missing__ | 4666 |
  39 +| Color | 2879 |
  40 +| Size | 1134 |
  41 +| 尺码 | 244 |
  42 +| SIZE | 17 |
  43 +| size | 16 |
  44 +| Style | 12 |
  45 +| Cup Size | 4 |
  46 +| Item Package Quantity | 1 |
  47 +| Number of Items | 1 |
  48 +| Ring Size | 1 |
  49 +| Scent | 1 |
  50 +| Team Name | 1 |
  51 +
  52 +### `option3_name`(11 个取值)
  53 +
  54 +| 值 | 文档数 |
  55 +|---:|---:|
  56 +| __missing__ | 8831 |
  57 +| Fit Type | 52 |
  58 +| Special Size | 31 |
  59 +| Number of Items | 17 |
  60 +| Special Size Type | 16 |
  61 +| Team Name | 15 |
  62 +| Material Type | 8 |
  63 +| Style | 4 |
  64 +| Configuration | 1 |
  65 +| Hand Orientation | 1 |
  66 +| Pattern Name | 1 |
  67 +
  68 +### `option1_values` / `option2_values`
  69 +
  70 +- **桶数量**:`option1_values` **3463**,`option2_values` **3605**(完整列表过长,不宜贴在对话里)。
  71 +- **文档中出现最多的取值(前 10)**
  72 + - **option1_values**:`__missing__` 4538,其后如 Medium 749、Black 653、Large 635、8/9/White/10/7/Small…
  73 + - **option2_values**:`__missing__` 4666,其后如 L 1161、M 1158、S 1064、XL 1036、Black 876…
  74 +
  75 +### `option3_values`(94 个桶,已全部枚举)
  76 +
  77 +除 `__missing__` 8831 外,另有 93 个具体取值;上文 Markdown 表里已列全(从 Standard 到 `standard` 等)。
  78 +
  79 +---
  80 +
  81 +## 可复用的查询(与文档一致)
  82 +
  83 +```bash
  84 +curl -sS -u "${ES_USER}:${ES_PASS}" -X GET 'http://localhost:9200/search_products_tenant_163/_search?pretty' \
  85 + -H 'Content-Type: application/json' -d '{
  86 + "query": { "match_all": {} },
  87 + "size": 0,
  88 + "track_total_hits": true,
  89 + "aggs": {
  90 + "option1_name_dist": {
  91 + "terms": { "field": "option1_name", "size": 20000, "order": { "_count": "desc" }, "missing": "__missing__" }
  92 + },
  93 + "option2_name_dist": {
  94 + "terms": { "field": "option2_name", "size": 20000, "order": { "_count": "desc" }, "missing": "__missing__" }
  95 + },
  96 + "option3_name_dist": {
  97 + "terms": { "field": "option3_name", "size": 20000, "order": { "_count": "desc" }, "missing": "__missing__" }
  98 + },
  99 + "option1_values_dist": {
  100 + "terms": { "field": "option1_values", "size": 20000, "order": { "_count": "desc" }, "missing": "__missing__" }
  101 + },
  102 + "option2_values_dist": {
  103 + "terms": { "field": "option2_values", "size": 20000, "order": { "_count": "desc" }, "missing": "__missing__" }
  104 + },
  105 + "option3_values_dist": {
  106 + "terms": { "field": "option3_values", "size": 20000, "order": { "_count": "desc" }, "missing": "__missing__" }
  107 + }
  108 + }
  109 +}'
  110 +```
  111 +
  112 +若某租户取值超过 2 万,需把 `size` 再加大或用 **composite** 分页扫完全部桶。
  113 +
  114 +## 完整 JSON(含 3463 + 3605 条 value 分布)
  115 +
  116 +已整理为:`/tmp/tenant163_option_fields_distribution.json`(约 620KB),结构为按字段分组的 `values: [{ "value", "doc_count" }, ...]`,便于你用脚本或表格工具打开。
  117 +
  118 +如需去掉 `__missing__` 桶,从请求里删掉各聚合中的 `"missing": "__missing__"` 即可。
0 119 \ No newline at end of file
... ...
query/query_parser.py
... ... @@ -12,7 +12,6 @@ from dataclasses import dataclass, field
12 12 from typing import Any, Callable, Dict, List, Optional, Tuple
13 13 import numpy as np
14 14 import logging
15   -import re
16 15 from concurrent.futures import ThreadPoolExecutor, wait
17 16  
18 17 from embeddings.text_encoder import TextEmbeddingEncoder
... ... @@ -20,25 +19,14 @@ from config import SearchConfig
20 19 from translation import create_translation_client
21 20 from .language_detector import LanguageDetector
22 21 from .query_rewriter import QueryRewriter, QueryNormalizer
  22 +from .style_intent import StyleIntentDetector, StyleIntentProfile, StyleIntentRegistry
  23 +from .tokenization import extract_token_strings, simple_tokenize_query
23 24  
24 25 logger = logging.getLogger(__name__)
25 26  
26 27 import hanlp # type: ignore
27 28  
28 29  
29   -def simple_tokenize_query(text: str) -> List[str]:
30   - """
31   - Lightweight tokenizer for suggestion-side heuristics only.
32   -
33   - - Consecutive CJK characters form one token
34   - - Latin / digit runs (with internal hyphens) form tokens
35   - """
36   - if not text:
37   - return []
38   - pattern = re.compile(r"[\u4e00-\u9fff]+|[A-Za-z0-9_]+(?:-[A-Za-z0-9_]+)*")
39   - return pattern.findall(text)
40   -
41   -
42 30 @dataclass(slots=True)
43 31 class ParsedQuery:
44 32 """Container for query parser facts."""
... ... @@ -50,6 +38,7 @@ class ParsedQuery:
50 38 translations: Dict[str, str] = field(default_factory=dict)
51 39 query_vector: Optional[np.ndarray] = None
52 40 query_tokens: List[str] = field(default_factory=list)
  41 + style_intent_profile: Optional[StyleIntentProfile] = None
53 42  
54 43 def to_dict(self) -> Dict[str, Any]:
55 44 """Convert to dictionary representation."""
... ... @@ -60,6 +49,9 @@ class ParsedQuery:
60 49 "detected_language": self.detected_language,
61 50 "translations": self.translations,
62 51 "query_tokens": self.query_tokens,
  52 + "style_intent_profile": (
  53 + self.style_intent_profile.to_dict() if self.style_intent_profile is not None else None
  54 + ),
63 55 }
64 56  
65 57  
... ... @@ -97,6 +89,11 @@ class QueryParser:
97 89 self.language_detector = LanguageDetector()
98 90 self.rewriter = QueryRewriter(config.query_config.rewrite_dictionary)
99 91 self._tokenizer = tokenizer or self._build_tokenizer()
  92 + self.style_intent_registry = StyleIntentRegistry.from_query_config(config.query_config)
  93 + self.style_intent_detector = StyleIntentDetector(
  94 + self.style_intent_registry,
  95 + tokenizer=self._tokenizer,
  96 + )
100 97  
101 98 # Eager initialization (startup-time failure visibility, no lazy init in request path)
102 99 if self.config.query_config.enable_text_embedding and self._text_encoder is None:
... ... @@ -172,28 +169,7 @@ class QueryParser:
172 169 @staticmethod
173 170 def _extract_tokens(tokenizer_result: Any) -> List[str]:
174 171 """Normalize tokenizer output into a flat token string list."""
175   - if not tokenizer_result:
176   - return []
177   - if isinstance(tokenizer_result, str):
178   - token = tokenizer_result.strip()
179   - return [token] if token else []
180   -
181   - tokens: List[str] = []
182   - for item in tokenizer_result:
183   - token: Optional[str] = None
184   - if isinstance(item, str):
185   - token = item
186   - elif isinstance(item, (list, tuple)) and item:
187   - token = str(item[0])
188   - elif item is not None:
189   - token = str(item)
190   -
191   - if token is None:
192   - continue
193   - token = token.strip()
194   - if token:
195   - tokens.append(token)
196   - return tokens
  172 + return extract_token_strings(tokenizer_result)
197 173  
198 174 def _get_query_tokens(self, query: str) -> List[str]:
199 175 return self._extract_tokens(self._tokenizer(query))
... ... @@ -425,6 +401,22 @@ class QueryParser:
425 401 context.store_intermediate_result("translations", translations)
426 402  
427 403 # Build result
  404 + base_result = ParsedQuery(
  405 + original_query=query,
  406 + query_normalized=normalized,
  407 + rewritten_query=query_text,
  408 + detected_language=detected_lang,
  409 + translations=translations,
  410 + query_vector=query_vector,
  411 + query_tokens=query_tokens,
  412 + )
  413 + style_intent_profile = self.style_intent_detector.detect(base_result)
  414 + if context:
  415 + context.store_intermediate_result(
  416 + "style_intent_profile",
  417 + style_intent_profile.to_dict(),
  418 + )
  419 +
428 420 result = ParsedQuery(
429 421 original_query=query,
430 422 query_normalized=normalized,
... ... @@ -433,6 +425,7 @@ class QueryParser:
433 425 translations=translations,
434 426 query_vector=query_vector,
435 427 query_tokens=query_tokens,
  428 + style_intent_profile=style_intent_profile,
436 429 )
437 430  
438 431 if context and hasattr(context, 'logger'):
... ...
query/style_intent.py 0 → 100644
... ... @@ -0,0 +1,261 @@
  1 +"""
  2 +Style intent detection for query understanding.
  3 +"""
  4 +
  5 +from __future__ import annotations
  6 +
  7 +from dataclasses import dataclass, field
  8 +from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Set, Tuple
  9 +
  10 +from .tokenization import TokenizedText, normalize_query_text, tokenize_text
  11 +
  12 +
  13 +@dataclass(frozen=True)
  14 +class StyleIntentDefinition:
  15 + intent_type: str
  16 + term_groups: Tuple[Tuple[str, ...], ...]
  17 + dimension_aliases: Tuple[str, ...]
  18 + synonym_to_canonical: Dict[str, str]
  19 + max_term_ngram: int = 3
  20 +
  21 + @classmethod
  22 + def from_rows(
  23 + cls,
  24 + intent_type: str,
  25 + rows: Sequence[Sequence[str]],
  26 + dimension_aliases: Sequence[str],
  27 + ) -> "StyleIntentDefinition":
  28 + term_groups: List[Tuple[str, ...]] = []
  29 + synonym_to_canonical: Dict[str, str] = {}
  30 + max_ngram = 1
  31 +
  32 + for row in rows:
  33 + normalized_terms: List[str] = []
  34 + for raw_term in row:
  35 + term = normalize_query_text(raw_term)
  36 + if not term or term in normalized_terms:
  37 + continue
  38 + normalized_terms.append(term)
  39 + if not normalized_terms:
  40 + continue
  41 +
  42 + canonical = normalized_terms[0]
  43 + term_groups.append(tuple(normalized_terms))
  44 + for term in normalized_terms:
  45 + synonym_to_canonical[term] = canonical
  46 + max_ngram = max(max_ngram, len(term.split()))
  47 +
  48 + aliases = tuple(
  49 + dict.fromkeys(
  50 + term
  51 + for term in (
  52 + normalize_query_text(alias)
  53 + for alias in dimension_aliases
  54 + )
  55 + if term
  56 + )
  57 + )
  58 +
  59 + return cls(
  60 + intent_type=intent_type,
  61 + term_groups=tuple(term_groups),
  62 + dimension_aliases=aliases,
  63 + synonym_to_canonical=synonym_to_canonical,
  64 + max_term_ngram=max_ngram,
  65 + )
  66 +
  67 + def match_candidates(self, candidates: Iterable[str]) -> Set[str]:
  68 + matched: Set[str] = set()
  69 + for candidate in candidates:
  70 + canonical = self.synonym_to_canonical.get(normalize_query_text(candidate))
  71 + if canonical:
  72 + matched.add(canonical)
  73 + return matched
  74 +
  75 + def match_text(
  76 + self,
  77 + text: str,
  78 + *,
  79 + tokenizer: Optional[Callable[[str], Any]] = None,
  80 + ) -> Set[str]:
  81 + bundle = tokenize_text(text, tokenizer=tokenizer, max_ngram=self.max_term_ngram)
  82 + return self.match_candidates(bundle.candidates)
  83 +
  84 +
  85 +@dataclass(frozen=True)
  86 +class DetectedStyleIntent:
  87 + intent_type: str
  88 + canonical_value: str
  89 + matched_term: str
  90 + matched_query_text: str
  91 + dimension_aliases: Tuple[str, ...]
  92 +
  93 + def to_dict(self) -> Dict[str, Any]:
  94 + return {
  95 + "intent_type": self.intent_type,
  96 + "canonical_value": self.canonical_value,
  97 + "matched_term": self.matched_term,
  98 + "matched_query_text": self.matched_query_text,
  99 + "dimension_aliases": list(self.dimension_aliases),
  100 + }
  101 +
  102 +
  103 +@dataclass(frozen=True)
  104 +class StyleIntentProfile:
  105 + query_variants: Tuple[TokenizedText, ...] = field(default_factory=tuple)
  106 + intents: Tuple[DetectedStyleIntent, ...] = field(default_factory=tuple)
  107 +
  108 + @property
  109 + def is_active(self) -> bool:
  110 + return bool(self.intents)
  111 +
  112 + def get_intents(self, intent_type: Optional[str] = None) -> List[DetectedStyleIntent]:
  113 + if intent_type is None:
  114 + return list(self.intents)
  115 + normalized = normalize_query_text(intent_type)
  116 + return [intent for intent in self.intents if intent.intent_type == normalized]
  117 +
  118 + def get_canonical_values(self, intent_type: str) -> Set[str]:
  119 + return {intent.canonical_value for intent in self.get_intents(intent_type)}
  120 +
  121 + def to_dict(self) -> Dict[str, Any]:
  122 + return {
  123 + "active": self.is_active,
  124 + "intents": [intent.to_dict() for intent in self.intents],
  125 + "query_variants": [
  126 + {
  127 + "text": variant.text,
  128 + "normalized_text": variant.normalized_text,
  129 + "fine_tokens": list(variant.fine_tokens),
  130 + "coarse_tokens": list(variant.coarse_tokens),
  131 + "candidates": list(variant.candidates),
  132 + }
  133 + for variant in self.query_variants
  134 + ],
  135 + }
  136 +
  137 +
  138 +class StyleIntentRegistry:
  139 + """Holds style intent vocabularies and matching helpers."""
  140 +
  141 + def __init__(
  142 + self,
  143 + definitions: Dict[str, StyleIntentDefinition],
  144 + *,
  145 + enabled: bool = True,
  146 + ) -> None:
  147 + self.definitions = definitions
  148 + self.enabled = bool(enabled)
  149 +
  150 + @classmethod
  151 + def from_query_config(cls, query_config: Any) -> "StyleIntentRegistry":
  152 + style_terms = getattr(query_config, "style_intent_terms", {}) or {}
  153 + dimension_aliases = getattr(query_config, "style_intent_dimension_aliases", {}) or {}
  154 + definitions: Dict[str, StyleIntentDefinition] = {}
  155 +
  156 + for intent_type, rows in style_terms.items():
  157 + definition = StyleIntentDefinition.from_rows(
  158 + intent_type=normalize_query_text(intent_type),
  159 + rows=rows or [],
  160 + dimension_aliases=dimension_aliases.get(intent_type, []),
  161 + )
  162 + if definition.synonym_to_canonical:
  163 + definitions[definition.intent_type] = definition
  164 +
  165 + return cls(
  166 + definitions,
  167 + enabled=bool(getattr(query_config, "style_intent_enabled", True)),
  168 + )
  169 +
  170 + def get_definition(self, intent_type: str) -> Optional[StyleIntentDefinition]:
  171 + return self.definitions.get(normalize_query_text(intent_type))
  172 +
  173 + def get_dimension_aliases(self, intent_type: str) -> Tuple[str, ...]:
  174 + definition = self.get_definition(intent_type)
  175 + return definition.dimension_aliases if definition else tuple()
  176 +
  177 +
  178 +class StyleIntentDetector:
  179 + """Detects style intents from parsed query variants."""
  180 +
  181 + def __init__(
  182 + self,
  183 + registry: StyleIntentRegistry,
  184 + *,
  185 + tokenizer: Optional[Callable[[str], Any]] = None,
  186 + ) -> None:
  187 + self.registry = registry
  188 + self.tokenizer = tokenizer
  189 +
  190 + def _build_query_variants(self, parsed_query: Any) -> Tuple[TokenizedText, ...]:
  191 + seen = set()
  192 + variants: List[TokenizedText] = []
  193 + texts = [
  194 + getattr(parsed_query, "original_query", None),
  195 + getattr(parsed_query, "query_normalized", None),
  196 + getattr(parsed_query, "rewritten_query", None),
  197 + ]
  198 +
  199 + translations = getattr(parsed_query, "translations", {}) or {}
  200 + if isinstance(translations, dict):
  201 + texts.extend(translations.values())
  202 +
  203 + for raw_text in texts:
  204 + text = str(raw_text or "").strip()
  205 + if not text:
  206 + continue
  207 + normalized = normalize_query_text(text)
  208 + if not normalized or normalized in seen:
  209 + continue
  210 + seen.add(normalized)
  211 + variants.append(
  212 + tokenize_text(
  213 + text,
  214 + tokenizer=self.tokenizer,
  215 + max_ngram=max(
  216 + (definition.max_term_ngram for definition in self.registry.definitions.values()),
  217 + default=3,
  218 + ),
  219 + )
  220 + )
  221 +
  222 + return tuple(variants)
  223 +
  224 + def detect(self, parsed_query: Any) -> StyleIntentProfile:
  225 + if not self.registry.enabled or not self.registry.definitions:
  226 + return StyleIntentProfile()
  227 +
  228 + query_variants = self._build_query_variants(parsed_query)
  229 + detected: List[DetectedStyleIntent] = []
  230 + seen_pairs = set()
  231 +
  232 + for variant in query_variants:
  233 + for intent_type, definition in self.registry.definitions.items():
  234 + matched_canonicals = definition.match_candidates(variant.candidates)
  235 + if not matched_canonicals:
  236 + continue
  237 +
  238 + for candidate in variant.candidates:
  239 + normalized_candidate = normalize_query_text(candidate)
  240 + canonical = definition.synonym_to_canonical.get(normalized_candidate)
  241 + if not canonical or canonical not in matched_canonicals:
  242 + continue
  243 + pair = (intent_type, canonical)
  244 + if pair in seen_pairs:
  245 + continue
  246 + seen_pairs.add(pair)
  247 + detected.append(
  248 + DetectedStyleIntent(
  249 + intent_type=intent_type,
  250 + canonical_value=canonical,
  251 + matched_term=normalized_candidate,
  252 + matched_query_text=variant.text,
  253 + dimension_aliases=definition.dimension_aliases,
  254 + )
  255 + )
  256 + break
  257 +
  258 + return StyleIntentProfile(
  259 + query_variants=query_variants,
  260 + intents=tuple(detected),
  261 + )
... ...
query/tokenization.py 0 → 100644
... ... @@ -0,0 +1,122 @@
  1 +"""
  2 +Shared tokenization helpers for query understanding.
  3 +"""
  4 +
  5 +from __future__ import annotations
  6 +
  7 +from dataclasses import dataclass
  8 +import re
  9 +from typing import Any, Callable, Iterable, List, Optional, Sequence, Tuple
  10 +
  11 +
  12 +_TOKEN_PATTERN = re.compile(r"[\u4e00-\u9fff]+|[A-Za-z0-9_]+(?:-[A-Za-z0-9_]+)*")
  13 +
  14 +
  15 +def normalize_query_text(text: Optional[str]) -> str:
  16 + if text is None:
  17 + return ""
  18 + return " ".join(str(text).strip().casefold().split())
  19 +
  20 +
  21 +def simple_tokenize_query(text: str) -> List[str]:
  22 + """
  23 + Lightweight tokenizer for coarse query matching.
  24 +
  25 + - Consecutive CJK characters form one token
  26 + - Latin / digit runs (with internal hyphens) form tokens
  27 + """
  28 + if not text:
  29 + return []
  30 + return _TOKEN_PATTERN.findall(text)
  31 +
  32 +
  33 +def extract_token_strings(tokenizer_result: Any) -> List[str]:
  34 + """Normalize tokenizer output into a flat token string list."""
  35 + if not tokenizer_result:
  36 + return []
  37 + if isinstance(tokenizer_result, str):
  38 + token = tokenizer_result.strip()
  39 + return [token] if token else []
  40 +
  41 + tokens: List[str] = []
  42 + for item in tokenizer_result:
  43 + token: Optional[str] = None
  44 + if isinstance(item, str):
  45 + token = item
  46 + elif isinstance(item, (list, tuple)) and item:
  47 + token = str(item[0])
  48 + elif item is not None:
  49 + token = str(item)
  50 +
  51 + if token is None:
  52 + continue
  53 + token = token.strip()
  54 + if token:
  55 + tokens.append(token)
  56 + return tokens
  57 +
  58 +
  59 +def _dedupe_preserve_order(values: Iterable[str]) -> List[str]:
  60 + result: List[str] = []
  61 + seen = set()
  62 + for value in values:
  63 + normalized = normalize_query_text(value)
  64 + if not normalized or normalized in seen:
  65 + continue
  66 + seen.add(normalized)
  67 + result.append(normalized)
  68 + return result
  69 +
  70 +
  71 +def _build_phrase_candidates(tokens: Sequence[str], max_ngram: int) -> List[str]:
  72 + if not tokens:
  73 + return []
  74 +
  75 + phrases: List[str] = []
  76 + upper = max(1, int(max_ngram))
  77 + for size in range(1, upper + 1):
  78 + if size > len(tokens):
  79 + break
  80 + for start in range(0, len(tokens) - size + 1):
  81 + phrase = " ".join(tokens[start:start + size]).strip()
  82 + if phrase:
  83 + phrases.append(phrase)
  84 + return phrases
  85 +
  86 +
  87 +@dataclass(frozen=True)
  88 +class TokenizedText:
  89 + text: str
  90 + normalized_text: str
  91 + fine_tokens: Tuple[str, ...]
  92 + coarse_tokens: Tuple[str, ...]
  93 + candidates: Tuple[str, ...]
  94 +
  95 +
  96 +def tokenize_text(
  97 + text: str,
  98 + *,
  99 + tokenizer: Optional[Callable[[str], Any]] = None,
  100 + max_ngram: int = 3,
  101 +) -> TokenizedText:
  102 + normalized_text = normalize_query_text(text)
  103 + coarse_tokens = _dedupe_preserve_order(simple_tokenize_query(text))
  104 +
  105 + fine_raw = extract_token_strings(tokenizer(text)) if tokenizer is not None and text else []
  106 + fine_tokens = _dedupe_preserve_order(fine_raw)
  107 +
  108 + candidates = _dedupe_preserve_order(
  109 + list(fine_tokens)
  110 + + list(coarse_tokens)
  111 + + _build_phrase_candidates(fine_tokens, max_ngram=max_ngram)
  112 + + _build_phrase_candidates(coarse_tokens, max_ngram=max_ngram)
  113 + + ([normalized_text] if normalized_text else [])
  114 + )
  115 +
  116 + return TokenizedText(
  117 + text=text,
  118 + normalized_text=normalized_text,
  119 + fine_tokens=tuple(fine_tokens),
  120 + coarse_tokens=tuple(coarse_tokens),
  121 + candidates=tuple(candidates),
  122 + )
... ...
search/rerank_client.py
... ... @@ -62,11 +62,19 @@ def build_docs_from_hits(
62 62 need_category_path = "{category_path}" in doc_template
63 63 for hit in es_hits:
64 64 src = hit.get("_source") or {}
  65 + title_suffix = str(hit.get("_style_rerank_suffix") or "").strip()
65 66 if only_title:
66   - docs.append(pick_lang_text(src.get("title")))
  67 + title = pick_lang_text(src.get("title"))
  68 + if title_suffix:
  69 + title = f"{title} {title_suffix}".strip()
  70 + docs.append(title)
67 71 else:
68 72 values = _SafeDict(
69   - title=pick_lang_text(src.get("title")),
  73 + title=(
  74 + f"{pick_lang_text(src.get('title'))} {title_suffix}".strip()
  75 + if title_suffix
  76 + else pick_lang_text(src.get("title"))
  77 + ),
70 78 brief=pick_lang_text(src.get("brief")) if need_brief else "",
71 79 vendor=pick_lang_text(src.get("vendor")) if need_vendor else "",
72 80 description=pick_lang_text(src.get("description")) if need_description else "",
... ...
search/searcher.py
... ... @@ -10,12 +10,13 @@ import time, json
10 10 import logging
11 11 import hashlib
12 12 from string import Formatter
13   -import numpy as np
14 13  
15 14 from utils.es_client import ESClient
16 15 from query import QueryParser, ParsedQuery
  16 +from query.style_intent import StyleIntentRegistry
17 17 from embeddings.image_encoder import CLIPImageEncoder
18 18 from .es_query_builder import ESQueryBuilder
  19 +from .sku_intent_selector import SkuSelectionDecision, StyleSkuSelector
19 20 from config import SearchConfig
20 21 from config.tenant_config_loader import get_tenant_config_loader
21 22 from context.request_context import RequestContext, RequestContextStage
... ... @@ -115,6 +116,12 @@ class Searcher:
115 116 else:
116 117 self.image_encoder = image_encoder
117 118 self.source_fields = config.query_config.source_fields
  119 + self.style_intent_registry = StyleIntentRegistry.from_query_config(self.config.query_config)
  120 + self.style_sku_selector = StyleSkuSelector(
  121 + self.style_intent_registry,
  122 + text_encoder_getter=lambda: getattr(self.query_parser, "text_encoder", None),
  123 + tokenizer_getter=lambda: getattr(self.query_parser, "_tokenizer", None),
  124 + )
118 125  
119 126 # Query builder - simplified single-layer architecture
120 127 self.query_builder = ESQueryBuilder(
... ... @@ -155,7 +162,11 @@ class Searcher:
155 162 return
156 163 es_query["_source"] = {"includes": self.source_fields}
157 164  
158   - def _resolve_rerank_source_filter(self, doc_template: str) -> Dict[str, Any]:
  165 + def _resolve_rerank_source_filter(
  166 + self,
  167 + doc_template: str,
  168 + parsed_query: Optional[ParsedQuery] = None,
  169 + ) -> Dict[str, Any]:
159 170 """
160 171 Build a lightweight _source filter for rerank prefetch.
161 172  
... ... @@ -182,6 +193,16 @@ class Searcher:
182 193 if not includes:
183 194 includes.add("title")
184 195  
  196 + if self._has_style_intent(parsed_query):
  197 + includes.update(
  198 + {
  199 + "skus",
  200 + "option1_name",
  201 + "option2_name",
  202 + "option3_name",
  203 + }
  204 + )
  205 +
185 206 return {"includes": sorted(includes)}
186 207  
187 208 def _fetch_hits_by_ids(
... ... @@ -225,256 +246,23 @@ class Searcher:
225 246 return hits_by_id, int(resp.get("took", 0) or 0)
226 247  
    @staticmethod
    def _has_style_intent(parsed_query: Optional[ParsedQuery]) -> bool:
        """Return True when the parsed query carries an active style-intent profile.

        Tolerates None / attribute-less inputs via getattr defaults, so it is
        safe to call before parsing succeeded.
        """
        profile = getattr(parsed_query, "style_intent_profile", None)
        return bool(getattr(profile, "is_active", False))
378 252  
379   - image_src = matched_sku.get("image_src") or matched_sku.get("imageSrc")
380   - if image_src:
381   - source["image_url"] = image_src
382   - return matched_sku
383   -
    def _apply_style_intent_to_hits(
        self,
        es_hits: List[Dict[str, Any]],
        parsed_query: ParsedQuery,
        context: Optional[RequestContext] = None,
    ) -> Dict[str, SkuSelectionDecision]:
        """Run style-intent SKU selection over raw ES hits.

        Delegates to StyleSkuSelector.prepare_hits (which mutates hit sources
        in place) and, when any decision was made, stores a serialized copy in
        the request context under "style_intent_sku_decisions" for debugging.

        Returns the per-doc-id decision map (possibly empty).
        """
        decisions = self.style_sku_selector.prepare_hits(es_hits, parsed_query)
        if decisions and context is not None:
            context.store_intermediate_result(
                "style_intent_sku_decisions",
                {doc_id: decision.to_dict() for doc_id, decision in decisions.items()},
            )
        return decisions
478 266  
479 267 def search(
480 268 self,
... ... @@ -583,7 +371,8 @@ class Searcher:
583 371 context.metadata['feature_flags'] = {
584 372 'translation_enabled': enable_translation,
585 373 'embedding_enabled': enable_embedding,
586   - 'rerank_enabled': do_rerank
  374 + 'rerank_enabled': do_rerank,
  375 + 'style_intent_enabled': bool(self.style_intent_registry.enabled),
587 376 }
588 377  
589 378 # Step 1: Parse query
... ... @@ -607,6 +396,7 @@ class Searcher:
607 396 domain="default",
608 397 is_simple_query=True
609 398 )
  399 + context.metadata["feature_flags"]["style_intent_active"] = self._has_style_intent(parsed_query)
610 400  
611 401 context.logger.info(
612 402 f"查询解析完成 | 原查询: '{parsed_query.original_query}' | "
... ... @@ -667,7 +457,10 @@ class Searcher:
667 457 es_query_for_fetch = es_query
668 458 rerank_prefetch_source = None
669 459 if in_rerank_window:
670   - rerank_prefetch_source = self._resolve_rerank_source_filter(effective_doc_template)
  460 + rerank_prefetch_source = self._resolve_rerank_source_filter(
  461 + effective_doc_template,
  462 + parsed_query=parsed_query,
  463 + )
671 464 es_query_for_fetch = dict(es_query)
672 465 es_query_for_fetch["_source"] = rerank_prefetch_source
673 466  
... ... @@ -751,6 +544,20 @@ class Searcher:
751 544 finally:
752 545 context.end_stage(RequestContextStage.ELASTICSEARCH_SEARCH_PRIMARY)
753 546  
  547 + style_intent_decisions: Dict[str, SkuSelectionDecision] = {}
  548 + if self._has_style_intent(parsed_query) and in_rerank_window:
  549 + style_intent_decisions = self._apply_style_intent_to_hits(
  550 + es_response.get("hits", {}).get("hits") or [],
  551 + parsed_query,
  552 + context=context,
  553 + )
  554 + if style_intent_decisions:
  555 + context.logger.info(
  556 + "款式意图 SKU 预筛选完成 | hits=%s",
  557 + len(style_intent_decisions),
  558 + extra={'reqid': context.reqid, 'uid': context.uid}
  559 + )
  560 +
754 561 # Optional Step 4.5: AI reranking(仅当请求范围在重排窗口内时执行)
755 562 if do_rerank and in_rerank_window:
756 563 context.start_stage(RequestContextStage.RERANKING)
... ... @@ -841,6 +648,11 @@ class Searcher:
841 648 if "_source" in detail_hit:
842 649 hit["_source"] = detail_hit.get("_source") or {}
843 650 filled += 1
  651 + if style_intent_decisions:
  652 + self.style_sku_selector.apply_precomputed_decisions(
  653 + sliced,
  654 + style_intent_decisions,
  655 + )
844 656 if fill_took:
845 657 es_response["took"] = int((es_response.get("took", 0) or 0) + fill_took)
846 658 context.logger.info(
... ... @@ -883,7 +695,18 @@ class Searcher:
883 695 continue
884 696 rerank_debug_by_doc[str(doc_id)] = item
885 697  
886   - self._apply_sku_sorting_for_page_hits(es_hits, parsed_query, context=context)
  698 + if self._has_style_intent(parsed_query):
  699 + if in_rerank_window and style_intent_decisions:
  700 + self.style_sku_selector.apply_precomputed_decisions(
  701 + es_hits,
  702 + style_intent_decisions,
  703 + )
  704 + elif not in_rerank_window:
  705 + style_intent_decisions = self._apply_style_intent_to_hits(
  706 + es_hits,
  707 + parsed_query,
  708 + context=context,
  709 + )
887 710  
888 711 # Format results using ResultFormatter
889 712 formatted_results = ResultFormatter.format_search_results(
... ... @@ -902,6 +725,11 @@ class Searcher:
902 725 rerank_debug = None
903 726 if doc_id is not None:
904 727 rerank_debug = rerank_debug_by_doc.get(str(doc_id))
  728 + style_intent_debug = None
  729 + if doc_id is not None and style_intent_decisions:
  730 + decision = style_intent_decisions.get(str(doc_id))
  731 + if decision is not None:
  732 + style_intent_debug = decision.to_dict()
905 733  
906 734 raw_score = hit.get("_score")
907 735 try:
... ... @@ -940,6 +768,9 @@ class Searcher:
940 768 debug_entry["fused_score"] = rerank_debug.get("fused_score")
941 769 debug_entry["matched_queries"] = rerank_debug.get("matched_queries")
942 770  
  771 + if style_intent_debug:
  772 + debug_entry["style_intent_sku"] = style_intent_debug
  773 +
943 774 per_result_debug.append(debug_entry)
944 775  
945 776 # Format facets
... ... @@ -987,7 +818,8 @@ class Searcher:
987 818 "translations": context.query_analysis.translations,
988 819 "has_vector": context.query_analysis.query_vector is not None,
989 820 "is_simple_query": context.query_analysis.is_simple_query,
990   - "domain": context.query_analysis.domain
  821 + "domain": context.query_analysis.domain,
  822 + "style_intent_profile": context.get_intermediate_result("style_intent_profile"),
991 823 },
992 824 "es_query": context.get_intermediate_result('es_query', {}),
993 825 "es_response": {
... ...
search/sku_intent_selector.py 0 → 100644
... ... @@ -0,0 +1,405 @@
  1 +"""
  2 +SKU selection for style-intent-aware search results.
  3 +"""
  4 +
from __future__ import annotations

import logging
from dataclasses import dataclass, field
from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Tuple

import numpy as np

from query.style_intent import StyleIntentProfile, StyleIntentRegistry
from query.tokenization import normalize_query_text
  14 +
  15 +
@dataclass(frozen=True)
class SkuSelectionDecision:
    """Immutable record of which SKU was chosen for an SPU and at which stage.

    `matched_stage` is one of "direct", "generalized", "embedding_from_matches"
    or "embedding_from_all"; `similarity_score` is only set for embedding stages.
    """

    selected_sku_id: Optional[str]
    rerank_suffix: str
    selected_text: str
    matched_stage: str
    similarity_score: Optional[float] = None
    resolved_dimensions: Dict[str, Optional[str]] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize for debug payloads; the dimension map is copied defensively."""
        payload: Dict[str, Any] = {
            "selected_sku_id": self.selected_sku_id,
            "rerank_suffix": self.rerank_suffix,
            "selected_text": self.selected_text,
            "matched_stage": self.matched_stage,
            "similarity_score": self.similarity_score,
        }
        payload["resolved_dimensions"] = dict(self.resolved_dimensions)
        return payload
  34 +
  35 +
@dataclass
class _SkuCandidate:
    """Internal per-SKU working record built once per SPU during selection."""

    # Position of the SKU within the SPU's `skus` list (used when promoting).
    index: int
    # Stringified `sku_id`; empty string when the SKU row has none.
    sku_id: str
    # The raw SKU dict from the ES `_source`.
    sku: Dict[str, Any]
    # De-duplicated, space-joined option values used for embedding similarity.
    selection_text: str
    # Per-intent-type option text (falls back to all option values joined).
    intent_texts: Dict[str, str]
  43 +
  44 +
class StyleSkuSelector:
    """Selects the best SKU for an SPU based on detected style intent.

    Selection runs in stages per SPU (see `_select_for_source`):
      1. "direct"      — every intent dimension's SKU text occurs verbatim in a
                         normalized query text, and exactly one SKU qualifies;
      2. "generalized" — dictionary/canonical matching through the registry,
                         again requiring a unique winner;
      3. embedding     — similarity between the query vector and candidate SKU
                         texts breaks ties (pool = the matched SKUs, or all
                         SKUs when no earlier stage matched).

    The chosen SKU is promoted to the front of `_source["skus"]` and its image
    replaces the SPU-level image. Embedding calls are best-effort: failures are
    logged and degrade to a deterministic fallback instead of aborting search.
    """

    def __init__(
        self,
        registry: StyleIntentRegistry,
        *,
        text_encoder_getter: Optional[Callable[[], Any]] = None,
        tokenizer_getter: Optional[Callable[[], Any]] = None,
    ) -> None:
        # Getters are lazy so the selector never pins the encoder/tokenizer;
        # they may be absent (tests) or swapped at runtime.
        self.registry = registry
        self._text_encoder_getter = text_encoder_getter
        self._tokenizer_getter = tokenizer_getter

    def prepare_hits(
        self,
        es_hits: List[Dict[str, Any]],
        parsed_query: Any,
    ) -> Dict[str, SkuSelectionDecision]:
        """Select and promote a SKU for each hit; return decisions keyed by doc id.

        No-op (empty dict) when the parsed query carries no active
        StyleIntentProfile. Mutates each hit's `_source` in place and attaches
        the rerank suffix as `hit["_style_rerank_suffix"]`.
        """
        decisions: Dict[str, SkuSelectionDecision] = {}
        style_profile = getattr(parsed_query, "style_intent_profile", None)
        if not isinstance(style_profile, StyleIntentProfile) or not style_profile.is_active:
            return decisions

        query_texts = self._build_query_texts(parsed_query, style_profile)
        query_vector = self._get_query_vector(parsed_query)
        tokenizer = self._get_tokenizer()

        for hit in es_hits:
            source = hit.get("_source")
            if not isinstance(source, dict):
                continue

            decision = self._select_for_source(
                source,
                style_profile=style_profile,
                query_texts=query_texts,
                query_vector=query_vector,
                tokenizer=tokenizer,
            )
            if decision is None:
                continue

            self._apply_decision_to_source(source, decision)
            if decision.rerank_suffix:
                hit["_style_rerank_suffix"] = decision.rerank_suffix

            doc_id = hit.get("_id")
            if doc_id is not None:
                decisions[str(doc_id)] = decision

        return decisions

    def apply_precomputed_decisions(
        self,
        es_hits: List[Dict[str, Any]],
        decisions: Dict[str, SkuSelectionDecision],
    ) -> None:
        """Re-apply earlier decisions to (possibly re-fetched) hits by doc id.

        Used when `_source` was replaced after the initial selection (e.g. the
        final detail fill) so the promoted SKU ordering is restored.
        """
        if not es_hits or not decisions:
            return

        for hit in es_hits:
            doc_id = hit.get("_id")
            if doc_id is None:
                continue
            decision = decisions.get(str(doc_id))
            if decision is None:
                continue
            source = hit.get("_source")
            if not isinstance(source, dict):
                continue
            self._apply_decision_to_source(source, decision)
            if decision.rerank_suffix:
                hit["_style_rerank_suffix"] = decision.rerank_suffix

    def _build_query_texts(
        self,
        parsed_query: Any,
        style_profile: StyleIntentProfile,
    ) -> List[str]:
        """Collect normalized query texts for direct matching.

        Prefers the profile's own query variants; falls back to the original /
        normalized / rewritten query plus all translations. Order-preserving
        de-duplication via `dict.fromkeys`.
        """
        texts = [variant.normalized_text for variant in style_profile.query_variants if variant.normalized_text]
        if texts:
            return list(dict.fromkeys(texts))

        fallbacks: List[str] = []
        for value in (
            getattr(parsed_query, "original_query", None),
            getattr(parsed_query, "query_normalized", None),
            getattr(parsed_query, "rewritten_query", None),
        ):
            normalized = normalize_query_text(value)
            if normalized:
                fallbacks.append(normalized)
        translations = getattr(parsed_query, "translations", {}) or {}
        if isinstance(translations, dict):
            for value in translations.values():
                normalized = normalize_query_text(value)
                if normalized:
                    fallbacks.append(normalized)
        return list(dict.fromkeys(fallbacks))

    def _get_query_vector(self, parsed_query: Any) -> Optional[np.ndarray]:
        """Return the query embedding, encoding on demand as a best-effort fallback.

        Returns None when no text/encoder is available or encoding fails;
        callers treat a missing vector as "skip embedding-based selection".
        """
        query_vector = getattr(parsed_query, "query_vector", None)
        if query_vector is not None:
            return np.asarray(query_vector, dtype=np.float32)

        text_encoder = self._get_text_encoder()
        if text_encoder is None:
            return None

        query_text = (
            getattr(parsed_query, "rewritten_query", None)
            or getattr(parsed_query, "query_normalized", None)
            or getattr(parsed_query, "original_query", None)
        )
        if not query_text:
            return None

        try:
            vectors = text_encoder.encode([query_text], priority=1)
        except Exception as exc:  # embedding-service failure must not abort search
            logging.getLogger(__name__).warning(
                "Failed to encode query vector for style-intent SKU matching: %s",
                exc,
                exc_info=True,
            )
            return None
        if vectors is None or len(vectors) == 0 or vectors[0] is None:
            return None
        return np.asarray(vectors[0], dtype=np.float32)

    def _get_text_encoder(self) -> Any:
        """Resolve the (optional) text encoder via the injected getter."""
        if self._text_encoder_getter is None:
            return None
        return self._text_encoder_getter()

    def _get_tokenizer(self) -> Any:
        """Resolve the (optional) tokenizer via the injected getter."""
        if self._tokenizer_getter is None:
            return None
        return self._tokenizer_getter()

    @staticmethod
    def _fallback_sku_text(sku: Dict[str, Any]) -> str:
        """Join all non-empty option values of a SKU into one selection text."""
        parts: List[str] = []
        for field_name in ("option1_value", "option2_value", "option3_value"):
            value = str(sku.get(field_name) or "").strip()
            if value:
                parts.append(value)
        return " ".join(parts)

    def _resolve_dimensions(
        self,
        source: Dict[str, Any],
        style_profile: StyleIntentProfile,
    ) -> Dict[str, Optional[str]]:
        """Map each intent type to the SKU option field whose SPU-level name
        matches one of the dimension aliases (e.g. "color" -> "option1_value").

        Value is None when no option name matched; callers then fall back to
        the full option-value text for that intent.
        """
        option_names = {
            "option1_value": normalize_query_text(source.get("option1_name")),
            "option2_value": normalize_query_text(source.get("option2_name")),
            "option3_value": normalize_query_text(source.get("option3_name")),
        }
        resolved: Dict[str, Optional[str]] = {}
        for intent in style_profile.intents:
            if intent.intent_type in resolved:
                continue
            aliases = set(intent.dimension_aliases or self.registry.get_dimension_aliases(intent.intent_type))
            matched_field = None
            for field_name, option_name in option_names.items():
                if option_name and option_name in aliases:
                    matched_field = field_name
                    break
            resolved[intent.intent_type] = matched_field
        return resolved

    def _build_candidates(
        self,
        skus: List[Dict[str, Any]],
        resolved_dimensions: Dict[str, Optional[str]],
    ) -> List[_SkuCandidate]:
        """Build one `_SkuCandidate` per SKU with per-intent and combined texts."""
        candidates: List[_SkuCandidate] = []
        for index, sku in enumerate(skus):
            fallback_text = self._fallback_sku_text(sku)
            intent_texts: Dict[str, str] = {}
            for intent_type, field_name in resolved_dimensions.items():
                if field_name:
                    value = str(sku.get(field_name) or "").strip()
                    # Empty resolved value degrades to the combined option text.
                    intent_texts[intent_type] = value or fallback_text
                else:
                    intent_texts[intent_type] = fallback_text

            # De-duplicate by normalized form, keeping the original casing.
            selection_parts: List[str] = []
            seen = set()
            for value in intent_texts.values():
                normalized = normalize_query_text(value)
                if not normalized or normalized in seen:
                    continue
                seen.add(normalized)
                selection_parts.append(str(value).strip())

            selection_text = " ".join(selection_parts).strip() or fallback_text
            candidates.append(
                _SkuCandidate(
                    index=index,
                    sku_id=str(sku.get("sku_id") or ""),
                    sku=sku,
                    selection_text=selection_text,
                    intent_texts=intent_texts,
                )
            )
        return candidates

    @staticmethod
    def _is_direct_match(
        candidate: _SkuCandidate,
        query_texts: Sequence[str],
    ) -> bool:
        """True only when EVERY intent dimension's text appears in some query text."""
        if not candidate.intent_texts or not query_texts:
            return False
        for value in candidate.intent_texts.values():
            normalized_value = normalize_query_text(value)
            if not normalized_value:
                return False
            if not any(normalized_value in query_text for query_text in query_texts):
                return False
        return True

    def _is_generalized_match(
        self,
        candidate: _SkuCandidate,
        style_profile: StyleIntentProfile,
        tokenizer: Any,
    ) -> bool:
        """True when every intent dimension maps (via the registry dictionary)
        onto at least one canonical value requested by the query profile."""
        if not candidate.intent_texts:
            return False

        for intent_type, value in candidate.intent_texts.items():
            definition = self.registry.get_definition(intent_type)
            if definition is None:
                return False
            matched_canonicals = definition.match_text(value, tokenizer=tokenizer)
            if not matched_canonicals.intersection(style_profile.get_canonical_values(intent_type)):
                return False
        return True

    def _select_by_embedding(
        self,
        candidates: Sequence[_SkuCandidate],
        query_vector: Optional[np.ndarray],
    ) -> Tuple[Optional[_SkuCandidate], Optional[float]]:
        """Pick the candidate whose selection text embeds closest to the query.

        Best-effort: when the encoder or query vector is unavailable, or the
        embedding call fails, the first candidate is returned with no score so
        the caller still gets a deterministic choice.
        """
        if not candidates:
            return None, None
        text_encoder = self._get_text_encoder()
        if query_vector is None or text_encoder is None:
            return candidates[0], None

        # Normalize once per candidate; encode each distinct text exactly once.
        normalized_by_candidate = [
            normalize_query_text(candidate.selection_text) for candidate in candidates
        ]
        unique_texts = list(dict.fromkeys(text for text in normalized_by_candidate if text))
        if not unique_texts:
            return candidates[0], None

        try:
            vectors = text_encoder.encode(unique_texts, priority=1)
        except Exception as exc:  # embedding failure must not abort the search
            logging.getLogger(__name__).warning(
                "Failed to encode SKU option texts for style-intent selection: %s",
                exc,
                exc_info=True,
            )
            return candidates[0], None

        vector_map: Dict[str, np.ndarray] = {}
        for key, vector in zip(unique_texts, vectors):
            if vector is None:
                continue
            vector_map[key] = np.asarray(vector, dtype=np.float32)

        best_candidate: Optional[_SkuCandidate] = None
        best_score: Optional[float] = None
        query_vector_array = np.asarray(query_vector, dtype=np.float32)
        for candidate, normalized_text in zip(candidates, normalized_by_candidate):
            candidate_vector = vector_map.get(normalized_text)
            if candidate_vector is None:
                continue
            score = float(np.inner(query_vector_array, candidate_vector))
            if best_score is None or score > best_score:
                best_candidate = candidate
                best_score = score

        return best_candidate or candidates[0], best_score

    def _select_for_source(
        self,
        source: Dict[str, Any],
        *,
        style_profile: StyleIntentProfile,
        query_texts: Sequence[str],
        query_vector: Optional[np.ndarray],
        tokenizer: Any,
    ) -> Optional[SkuSelectionDecision]:
        """Run the staged selection for one SPU `_source`; None when no SKUs.

        A stage "wins" only with a unique match; ambiguity (or no match) falls
        through to embedding similarity over the narrowest non-empty pool.
        """
        skus = source.get("skus")
        if not isinstance(skus, list) or not skus:
            return None

        resolved_dimensions = self._resolve_dimensions(source, style_profile)
        candidates = self._build_candidates(skus, resolved_dimensions)
        if not candidates:
            return None

        direct_matches = [candidate for candidate in candidates if self._is_direct_match(candidate, query_texts)]
        if len(direct_matches) == 1:
            chosen = direct_matches[0]
            return self._build_decision(chosen, resolved_dimensions, matched_stage="direct")

        generalized_matches: List[_SkuCandidate] = []
        if not direct_matches:
            generalized_matches = [
                candidate
                for candidate in candidates
                if self._is_generalized_match(candidate, style_profile, tokenizer)
            ]
            if len(generalized_matches) == 1:
                chosen = generalized_matches[0]
                return self._build_decision(chosen, resolved_dimensions, matched_stage="generalized")

        embedding_pool = direct_matches or generalized_matches or candidates
        chosen, similarity_score = self._select_by_embedding(embedding_pool, query_vector)
        if chosen is None:
            return None
        stage = "embedding_from_matches" if direct_matches or generalized_matches else "embedding_from_all"
        return self._build_decision(
            chosen,
            resolved_dimensions,
            matched_stage=stage,
            similarity_score=similarity_score,
        )

    @staticmethod
    def _build_decision(
        candidate: _SkuCandidate,
        resolved_dimensions: Dict[str, Optional[str]],
        *,
        matched_stage: str,
        similarity_score: Optional[float] = None,
    ) -> SkuSelectionDecision:
        """Freeze a candidate into an immutable, serializable decision."""
        return SkuSelectionDecision(
            selected_sku_id=candidate.sku_id or None,
            rerank_suffix=str(candidate.selection_text or "").strip(),
            selected_text=str(candidate.selection_text or "").strip(),
            matched_stage=matched_stage,
            similarity_score=similarity_score,
            resolved_dimensions=dict(resolved_dimensions),
        )

    @staticmethod
    def _apply_decision_to_source(source: Dict[str, Any], decision: SkuSelectionDecision) -> None:
        """Promote the decided SKU to the front of `skus` and swap the SPU image.

        Match is by `sku_id`, so re-applying after a `_source` refetch works as
        long as ids are stable. No-op when the id is missing or not found.
        """
        skus = source.get("skus")
        if not isinstance(skus, list) or not skus or not decision.selected_sku_id:
            return

        selected_index = None
        for index, sku in enumerate(skus):
            if str(sku.get("sku_id") or "") == decision.selected_sku_id:
                selected_index = index
                break
        if selected_index is None:
            return

        selected_sku = skus.pop(selected_index)
        skus.insert(0, selected_sku)

        image_src = selected_sku.get("image_src") or selected_sku.get("imageSrc")
        if image_src:
            source["image_url"] = image_src
... ...
tests/test_search_rerank_window.py
... ... @@ -18,6 +18,7 @@ from config import (
18 18 SearchConfig,
19 19 )
20 20 from context import create_request_context
  21 +from query.style_intent import DetectedStyleIntent, StyleIntentProfile
21 22 from search.searcher import Searcher
22 23  
23 24  
... ... @@ -30,6 +31,7 @@ class _FakeParsedQuery:
30 31 translations: Dict[str, str] = None
31 32 query_vector: Any = None
32 33 domain: str = "default"
  34 + style_intent_profile: Any = None
33 35  
34 36 def to_dict(self) -> Dict[str, Any]:
35 37 return {
... ... @@ -39,9 +41,27 @@ class _FakeParsedQuery:
39 41 "detected_language": self.detected_language,
40 42 "translations": self.translations or {},
41 43 "domain": self.domain,
  44 + "style_intent_profile": (
  45 + self.style_intent_profile.to_dict() if self.style_intent_profile is not None else None
  46 + ),
42 47 }
43 48  
44 49  
def _build_style_intent_profile(intent_type: str, canonical_value: str, *dimension_aliases: str) -> StyleIntentProfile:
    """Build a single-intent StyleIntentProfile for tests.

    The canonical value doubles as matched term/query text; dimension aliases
    default to the intent type itself when none are given.
    """
    aliases = dimension_aliases or (intent_type,)
    return StyleIntentProfile(
        intents=(
            DetectedStyleIntent(
                intent_type=intent_type,
                canonical_value=canonical_value,
                matched_term=canonical_value,
                matched_query_text=canonical_value,
                dimension_aliases=tuple(aliases),
            ),
        )
    )
  63 +
  64 +
45 65 class _FakeQueryParser:
46 66 def parse(
47 67 self,
... ... @@ -340,6 +360,57 @@ def test_searcher_rerank_prefetch_source_follows_doc_template(monkeypatch):
340 360 assert es_client.calls[0]["body"]["_source"] == {"includes": ["brief", "title", "vendor"]}
341 361  
342 362  
def test_searcher_rerank_prefetch_source_includes_sku_fields_when_style_intent_active(monkeypatch):
    """The rerank prefetch `_source` filter must add SKU/option-name fields
    when the parsed query carries an active style intent."""
    es_client = _FakeESClient()
    searcher = _build_searcher(_build_search_config(rerank_enabled=True), es_client)
    context = create_request_context(reqid="t1c", uid="u1c")

    monkeypatch.setattr(
        "search.searcher.get_tenant_config_loader",
        lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en"]}),
    )
    monkeypatch.setattr(
        "search.rerank_client.run_rerank",
        lambda **kwargs: (kwargs["es_response"], None, []),
    )

    class _IntentQueryParser:
        # No encoder available in this fake parser.
        text_encoder = None

        def parse(
            self,
            query: str,
            tenant_id: str,
            generate_vector: bool,
            context: Any,
            target_languages: Any = None,
        ):
            # Always report an active "color: black" style intent.
            return _FakeParsedQuery(
                original_query=query,
                query_normalized=query,
                rewritten_query=query,
                translations={},
                style_intent_profile=_build_style_intent_profile(
                    "color", "black", "color", "colors", "颜色"
                ),
            )

    searcher.query_parser = _IntentQueryParser()

    searcher.search(
        query="black dress",
        tenant_id="162",
        from_=0,
        size=5,
        context=context,
        enable_rerank=None,
    )

    # First ES call is the prefetch; its _source must include the SKU fields.
    assert es_client.calls[0]["body"]["_source"] == {
        "includes": ["option1_name", "option2_name", "option3_name", "skus", "title"]
    }
  413 +
343 414 def test_searcher_skips_rerank_when_request_explicitly_false(monkeypatch):
344 415 es_client = _FakeESClient()
345 416 searcher = _build_searcher(_build_search_config(rerank_enabled=True), es_client)
... ... @@ -434,6 +505,9 @@ def test_searcher_promotes_sku_when_option1_matches_translated_query(monkeypatch
434 505 query_normalized=query,
435 506 rewritten_query=query,
436 507 translations={"en": "black dress"},
  508 + style_intent_profile=_build_style_intent_profile(
  509 + "color", "black", "color", "colors", "颜色"
  510 + ),
437 511 )
438 512  
439 513 searcher.query_parser = _TranslatedQueryParser()
... ... @@ -481,8 +555,8 @@ def test_searcher_promotes_sku_by_embedding_when_query_has_no_direct_option_matc
481 555 encoder = _FakeTextEncoder(
482 556 {
483 557 "linen summer dress": [0.8, 0.2],
484   - "color:red": [1.0, 0.0],
485   - "color:blue": [0.0, 1.0],
  558 + "red": [1.0, 0.0],
  559 + "blue": [0.0, 1.0],
486 560 }
487 561 )
488 562  
... ... @@ -503,6 +577,9 @@ def test_searcher_promotes_sku_by_embedding_when_query_has_no_direct_option_matc
503 577 rewritten_query=query,
504 578 translations={},
505 579 query_vector=np.array([0.0, 1.0], dtype=np.float32),
  580 + style_intent_profile=_build_style_intent_profile(
  581 + "color", "blue", "color", "colors", "颜色"
  582 + ),
506 583 )
507 584  
508 585 searcher.query_parser = _EmbeddingQueryParser()
... ...
tests/test_style_intent.py 0 → 100644
... ... @@ -0,0 +1,35 @@
  1 +from types import SimpleNamespace
  2 +
  3 +from config import QueryConfig
  4 +from query.style_intent import StyleIntentDetector, StyleIntentRegistry
  5 +
  6 +
def test_style_intent_detector_matches_original_and_translated_queries():
    """The detector should surface intents found in the raw query as well as
    in its translations, producing one query variant per distinct text."""
    config = QueryConfig(
        style_intent_terms={
            "color": [["black", "黑色", "black"]],
            "size": [["xl", "x-large", "加大码"]],
        },
        style_intent_dimension_aliases={
            "color": ["color", "颜色"],
            "size": ["size", "尺码"],
        },
    )
    detector = StyleIntentDetector(
        StyleIntentRegistry.from_query_config(config),
        tokenizer=str.split,
    )

    # Minimal parsed-query stand-in: Chinese original plus an English translation.
    fake_parsed = SimpleNamespace(
        original_query="黑色 连衣裙",
        query_normalized="黑色 连衣裙",
        rewritten_query="黑色 连衣裙",
        translations={"en": "black dress xl"},
    )

    profile = detector.detect(fake_parsed)

    assert profile.is_active is True
    assert profile.get_canonical_values("color") == {"black"}
    assert profile.get_canonical_values("size") == {"xl"}
    assert len(profile.query_variants) == 2
... ...