Commit cda1cd6231ec713689f779d3a0f464b582f47110
1 parent
dad3c867
意图分析&应用 baseline
Showing
15 changed files
with
1254 additions
and
292 deletions
Show diff stats
config/config.yaml
| ... | ... | @@ -17,9 +17,9 @@ runtime: |
| 17 | 17 | embedding_port: 6005 |
| 18 | 18 | embedding_text_port: 6005 |
| 19 | 19 | embedding_image_port: 6008 |
| 20 | - translator_host: "127.0.0.1" | |
| 20 | + translator_host: "0.0.0.0" | |
| 21 | 21 | translator_port: 6006 |
| 22 | - reranker_host: "127.0.0.1" | |
| 22 | + reranker_host: "0.0.0.0" | |
| 23 | 23 | reranker_port: 6007 |
| 24 | 24 | |
| 25 | 25 | # 基础设施连接(敏感项优先读环境变量:ES_*、REDIS_*、DB_*、DASHSCOPE_API_KEY、DEEPL_AUTH_KEY) |
| ... | ... | @@ -116,6 +116,14 @@ query_config: |
| 116 | 116 | translation_embedding_wait_budget_ms_source_in_index: 500 # 80 |
| 117 | 117 | translation_embedding_wait_budget_ms_source_not_in_index: 500 #200 |
| 118 | 118 | |
| 119 | + style_intent: | |
| 120 | + enabled: true | |
| 121 | + color_dictionary_path: "config/dictionaries/style_intent_color.csv" | |
| 122 | + size_dictionary_path: "config/dictionaries/style_intent_size.csv" | |
| 123 | + dimension_aliases: | |
| 124 | + color: ["color", "colors", "colour", "colours", "颜色", "色", "色系"] | |
| 125 | + size: ["size", "sizes", "sizing", "尺码", "尺寸", "码数", "号码", "码"] | |
| 126 | + | |
| 119 | 127 | # 动态多语言检索字段配置 |
| 120 | 128 | # multilingual_fields 会被拼成 title.{lang}/brief.{lang}/... 形式; |
| 121 | 129 | # shared_fields 为无语言后缀字段。 | ... | ... |
| ... | ... | @@ -0,0 +1,15 @@ |
| 1 | +black,black,blk,黑,黑色 | |
| 2 | +white,white,wht,白,白色 | |
| 3 | +red,red,reddish,红,红色 | |
| 4 | +blue,blue,blu,蓝,蓝色 | |
| 5 | +green,green,grn,绿,绿色 | |
| 6 | +yellow,yellow,ylw,黄,黄色 | |
| 7 | +pink,pink,粉,粉色 | |
| 8 | +purple,purple,violet,紫,紫色 | |
| 9 | +gray,gray,grey,灰,灰色 | |
| 10 | +brown,brown,棕,棕色,咖啡色 | |
| 11 | +beige,beige,khaki,米色,卡其色 | |
| 12 | +navy,navy,navy blue,藏青,藏蓝,深蓝 | |
| 13 | +silver,silver,银,银色 | |
| 14 | +gold,gold,金,金色 | |
| 15 | +orange,orange,橙,橙色 | ... | ... |
config/loader.py
| ... | ... | @@ -95,6 +95,29 @@ def _read_rewrite_dictionary(path: Path) -> Dict[str, str]: |
| 95 | 95 | return rewrite_dict |
| 96 | 96 | |
| 97 | 97 | |
| 98 | +def _read_synonym_csv_dictionary(path: Path) -> List[List[str]]: | |
| 99 | + rows: List[List[str]] = [] | |
| 100 | + if not path.exists(): | |
| 101 | + return rows | |
| 102 | + | |
| 103 | + with open(path, "r", encoding="utf-8") as handle: | |
| 104 | + for raw_line in handle: | |
| 105 | + line = raw_line.strip() | |
| 106 | + if not line or line.startswith("#"): | |
| 107 | + continue | |
| 108 | + parts = [segment.strip() for segment in line.split(",")] | |
| 109 | + normalized = [segment for segment in parts if segment] | |
| 110 | + if normalized: | |
| 111 | + rows.append(normalized) | |
| 112 | + return rows | |
| 113 | + | |
| 114 | + | |
| 115 | +_DEFAULT_STYLE_INTENT_DIMENSION_ALIASES: Dict[str, List[str]] = { | |
| 116 | + "color": ["color", "colors", "colour", "colours", "颜色", "色", "色系"], | |
| 117 | + "size": ["size", "sizes", "sizing", "尺码", "尺寸", "码数", "号码", "码"], | |
| 118 | +} | |
| 119 | + | |
| 120 | + | |
| 98 | 121 | class AppConfigLoader: |
| 99 | 122 | """Load the unified application configuration.""" |
| 100 | 123 | |
| ... | ... | @@ -253,6 +276,45 @@ class AppConfigLoader: |
| 253 | 276 | if isinstance(query_cfg.get("text_query_strategy"), dict) |
| 254 | 277 | else {} |
| 255 | 278 | ) |
| 279 | + style_intent_cfg = ( | |
| 280 | + query_cfg.get("style_intent") | |
| 281 | + if isinstance(query_cfg.get("style_intent"), dict) | |
| 282 | + else {} | |
| 283 | + ) | |
| 284 | + | |
| 285 | + def _resolve_project_path(value: Any, default_path: Path) -> Path: | |
| 286 | + if value in (None, ""): | |
| 287 | + return default_path | |
| 288 | + candidate = Path(str(value)) | |
| 289 | + if candidate.is_absolute(): | |
| 290 | + return candidate | |
| 291 | + return self.project_root / candidate | |
| 292 | + | |
| 293 | + style_color_path = _resolve_project_path( | |
| 294 | + style_intent_cfg.get("color_dictionary_path"), | |
| 295 | + self.config_dir / "dictionaries" / "style_intent_color.csv", | |
| 296 | + ) | |
| 297 | + style_size_path = _resolve_project_path( | |
| 298 | + style_intent_cfg.get("size_dictionary_path"), | |
| 299 | + self.config_dir / "dictionaries" / "style_intent_size.csv", | |
| 300 | + ) | |
| 301 | + configured_dimension_aliases = ( | |
| 302 | + style_intent_cfg.get("dimension_aliases") | |
| 303 | + if isinstance(style_intent_cfg.get("dimension_aliases"), dict) | |
| 304 | + else {} | |
| 305 | + ) | |
| 306 | + style_dimension_aliases: Dict[str, List[str]] = {} | |
| 307 | + for intent_type, default_aliases in _DEFAULT_STYLE_INTENT_DIMENSION_ALIASES.items(): | |
| 308 | + aliases = configured_dimension_aliases.get(intent_type) | |
| 309 | + if isinstance(aliases, list) and aliases: | |
| 310 | + style_dimension_aliases[intent_type] = [str(alias) for alias in aliases if str(alias).strip()] | |
| 311 | + else: | |
| 312 | + style_dimension_aliases[intent_type] = list(default_aliases) | |
| 313 | + | |
| 314 | + style_intent_terms = { | |
| 315 | + "color": _read_synonym_csv_dictionary(style_color_path), | |
| 316 | + "size": _read_synonym_csv_dictionary(style_size_path), | |
| 317 | + } | |
| 256 | 318 | query_config = QueryConfig( |
| 257 | 319 | supported_languages=list(query_cfg.get("supported_languages") or ["zh", "en"]), |
| 258 | 320 | default_language=str(query_cfg.get("default_language") or "en"), |
| ... | ... | @@ -324,6 +386,9 @@ class AppConfigLoader: |
| 324 | 386 | translation_embedding_wait_budget_ms_source_not_in_index=int( |
| 325 | 387 | query_cfg.get("translation_embedding_wait_budget_ms_source_not_in_index", 200) |
| 326 | 388 | ), |
| 389 | + style_intent_enabled=bool(style_intent_cfg.get("enabled", True)), | |
| 390 | + style_intent_terms=style_intent_terms, | |
| 391 | + style_intent_dimension_aliases=style_dimension_aliases, | |
| 327 | 392 | ) |
| 328 | 393 | |
| 329 | 394 | function_score_cfg = raw.get("function_score") if isinstance(raw.get("function_score"), dict) else {} | ... | ... |
config/schema.py
| ... | ... | @@ -64,6 +64,9 @@ class QueryConfig: |
| 64 | 64 | # 检测语言不在 index_languages 内:翻译对召回更关键,预算较长。 |
| 65 | 65 | translation_embedding_wait_budget_ms_source_in_index: int = 80 |
| 66 | 66 | translation_embedding_wait_budget_ms_source_not_in_index: int = 200 |
| 67 | + style_intent_enabled: bool = True | |
| 68 | + style_intent_terms: Dict[str, List[List[str]]] = field(default_factory=dict) | |
| 69 | + style_intent_dimension_aliases: Dict[str, List[str]] = field(default_factory=dict) | |
| 67 | 70 | |
| 68 | 71 | |
| 69 | 72 | @dataclass(frozen=True) | ... | ... |
docs/TODO-意图判断.md
| ... | ... | @@ -39,3 +39,15 @@ intent 考虑由 QueryParser 编排、具体实现拆成独立模块,注意好 |
| 39 | 39 | |
| 40 | 40 | 5. TODO: 搜索接口里,results[].skus 不是全量子 SKU:由 sku_filter_dimension 控制在应用层按维度分组折叠,每个「维度取值组合」只保留一条 SKU(组内第一条)。请求未传该字段时,Pydantic 默认是 ["option1"],等价于只按 option1_value 去重;服务端不会读取店铺主题的「主展示维」,需调用方与装修配置对齐并传入正确维度。因此当用户有款式等更细粒度意图、而款式落在 option2/option3(或对应 option*_name)时,若仍用默认只按 option1(常见为颜色)折叠,同一颜色下多种款式只会出现一条代表 SKU,无法从返回的 skus 里拿到该颜色下的全部款式行。(若业务需要全量子款,需传包含对应维度的 sku_filter_dimension,或传 null/[] 跳过折叠——以当前 ResultFormatter 实现为准。) |
| 41 | 41 | |
| 42 | + | |
| 43 | + | |
| 44 | + | |
| 45 | + | |
| 46 | +GPT5.4 主要思路: | |
| 47 | +我准备按这条线改,尽量把逻辑收束成两个新模块而不是继续堆在 Searcher 里: | |
| 48 | + | |
| 49 | +QueryParser 负责产出统一的“款式意图分析结果”,包含:命中的意图类型、标准化值、命中的 query 变体、分词结果、维度候选词。 | |
| 50 | +独立的 SKU 选择模块负责:维度解析、候选值抽取、三阶段匹配、选中 SKU 后的主图替换,以及生成 rerank 用的属性后缀。 | |
| 51 | +Searcher 只编排:解析 query -> 首次 ES 召回 -> 若有款式意图则对 rerank 窗口内全部 hits 做 SKU 预筛选并补充 rerank doc 文本 -> run_rerank -> 分页 -> page fill -> ResultFormatter。这样旧的页内 SKU 置顶逻辑可以下沉或删除,避免前后两个阶段重复做同一件事。我再补看一下你最近写的意图草稿文档,确认有没有现成词表示例或边界条件,避免我这边把词表格式和你预期做偏。 | |
| 52 | + | |
| 53 | + | ... | ... |
| ... | ... | @@ -0,0 +1,118 @@ |
| 1 | + | |
| 2 | +## prompt | |
| 3 | +参考 @docs/常用查询 - ES.md 帮我查询 tenant_Id=163的@search_products.json (1314-1332) 这些字段的统计分布,也就是这里每个字段的所有的值及其对应的文档数 | |
| 4 | + | |
| 5 | + | |
| 6 | +已按 `docs/常用查询 - ES.md` 的写法,在索引 **`search_products_tenant_163`** 上做了 `terms` 聚合(租户已按索引拆分,用 `match_all` 即可,等价于只查 tenant 163)。集群在本机 `localhost:9200` 上执行成功。 | |
| 7 | + | |
| 8 | +## 结果概要 | |
| 9 | + | |
| 10 | +- **总文档数**:8977 | |
| 11 | +- 六个字段的 **`sum_other_doc_count` 均为 0**,在 `size: 20000` 下没有截断,即:**当前数据里每个字段的取值都已全部列出**。 | |
| 12 | +- **`__missing__`**:来自聚合里的 `"missing": "__missing__"`,表示该字段缺失或为空的文档数(便于和「有值」一起看)。 | |
| 13 | +- **`option1_values` / `option2_values`** 多为多值数组:同一篇文档可落入多个桶,因此 **各桶 `doc_count` 之和会大于 8977**(这是 ES `terms` 的正常语义,表示「含该取值的文档数」,不是互斥分区)。 | |
| 14 | +- **`option1_name` / `option2_name` / `option3_name`** 及 **`option3_values`** 的桶计数之和与文档关系更接近单值/稀疏(见下方校验)。 | |
| 15 | + | |
| 16 | +### `option1_name`(13 个取值) | |
| 17 | + | |
| 18 | +| 值 | 文档数 | | |
| 19 | +|---:|---:| | |
| 20 | +| __missing__ | 4538 | | |
| 21 | +| Size | 2916 | | |
| 22 | +| Color | 1174 | | |
| 23 | +| 颜色 | 244 | | |
| 24 | +| COLOR | 56 | | |
| 25 | +| color | 16 | | |
| 26 | +| Colour | 15 | | |
| 27 | +| Variant | 9 | | |
| 28 | +| Style | 3 | | |
| 29 | +| Colors | 2 | | |
| 30 | +| Scent | 2 | | |
| 31 | +| Cup Size | 1 | | |
| 32 | +| Pattern Name | 1 | | |
| 33 | + | |
| 34 | +### `option2_name`(13 个取值) | |
| 35 | + | |
| 36 | +| 值 | 文档数 | | |
| 37 | +|---:|---:| | |
| 38 | +| __missing__ | 4666 | | |
| 39 | +| Color | 2879 | | |
| 40 | +| Size | 1134 | | |
| 41 | +| 尺码 | 244 | | |
| 42 | +| SIZE | 17 | | |
| 43 | +| size | 16 | | |
| 44 | +| Style | 12 | | |
| 45 | +| Cup Size | 4 | | |
| 46 | +| Item Package Quantity | 1 | | |
| 47 | +| Number of Items | 1 | | |
| 48 | +| Ring Size | 1 | | |
| 49 | +| Scent | 1 | | |
| 50 | +| Team Name | 1 | | |
| 51 | + | |
| 52 | +### `option3_name`(11 个取值) | |
| 53 | + | |
| 54 | +| 值 | 文档数 | | |
| 55 | +|---:|---:| | |
| 56 | +| __missing__ | 8831 | | |
| 57 | +| Fit Type | 52 | | |
| 58 | +| Special Size | 31 | | |
| 59 | +| Number of Items | 17 | | |
| 60 | +| Special Size Type | 16 | | |
| 61 | +| Team Name | 15 | | |
| 62 | +| Material Type | 8 | | |
| 63 | +| Style | 4 | | |
| 64 | +| Configuration | 1 | | |
| 65 | +| Hand Orientation | 1 | | |
| 66 | +| Pattern Name | 1 | | |
| 67 | + | |
| 68 | +### `option1_values` / `option2_values` | |
| 69 | + | |
| 70 | +- **桶数量**:`option1_values` **3463**,`option2_values` **3605**(完整列表过长,不宜贴在对话里)。 | |
| 71 | +- **文档中出现最多的取值(前 10)** | |
| 72 | + - **option1_values**:`__missing__` 4538,其后如 Medium 749、Black 653、Large 635、8/9/White/10/7/Small… | |
| 73 | + - **option2_values**:`__missing__` 4666,其后如 L 1161、M 1158、S 1064、XL 1036、Black 876… | |
| 74 | + | |
| 75 | +### `option3_values`(94 个桶,已全部枚举) | |
| 76 | + | |
| 77 | +除 `__missing__` 8831 外,另有 93 个具体取值;上文 Markdown 表里已列全(从 Standard 到 `standard` 等)。 | |
| 78 | + | |
| 79 | +--- | |
| 80 | + | |
| 81 | +## 可复用的查询(与文档一致) | |
| 82 | + | |
| 83 | +```bash | |
| 84 | +curl -sS -u 'saas:<REDACTED — do not commit real credentials; read from env, e.g. $ES_PASSWORD>' -X GET 'http://localhost:9200/search_products_tenant_163/_search?pretty' \ | |
| 85 | + -H 'Content-Type: application/json' -d '{ | |
| 86 | + "query": { "match_all": {} }, | |
| 87 | + "size": 0, | |
| 88 | + "track_total_hits": true, | |
| 89 | + "aggs": { | |
| 90 | + "option1_name_dist": { | |
| 91 | + "terms": { "field": "option1_name", "size": 20000, "order": { "_count": "desc" }, "missing": "__missing__" } | |
| 92 | + }, | |
| 93 | + "option2_name_dist": { | |
| 94 | + "terms": { "field": "option2_name", "size": 20000, "order": { "_count": "desc" }, "missing": "__missing__" } | |
| 95 | + }, | |
| 96 | + "option3_name_dist": { | |
| 97 | + "terms": { "field": "option3_name", "size": 20000, "order": { "_count": "desc" }, "missing": "__missing__" } | |
| 98 | + }, | |
| 99 | + "option1_values_dist": { | |
| 100 | + "terms": { "field": "option1_values", "size": 20000, "order": { "_count": "desc" }, "missing": "__missing__" } | |
| 101 | + }, | |
| 102 | + "option2_values_dist": { | |
| 103 | + "terms": { "field": "option2_values", "size": 20000, "order": { "_count": "desc" }, "missing": "__missing__" } | |
| 104 | + }, | |
| 105 | + "option3_values_dist": { | |
| 106 | + "terms": { "field": "option3_values", "size": 20000, "order": { "_count": "desc" }, "missing": "__missing__" } | |
| 107 | + } | |
| 108 | + } | |
| 109 | +}' | |
| 110 | +``` | |
| 111 | + | |
| 112 | +若某租户取值超过 2 万,需把 `size` 再加大或用 **composite** 分页扫完全部桶。 | |
| 113 | + | |
| 114 | +## 完整 JSON(含 3463 + 3605 条 value 分布) | |
| 115 | + | |
| 116 | +已整理为:`/tmp/tenant163_option_fields_distribution.json`(约 620KB),结构为按字段分组的 `values: [{ "value", "doc_count" }, ...]`,便于你用脚本或表格工具打开。 | |
| 117 | + | |
| 118 | +如需去掉 `__missing__` 桶,从请求里删掉各聚合中的 `"missing": "__missing__"` 即可。 | |
| 0 | 119 | \ No newline at end of file | ... | ... |
query/query_parser.py
| ... | ... | @@ -12,7 +12,6 @@ from dataclasses import dataclass, field |
| 12 | 12 | from typing import Any, Callable, Dict, List, Optional, Tuple |
| 13 | 13 | import numpy as np |
| 14 | 14 | import logging |
| 15 | -import re | |
| 16 | 15 | from concurrent.futures import ThreadPoolExecutor, wait |
| 17 | 16 | |
| 18 | 17 | from embeddings.text_encoder import TextEmbeddingEncoder |
| ... | ... | @@ -20,25 +19,14 @@ from config import SearchConfig |
| 20 | 19 | from translation import create_translation_client |
| 21 | 20 | from .language_detector import LanguageDetector |
| 22 | 21 | from .query_rewriter import QueryRewriter, QueryNormalizer |
| 22 | +from .style_intent import StyleIntentDetector, StyleIntentProfile, StyleIntentRegistry | |
| 23 | +from .tokenization import extract_token_strings, simple_tokenize_query | |
| 23 | 24 | |
| 24 | 25 | logger = logging.getLogger(__name__) |
| 25 | 26 | |
| 26 | 27 | import hanlp # type: ignore |
| 27 | 28 | |
| 28 | 29 | |
| 29 | -def simple_tokenize_query(text: str) -> List[str]: | |
| 30 | - """ | |
| 31 | - Lightweight tokenizer for suggestion-side heuristics only. | |
| 32 | - | |
| 33 | - - Consecutive CJK characters form one token | |
| 34 | - - Latin / digit runs (with internal hyphens) form tokens | |
| 35 | - """ | |
| 36 | - if not text: | |
| 37 | - return [] | |
| 38 | - pattern = re.compile(r"[\u4e00-\u9fff]+|[A-Za-z0-9_]+(?:-[A-Za-z0-9_]+)*") | |
| 39 | - return pattern.findall(text) | |
| 40 | - | |
| 41 | - | |
| 42 | 30 | @dataclass(slots=True) |
| 43 | 31 | class ParsedQuery: |
| 44 | 32 | """Container for query parser facts.""" |
| ... | ... | @@ -50,6 +38,7 @@ class ParsedQuery: |
| 50 | 38 | translations: Dict[str, str] = field(default_factory=dict) |
| 51 | 39 | query_vector: Optional[np.ndarray] = None |
| 52 | 40 | query_tokens: List[str] = field(default_factory=list) |
| 41 | + style_intent_profile: Optional[StyleIntentProfile] = None | |
| 53 | 42 | |
| 54 | 43 | def to_dict(self) -> Dict[str, Any]: |
| 55 | 44 | """Convert to dictionary representation.""" |
| ... | ... | @@ -60,6 +49,9 @@ class ParsedQuery: |
| 60 | 49 | "detected_language": self.detected_language, |
| 61 | 50 | "translations": self.translations, |
| 62 | 51 | "query_tokens": self.query_tokens, |
| 52 | + "style_intent_profile": ( | |
| 53 | + self.style_intent_profile.to_dict() if self.style_intent_profile is not None else None | |
| 54 | + ), | |
| 63 | 55 | } |
| 64 | 56 | |
| 65 | 57 | |
| ... | ... | @@ -97,6 +89,11 @@ class QueryParser: |
| 97 | 89 | self.language_detector = LanguageDetector() |
| 98 | 90 | self.rewriter = QueryRewriter(config.query_config.rewrite_dictionary) |
| 99 | 91 | self._tokenizer = tokenizer or self._build_tokenizer() |
| 92 | + self.style_intent_registry = StyleIntentRegistry.from_query_config(config.query_config) | |
| 93 | + self.style_intent_detector = StyleIntentDetector( | |
| 94 | + self.style_intent_registry, | |
| 95 | + tokenizer=self._tokenizer, | |
| 96 | + ) | |
| 100 | 97 | |
| 101 | 98 | # Eager initialization (startup-time failure visibility, no lazy init in request path) |
| 102 | 99 | if self.config.query_config.enable_text_embedding and self._text_encoder is None: |
| ... | ... | @@ -172,28 +169,7 @@ class QueryParser: |
| 172 | 169 | @staticmethod |
| 173 | 170 | def _extract_tokens(tokenizer_result: Any) -> List[str]: |
| 174 | 171 | """Normalize tokenizer output into a flat token string list.""" |
| 175 | - if not tokenizer_result: | |
| 176 | - return [] | |
| 177 | - if isinstance(tokenizer_result, str): | |
| 178 | - token = tokenizer_result.strip() | |
| 179 | - return [token] if token else [] | |
| 180 | - | |
| 181 | - tokens: List[str] = [] | |
| 182 | - for item in tokenizer_result: | |
| 183 | - token: Optional[str] = None | |
| 184 | - if isinstance(item, str): | |
| 185 | - token = item | |
| 186 | - elif isinstance(item, (list, tuple)) and item: | |
| 187 | - token = str(item[0]) | |
| 188 | - elif item is not None: | |
| 189 | - token = str(item) | |
| 190 | - | |
| 191 | - if token is None: | |
| 192 | - continue | |
| 193 | - token = token.strip() | |
| 194 | - if token: | |
| 195 | - tokens.append(token) | |
| 196 | - return tokens | |
| 172 | + return extract_token_strings(tokenizer_result) | |
| 197 | 173 | |
| 198 | 174 | def _get_query_tokens(self, query: str) -> List[str]: |
| 199 | 175 | return self._extract_tokens(self._tokenizer(query)) |
| ... | ... | @@ -425,6 +401,22 @@ class QueryParser: |
| 425 | 401 | context.store_intermediate_result("translations", translations) |
| 426 | 402 | |
| 427 | 403 | # Build result |
| 404 | + base_result = ParsedQuery( | |
| 405 | + original_query=query, | |
| 406 | + query_normalized=normalized, | |
| 407 | + rewritten_query=query_text, | |
| 408 | + detected_language=detected_lang, | |
| 409 | + translations=translations, | |
| 410 | + query_vector=query_vector, | |
| 411 | + query_tokens=query_tokens, | |
| 412 | + ) | |
| 413 | + style_intent_profile = self.style_intent_detector.detect(base_result) | |
| 414 | + if context: | |
| 415 | + context.store_intermediate_result( | |
| 416 | + "style_intent_profile", | |
| 417 | + style_intent_profile.to_dict(), | |
| 418 | + ) | |
| 419 | + | |
| 428 | 420 | result = ParsedQuery( |
| 429 | 421 | original_query=query, |
| 430 | 422 | query_normalized=normalized, |
| ... | ... | @@ -433,6 +425,7 @@ class QueryParser: |
| 433 | 425 | translations=translations, |
| 434 | 426 | query_vector=query_vector, |
| 435 | 427 | query_tokens=query_tokens, |
| 428 | + style_intent_profile=style_intent_profile, | |
| 436 | 429 | ) |
| 437 | 430 | |
| 438 | 431 | if context and hasattr(context, 'logger'): | ... | ... |
| ... | ... | @@ -0,0 +1,261 @@ |
| 1 | +""" | |
| 2 | +Style intent detection for query understanding. | |
| 3 | +""" | |
| 4 | + | |
| 5 | +from __future__ import annotations | |
| 6 | + | |
| 7 | +from dataclasses import dataclass, field | |
| 8 | +from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Set, Tuple | |
| 9 | + | |
| 10 | +from .tokenization import TokenizedText, normalize_query_text, tokenize_text | |
| 11 | + | |
| 12 | + | |
| 13 | +@dataclass(frozen=True) | |
| 14 | +class StyleIntentDefinition: | |
| 15 | + intent_type: str | |
| 16 | + term_groups: Tuple[Tuple[str, ...], ...] | |
| 17 | + dimension_aliases: Tuple[str, ...] | |
| 18 | + synonym_to_canonical: Dict[str, str] | |
| 19 | + max_term_ngram: int = 3 | |
| 20 | + | |
| 21 | + @classmethod | |
| 22 | + def from_rows( | |
| 23 | + cls, | |
| 24 | + intent_type: str, | |
| 25 | + rows: Sequence[Sequence[str]], | |
| 26 | + dimension_aliases: Sequence[str], | |
| 27 | + ) -> "StyleIntentDefinition": | |
| 28 | + term_groups: List[Tuple[str, ...]] = [] | |
| 29 | + synonym_to_canonical: Dict[str, str] = {} | |
| 30 | + max_ngram = 1 | |
| 31 | + | |
| 32 | + for row in rows: | |
| 33 | + normalized_terms: List[str] = [] | |
| 34 | + for raw_term in row: | |
| 35 | + term = normalize_query_text(raw_term) | |
| 36 | + if not term or term in normalized_terms: | |
| 37 | + continue | |
| 38 | + normalized_terms.append(term) | |
| 39 | + if not normalized_terms: | |
| 40 | + continue | |
| 41 | + | |
| 42 | + canonical = normalized_terms[0] | |
| 43 | + term_groups.append(tuple(normalized_terms)) | |
| 44 | + for term in normalized_terms: | |
| 45 | + synonym_to_canonical[term] = canonical | |
| 46 | + max_ngram = max(max_ngram, len(term.split())) | |
| 47 | + | |
| 48 | + aliases = tuple( | |
| 49 | + dict.fromkeys( | |
| 50 | + term | |
| 51 | + for term in ( | |
| 52 | + normalize_query_text(alias) | |
| 53 | + for alias in dimension_aliases | |
| 54 | + ) | |
| 55 | + if term | |
| 56 | + ) | |
| 57 | + ) | |
| 58 | + | |
| 59 | + return cls( | |
| 60 | + intent_type=intent_type, | |
| 61 | + term_groups=tuple(term_groups), | |
| 62 | + dimension_aliases=aliases, | |
| 63 | + synonym_to_canonical=synonym_to_canonical, | |
| 64 | + max_term_ngram=max_ngram, | |
| 65 | + ) | |
| 66 | + | |
| 67 | + def match_candidates(self, candidates: Iterable[str]) -> Set[str]: | |
| 68 | + matched: Set[str] = set() | |
| 69 | + for candidate in candidates: | |
| 70 | + canonical = self.synonym_to_canonical.get(normalize_query_text(candidate)) | |
| 71 | + if canonical: | |
| 72 | + matched.add(canonical) | |
| 73 | + return matched | |
| 74 | + | |
| 75 | + def match_text( | |
| 76 | + self, | |
| 77 | + text: str, | |
| 78 | + *, | |
| 79 | + tokenizer: Optional[Callable[[str], Any]] = None, | |
| 80 | + ) -> Set[str]: | |
| 81 | + bundle = tokenize_text(text, tokenizer=tokenizer, max_ngram=self.max_term_ngram) | |
| 82 | + return self.match_candidates(bundle.candidates) | |
| 83 | + | |
| 84 | + | |
| 85 | +@dataclass(frozen=True) | |
| 86 | +class DetectedStyleIntent: | |
| 87 | + intent_type: str | |
| 88 | + canonical_value: str | |
| 89 | + matched_term: str | |
| 90 | + matched_query_text: str | |
| 91 | + dimension_aliases: Tuple[str, ...] | |
| 92 | + | |
| 93 | + def to_dict(self) -> Dict[str, Any]: | |
| 94 | + return { | |
| 95 | + "intent_type": self.intent_type, | |
| 96 | + "canonical_value": self.canonical_value, | |
| 97 | + "matched_term": self.matched_term, | |
| 98 | + "matched_query_text": self.matched_query_text, | |
| 99 | + "dimension_aliases": list(self.dimension_aliases), | |
| 100 | + } | |
| 101 | + | |
| 102 | + | |
| 103 | +@dataclass(frozen=True) | |
| 104 | +class StyleIntentProfile: | |
| 105 | + query_variants: Tuple[TokenizedText, ...] = field(default_factory=tuple) | |
| 106 | + intents: Tuple[DetectedStyleIntent, ...] = field(default_factory=tuple) | |
| 107 | + | |
| 108 | + @property | |
| 109 | + def is_active(self) -> bool: | |
| 110 | + return bool(self.intents) | |
| 111 | + | |
| 112 | + def get_intents(self, intent_type: Optional[str] = None) -> List[DetectedStyleIntent]: | |
| 113 | + if intent_type is None: | |
| 114 | + return list(self.intents) | |
| 115 | + normalized = normalize_query_text(intent_type) | |
| 116 | + return [intent for intent in self.intents if intent.intent_type == normalized] | |
| 117 | + | |
| 118 | + def get_canonical_values(self, intent_type: str) -> Set[str]: | |
| 119 | + return {intent.canonical_value for intent in self.get_intents(intent_type)} | |
| 120 | + | |
| 121 | + def to_dict(self) -> Dict[str, Any]: | |
| 122 | + return { | |
| 123 | + "active": self.is_active, | |
| 124 | + "intents": [intent.to_dict() for intent in self.intents], | |
| 125 | + "query_variants": [ | |
| 126 | + { | |
| 127 | + "text": variant.text, | |
| 128 | + "normalized_text": variant.normalized_text, | |
| 129 | + "fine_tokens": list(variant.fine_tokens), | |
| 130 | + "coarse_tokens": list(variant.coarse_tokens), | |
| 131 | + "candidates": list(variant.candidates), | |
| 132 | + } | |
| 133 | + for variant in self.query_variants | |
| 134 | + ], | |
| 135 | + } | |
| 136 | + | |
| 137 | + | |
| 138 | +class StyleIntentRegistry: | |
| 139 | + """Holds style intent vocabularies and matching helpers.""" | |
| 140 | + | |
| 141 | + def __init__( | |
| 142 | + self, | |
| 143 | + definitions: Dict[str, StyleIntentDefinition], | |
| 144 | + *, | |
| 145 | + enabled: bool = True, | |
| 146 | + ) -> None: | |
| 147 | + self.definitions = definitions | |
| 148 | + self.enabled = bool(enabled) | |
| 149 | + | |
| 150 | + @classmethod | |
| 151 | + def from_query_config(cls, query_config: Any) -> "StyleIntentRegistry": | |
| 152 | + style_terms = getattr(query_config, "style_intent_terms", {}) or {} | |
| 153 | + dimension_aliases = getattr(query_config, "style_intent_dimension_aliases", {}) or {} | |
| 154 | + definitions: Dict[str, StyleIntentDefinition] = {} | |
| 155 | + | |
| 156 | + for intent_type, rows in style_terms.items(): | |
| 157 | + definition = StyleIntentDefinition.from_rows( | |
| 158 | + intent_type=normalize_query_text(intent_type), | |
| 159 | + rows=rows or [], | |
| 160 | + dimension_aliases=dimension_aliases.get(intent_type, []), | |
| 161 | + ) | |
| 162 | + if definition.synonym_to_canonical: | |
| 163 | + definitions[definition.intent_type] = definition | |
| 164 | + | |
| 165 | + return cls( | |
| 166 | + definitions, | |
| 167 | + enabled=bool(getattr(query_config, "style_intent_enabled", True)), | |
| 168 | + ) | |
| 169 | + | |
| 170 | + def get_definition(self, intent_type: str) -> Optional[StyleIntentDefinition]: | |
| 171 | + return self.definitions.get(normalize_query_text(intent_type)) | |
| 172 | + | |
| 173 | + def get_dimension_aliases(self, intent_type: str) -> Tuple[str, ...]: | |
| 174 | + definition = self.get_definition(intent_type) | |
| 175 | + return definition.dimension_aliases if definition else tuple() | |
| 176 | + | |
| 177 | + | |
| 178 | +class StyleIntentDetector: | |
| 179 | + """Detects style intents from parsed query variants.""" | |
| 180 | + | |
| 181 | + def __init__( | |
| 182 | + self, | |
| 183 | + registry: StyleIntentRegistry, | |
| 184 | + *, | |
| 185 | + tokenizer: Optional[Callable[[str], Any]] = None, | |
| 186 | + ) -> None: | |
| 187 | + self.registry = registry | |
| 188 | + self.tokenizer = tokenizer | |
| 189 | + | |
| 190 | + def _build_query_variants(self, parsed_query: Any) -> Tuple[TokenizedText, ...]: | |
| 191 | + seen = set() | |
| 192 | + variants: List[TokenizedText] = [] | |
| 193 | + texts = [ | |
| 194 | + getattr(parsed_query, "original_query", None), | |
| 195 | + getattr(parsed_query, "query_normalized", None), | |
| 196 | + getattr(parsed_query, "rewritten_query", None), | |
| 197 | + ] | |
| 198 | + | |
| 199 | + translations = getattr(parsed_query, "translations", {}) or {} | |
| 200 | + if isinstance(translations, dict): | |
| 201 | + texts.extend(translations.values()) | |
| 202 | + | |
| 203 | + for raw_text in texts: | |
| 204 | + text = str(raw_text or "").strip() | |
| 205 | + if not text: | |
| 206 | + continue | |
| 207 | + normalized = normalize_query_text(text) | |
| 208 | + if not normalized or normalized in seen: | |
| 209 | + continue | |
| 210 | + seen.add(normalized) | |
| 211 | + variants.append( | |
| 212 | + tokenize_text( | |
| 213 | + text, | |
| 214 | + tokenizer=self.tokenizer, | |
| 215 | + max_ngram=max( | |
| 216 | + (definition.max_term_ngram for definition in self.registry.definitions.values()), | |
| 217 | + default=3, | |
| 218 | + ), | |
| 219 | + ) | |
| 220 | + ) | |
| 221 | + | |
| 222 | + return tuple(variants) | |
| 223 | + | |
| 224 | + def detect(self, parsed_query: Any) -> StyleIntentProfile: | |
| 225 | + if not self.registry.enabled or not self.registry.definitions: | |
| 226 | + return StyleIntentProfile() | |
| 227 | + | |
| 228 | + query_variants = self._build_query_variants(parsed_query) | |
| 229 | + detected: List[DetectedStyleIntent] = [] | |
| 230 | + seen_pairs = set() | |
| 231 | + | |
| 232 | + for variant in query_variants: | |
| 233 | + for intent_type, definition in self.registry.definitions.items(): | |
| 234 | + matched_canonicals = definition.match_candidates(variant.candidates) | |
| 235 | + if not matched_canonicals: | |
| 236 | + continue | |
| 237 | + | |
| 238 | + for candidate in variant.candidates: | |
| 239 | + normalized_candidate = normalize_query_text(candidate) | |
| 240 | + canonical = definition.synonym_to_canonical.get(normalized_candidate) | |
| 241 | + if not canonical or canonical not in matched_canonicals: | |
| 242 | + continue | |
| 243 | + pair = (intent_type, canonical) | |
| 244 | + if pair in seen_pairs: | |
| 245 | + continue | |
| 246 | + seen_pairs.add(pair) | |
| 247 | + detected.append( | |
| 248 | + DetectedStyleIntent( | |
| 249 | + intent_type=intent_type, | |
| 250 | + canonical_value=canonical, | |
| 251 | + matched_term=normalized_candidate, | |
| 252 | + matched_query_text=variant.text, | |
| 253 | + dimension_aliases=definition.dimension_aliases, | |
| 254 | + ) | |
| 255 | + ) | |
| 256 | + break | |
| 257 | + | |
| 258 | + return StyleIntentProfile( | |
| 259 | + query_variants=query_variants, | |
| 260 | + intents=tuple(detected), | |
| 261 | + ) | ... | ... |
| ... | ... | @@ -0,0 +1,122 @@ |
| 1 | +""" | |
| 2 | +Shared tokenization helpers for query understanding. | |
| 3 | +""" | |
| 4 | + | |
| 5 | +from __future__ import annotations | |
| 6 | + | |
| 7 | +from dataclasses import dataclass | |
| 8 | +import re | |
| 9 | +from typing import Any, Callable, Iterable, List, Optional, Sequence, Tuple | |
| 10 | + | |
| 11 | + | |
| 12 | +_TOKEN_PATTERN = re.compile(r"[\u4e00-\u9fff]+|[A-Za-z0-9_]+(?:-[A-Za-z0-9_]+)*") | |
| 13 | + | |
| 14 | + | |
| 15 | +def normalize_query_text(text: Optional[str]) -> str: | |
| 16 | + if text is None: | |
| 17 | + return "" | |
| 18 | + return " ".join(str(text).strip().casefold().split()) | |
| 19 | + | |
| 20 | + | |
| 21 | +def simple_tokenize_query(text: str) -> List[str]: | |
| 22 | + """ | |
| 23 | + Lightweight tokenizer for coarse query matching. | |
| 24 | + | |
| 25 | + - Consecutive CJK characters form one token | |
| 26 | + - Latin / digit runs (with internal hyphens) form tokens | |
| 27 | + """ | |
| 28 | + if not text: | |
| 29 | + return [] | |
| 30 | + return _TOKEN_PATTERN.findall(text) | |
| 31 | + | |
| 32 | + | |
| 33 | +def extract_token_strings(tokenizer_result: Any) -> List[str]: | |
| 34 | + """Normalize tokenizer output into a flat token string list.""" | |
| 35 | + if not tokenizer_result: | |
| 36 | + return [] | |
| 37 | + if isinstance(tokenizer_result, str): | |
| 38 | + token = tokenizer_result.strip() | |
| 39 | + return [token] if token else [] | |
| 40 | + | |
| 41 | + tokens: List[str] = [] | |
| 42 | + for item in tokenizer_result: | |
| 43 | + token: Optional[str] = None | |
| 44 | + if isinstance(item, str): | |
| 45 | + token = item | |
| 46 | + elif isinstance(item, (list, tuple)) and item: | |
| 47 | + token = str(item[0]) | |
| 48 | + elif item is not None: | |
| 49 | + token = str(item) | |
| 50 | + | |
| 51 | + if token is None: | |
| 52 | + continue | |
| 53 | + token = token.strip() | |
| 54 | + if token: | |
| 55 | + tokens.append(token) | |
| 56 | + return tokens | |
| 57 | + | |
| 58 | + | |
| 59 | +def _dedupe_preserve_order(values: Iterable[str]) -> List[str]: | |
| 60 | + result: List[str] = [] | |
| 61 | + seen = set() | |
| 62 | + for value in values: | |
| 63 | + normalized = normalize_query_text(value) | |
| 64 | + if not normalized or normalized in seen: | |
| 65 | + continue | |
| 66 | + seen.add(normalized) | |
| 67 | + result.append(normalized) | |
| 68 | + return result | |
| 69 | + | |
| 70 | + | |
| 71 | +def _build_phrase_candidates(tokens: Sequence[str], max_ngram: int) -> List[str]: | |
| 72 | + if not tokens: | |
| 73 | + return [] | |
| 74 | + | |
| 75 | + phrases: List[str] = [] | |
| 76 | + upper = max(1, int(max_ngram)) | |
| 77 | + for size in range(1, upper + 1): | |
| 78 | + if size > len(tokens): | |
| 79 | + break | |
| 80 | + for start in range(0, len(tokens) - size + 1): | |
| 81 | + phrase = " ".join(tokens[start:start + size]).strip() | |
| 82 | + if phrase: | |
| 83 | + phrases.append(phrase) | |
| 84 | + return phrases | |
| 85 | + | |
| 86 | + | |
@dataclass(frozen=True)
class TokenizedText:
    """Immutable bundle of tokenization results for a single query string."""

    text: str  # original input text, untouched
    normalized_text: str  # normalize_query_text(text)
    fine_tokens: Tuple[str, ...]  # tokens from the optional fine-grained tokenizer
    coarse_tokens: Tuple[str, ...]  # tokens from simple_tokenize_query
    candidates: Tuple[str, ...]  # deduped match candidates: tokens, n-gram phrases, full text
| 94 | + | |
| 95 | + | |
def tokenize_text(
    text: str,
    *,
    tokenizer: Optional[Callable[[str], Any]] = None,
    max_ngram: int = 3,
) -> TokenizedText:
    """Tokenize *text* with the coarse splitter and an optional fine tokenizer.

    The resulting ``candidates`` combine individual tokens, contiguous
    n-gram phrases of up to *max_ngram* tokens, and the normalized full
    text, deduplicated while preserving first-seen order.
    """
    normalized = normalize_query_text(text)
    coarse = _dedupe_preserve_order(simple_tokenize_query(text))

    if tokenizer is not None and text:
        fine = _dedupe_preserve_order(extract_token_strings(tokenizer(text)))
    else:
        fine = []

    merged: List[str] = []
    merged.extend(fine)
    merged.extend(coarse)
    merged.extend(_build_phrase_candidates(fine, max_ngram=max_ngram))
    merged.extend(_build_phrase_candidates(coarse, max_ngram=max_ngram))
    if normalized:
        merged.append(normalized)

    return TokenizedText(
        text=text,
        normalized_text=normalized,
        fine_tokens=tuple(fine),
        coarse_tokens=tuple(coarse),
        candidates=tuple(_dedupe_preserve_order(merged)),
    )
search/rerank_client.py
| ... | ... | @@ -62,11 +62,19 @@ def build_docs_from_hits( |
| 62 | 62 | need_category_path = "{category_path}" in doc_template |
| 63 | 63 | for hit in es_hits: |
| 64 | 64 | src = hit.get("_source") or {} |
| 65 | + title_suffix = str(hit.get("_style_rerank_suffix") or "").strip() | |
| 65 | 66 | if only_title: |
| 66 | - docs.append(pick_lang_text(src.get("title"))) | |
| 67 | + title = pick_lang_text(src.get("title")) | |
| 68 | + if title_suffix: | |
| 69 | + title = f"{title} {title_suffix}".strip() | |
| 70 | + docs.append(title) | |
| 67 | 71 | else: |
| 68 | 72 | values = _SafeDict( |
| 69 | - title=pick_lang_text(src.get("title")), | |
| 73 | + title=( | |
| 74 | + f"{pick_lang_text(src.get('title'))} {title_suffix}".strip() | |
| 75 | + if title_suffix | |
| 76 | + else pick_lang_text(src.get("title")) | |
| 77 | + ), | |
| 70 | 78 | brief=pick_lang_text(src.get("brief")) if need_brief else "", |
| 71 | 79 | vendor=pick_lang_text(src.get("vendor")) if need_vendor else "", |
| 72 | 80 | description=pick_lang_text(src.get("description")) if need_description else "", | ... | ... |
search/searcher.py
| ... | ... | @@ -10,12 +10,13 @@ import time, json |
| 10 | 10 | import logging |
| 11 | 11 | import hashlib |
| 12 | 12 | from string import Formatter |
| 13 | -import numpy as np | |
| 14 | 13 | |
| 15 | 14 | from utils.es_client import ESClient |
| 16 | 15 | from query import QueryParser, ParsedQuery |
| 16 | +from query.style_intent import StyleIntentRegistry | |
| 17 | 17 | from embeddings.image_encoder import CLIPImageEncoder |
| 18 | 18 | from .es_query_builder import ESQueryBuilder |
| 19 | +from .sku_intent_selector import SkuSelectionDecision, StyleSkuSelector | |
| 19 | 20 | from config import SearchConfig |
| 20 | 21 | from config.tenant_config_loader import get_tenant_config_loader |
| 21 | 22 | from context.request_context import RequestContext, RequestContextStage |
| ... | ... | @@ -115,6 +116,12 @@ class Searcher: |
| 115 | 116 | else: |
| 116 | 117 | self.image_encoder = image_encoder |
| 117 | 118 | self.source_fields = config.query_config.source_fields |
| 119 | + self.style_intent_registry = StyleIntentRegistry.from_query_config(self.config.query_config) | |
| 120 | + self.style_sku_selector = StyleSkuSelector( | |
| 121 | + self.style_intent_registry, | |
| 122 | + text_encoder_getter=lambda: getattr(self.query_parser, "text_encoder", None), | |
| 123 | + tokenizer_getter=lambda: getattr(self.query_parser, "_tokenizer", None), | |
| 124 | + ) | |
| 118 | 125 | |
| 119 | 126 | # Query builder - simplified single-layer architecture |
| 120 | 127 | self.query_builder = ESQueryBuilder( |
| ... | ... | @@ -155,7 +162,11 @@ class Searcher: |
| 155 | 162 | return |
| 156 | 163 | es_query["_source"] = {"includes": self.source_fields} |
| 157 | 164 | |
| 158 | - def _resolve_rerank_source_filter(self, doc_template: str) -> Dict[str, Any]: | |
| 165 | + def _resolve_rerank_source_filter( | |
| 166 | + self, | |
| 167 | + doc_template: str, | |
| 168 | + parsed_query: Optional[ParsedQuery] = None, | |
| 169 | + ) -> Dict[str, Any]: | |
| 159 | 170 | """ |
| 160 | 171 | Build a lightweight _source filter for rerank prefetch. |
| 161 | 172 | |
| ... | ... | @@ -182,6 +193,16 @@ class Searcher: |
| 182 | 193 | if not includes: |
| 183 | 194 | includes.add("title") |
| 184 | 195 | |
| 196 | + if self._has_style_intent(parsed_query): | |
| 197 | + includes.update( | |
| 198 | + { | |
| 199 | + "skus", | |
| 200 | + "option1_name", | |
| 201 | + "option2_name", | |
| 202 | + "option3_name", | |
| 203 | + } | |
| 204 | + ) | |
| 205 | + | |
| 185 | 206 | return {"includes": sorted(includes)} |
| 186 | 207 | |
| 187 | 208 | def _fetch_hits_by_ids( |
| ... | ... | @@ -225,256 +246,23 @@ class Searcher: |
| 225 | 246 | return hits_by_id, int(resp.get("took", 0) or 0) |
| 226 | 247 | |
| 227 | 248 | @staticmethod |
| 228 | - def _normalize_sku_match_text(value: Optional[str]) -> str: | |
| 229 | - """Normalize free text for lightweight SKU option matching.""" | |
| 230 | - if value is None: | |
| 231 | - return "" | |
| 232 | - return " ".join(str(value).strip().casefold().split()) | |
| 233 | - | |
| 234 | - @staticmethod | |
| 235 | - def _sku_option1_embedding_key( | |
| 236 | - sku: Dict[str, Any], | |
| 237 | - spu_option1_name: Optional[Any] = None, | |
| 238 | - ) -> Optional[str]: | |
| 239 | - """ | |
| 240 | - Text sent to the embedding service for option1 must be "name:value" | |
| 241 | - (option name from SKU row or SPU-level option1_name). | |
| 242 | - """ | |
| 243 | - value_raw = sku.get("option1_value") | |
| 244 | - if value_raw is None: | |
| 245 | - return None | |
| 246 | - value = str(value_raw).strip() | |
| 247 | - if not value: | |
| 248 | - return None | |
| 249 | - name = sku.get("option1_name") | |
| 250 | - if name is None or not str(name).strip(): | |
| 251 | - name = spu_option1_name | |
| 252 | - name_str = str(name).strip() if name is not None and str(name).strip() else "" | |
| 253 | - if name_str: | |
| 254 | - value = f"{name_str}:{value}" | |
| 255 | - return value.casefold() | |
| 256 | - | |
| 257 | - def _build_sku_query_texts(self, parsed_query: ParsedQuery) -> List[str]: | |
| 258 | - """Collect original and translated query texts for SKU option matching.""" | |
| 259 | - candidates: List[str] = [] | |
| 260 | - for text in ( | |
| 261 | - getattr(parsed_query, "original_query", None), | |
| 262 | - getattr(parsed_query, "query_normalized", None), | |
| 263 | - getattr(parsed_query, "rewritten_query", None), | |
| 264 | - ): | |
| 265 | - normalized = self._normalize_sku_match_text(text) | |
| 266 | - if normalized: | |
| 267 | - candidates.append(normalized) | |
| 268 | - | |
| 269 | - translations = getattr(parsed_query, "translations", {}) or {} | |
| 270 | - if isinstance(translations, dict): | |
| 271 | - for text in translations.values(): | |
| 272 | - normalized = self._normalize_sku_match_text(text) | |
| 273 | - if normalized: | |
| 274 | - candidates.append(normalized) | |
| 275 | - | |
| 276 | - deduped: List[str] = [] | |
| 277 | - seen = set() | |
| 278 | - for text in candidates: | |
| 279 | - if text in seen: | |
| 280 | - continue | |
| 281 | - seen.add(text) | |
| 282 | - deduped.append(text) | |
| 283 | - return deduped | |
| 284 | - | |
| 285 | - def _find_query_matching_sku_index( | |
| 286 | - self, | |
| 287 | - skus: List[Dict[str, Any]], | |
| 288 | - query_texts: List[str], | |
| 289 | - spu_option1_name: Optional[Any] = None, | |
| 290 | - ) -> Optional[int]: | |
| 291 | - """Return the first SKU whose option1_value (or name:value) appears in query texts.""" | |
| 292 | - if not skus or not query_texts: | |
| 293 | - return None | |
| 294 | - | |
| 295 | - for index, sku in enumerate(skus): | |
| 296 | - option1_value = self._normalize_sku_match_text(sku.get("option1_value")) | |
| 297 | - if not option1_value: | |
| 298 | - continue | |
| 299 | - if any(option1_value in query_text for query_text in query_texts): | |
| 300 | - return index | |
| 301 | - embed_key = self._sku_option1_embedding_key(sku, spu_option1_name) | |
| 302 | - if embed_key and embed_key != option1_value: | |
| 303 | - composite_norm = self._normalize_sku_match_text(embed_key.replace(":", " ")) | |
| 304 | - if any(composite_norm in query_text for query_text in query_texts): | |
| 305 | - return index | |
| 306 | - if any(embed_key.casefold() in query_text for query_text in query_texts): | |
| 307 | - return index | |
| 308 | - return None | |
| 309 | - | |
| 310 | - def _encode_query_vector_for_sku_matching( | |
| 311 | - self, | |
| 312 | - parsed_query: ParsedQuery, | |
| 313 | - context: Optional[RequestContext] = None, | |
| 314 | - ) -> Optional[np.ndarray]: | |
| 315 | - """Best-effort fallback query embedding for final-page SKU matching.""" | |
| 316 | - query_text = ( | |
| 317 | - getattr(parsed_query, "rewritten_query", None) | |
| 318 | - or getattr(parsed_query, "query_normalized", None) | |
| 319 | - or getattr(parsed_query, "original_query", None) | |
| 320 | - ) | |
| 321 | - if not query_text: | |
| 322 | - return None | |
| 323 | - | |
| 324 | - text_encoder = getattr(self.query_parser, "text_encoder", None) | |
| 325 | - if text_encoder is None: | |
| 326 | - return None | |
| 327 | - | |
| 328 | - try: | |
| 329 | - vectors = text_encoder.encode([query_text], priority=1) | |
| 330 | - except Exception as exc: | |
| 331 | - logger.warning("Failed to encode query vector for SKU matching: %s", exc, exc_info=True) | |
| 332 | - if context is not None: | |
| 333 | - context.add_warning(f"SKU query embedding failed: {exc}") | |
| 334 | - return None | |
| 335 | - | |
| 336 | - if vectors is None or len(vectors) == 0: | |
| 337 | - return None | |
| 338 | - | |
| 339 | - vector = vectors[0] | |
| 340 | - if vector is None: | |
| 341 | - return None | |
| 342 | - return np.asarray(vector, dtype=np.float32) | |
| 343 | - | |
| 344 | - def _select_sku_by_embedding( | |
| 345 | - self, | |
| 346 | - skus: List[Dict[str, Any]], | |
| 347 | - option1_vectors: Dict[str, np.ndarray], | |
| 348 | - query_vector: np.ndarray, | |
| 349 | - spu_option1_name: Optional[Any] = None, | |
| 350 | - ) -> Tuple[Optional[int], Optional[float]]: | |
| 351 | - """Select the SKU whose option1 embedding key (name:value) is most similar to the query.""" | |
| 352 | - best_index: Optional[int] = None | |
| 353 | - best_score: Optional[float] = None | |
| 354 | - | |
| 355 | - for index, sku in enumerate(skus): | |
| 356 | - embed_key = self._sku_option1_embedding_key(sku, spu_option1_name) | |
| 357 | - if not embed_key: | |
| 358 | - continue | |
| 359 | - option_vector = option1_vectors.get(embed_key) | |
| 360 | - if option_vector is None: | |
| 361 | - continue | |
| 362 | - score = float(np.inner(query_vector, option_vector)) | |
| 363 | - if best_score is None or score > best_score: | |
| 364 | - best_index = index | |
| 365 | - best_score = score | |
| 366 | - | |
| 367 | - return best_index, best_score | |
| 368 | - | |
| 369 | - @staticmethod | |
| 370 | - def _promote_matching_sku(source: Dict[str, Any], match_index: int) -> Optional[Dict[str, Any]]: | |
| 371 | - """Move the matched SKU to the front and swap the SPU image.""" | |
| 372 | - skus = source.get("skus") | |
| 373 | - if not isinstance(skus, list) or match_index < 0 or match_index >= len(skus): | |
| 374 | - return None | |
| 375 | - | |
| 376 | - matched_sku = skus.pop(match_index) | |
| 377 | - skus.insert(0, matched_sku) | |
def _has_style_intent(parsed_query: Optional[ParsedQuery]) -> bool:
    """Return True when the parsed query carries an active style-intent profile."""
    profile = getattr(parsed_query, "style_intent_profile", None)
    return bool(getattr(profile, "is_active", False))
| 378 | 252 | |
| 379 | - image_src = matched_sku.get("image_src") or matched_sku.get("imageSrc") | |
| 380 | - if image_src: | |
| 381 | - source["image_url"] = image_src | |
| 382 | - return matched_sku | |
| 383 | - | |
| 384 | - def _apply_sku_sorting_for_page_hits( | |
def _apply_style_intent_to_hits(
    self,
    es_hits: List[Dict[str, Any]],
    parsed_query: ParsedQuery,
    context: Optional[RequestContext] = None,
) -> Dict[str, SkuSelectionDecision]:
    """Run style-intent SKU selection over *es_hits*, mutating hits in place.

    Delegates to StyleSkuSelector.prepare_hits; when a request context is
    available, the per-document decisions are also stored as an
    intermediate result for debug output.

    Returns a mapping of doc_id -> SkuSelectionDecision for every hit that
    received a decision (empty when the style intent is inactive).
    """
    decisions = self.style_sku_selector.prepare_hits(es_hits, parsed_query)
    if decisions and context is not None:
        context.store_intermediate_result(
            "style_intent_sku_decisions",
            {doc_id: decision.to_dict() for doc_id, decision in decisions.items()},
        )
    return decisions
| 478 | 266 | |
| 479 | 267 | def search( |
| 480 | 268 | self, |
| ... | ... | @@ -583,7 +371,8 @@ class Searcher: |
| 583 | 371 | context.metadata['feature_flags'] = { |
| 584 | 372 | 'translation_enabled': enable_translation, |
| 585 | 373 | 'embedding_enabled': enable_embedding, |
| 586 | - 'rerank_enabled': do_rerank | |
| 374 | + 'rerank_enabled': do_rerank, | |
| 375 | + 'style_intent_enabled': bool(self.style_intent_registry.enabled), | |
| 587 | 376 | } |
| 588 | 377 | |
| 589 | 378 | # Step 1: Parse query |
| ... | ... | @@ -607,6 +396,7 @@ class Searcher: |
| 607 | 396 | domain="default", |
| 608 | 397 | is_simple_query=True |
| 609 | 398 | ) |
| 399 | + context.metadata["feature_flags"]["style_intent_active"] = self._has_style_intent(parsed_query) | |
| 610 | 400 | |
| 611 | 401 | context.logger.info( |
| 612 | 402 | f"查询解析完成 | 原查询: '{parsed_query.original_query}' | " |
| ... | ... | @@ -667,7 +457,10 @@ class Searcher: |
| 667 | 457 | es_query_for_fetch = es_query |
| 668 | 458 | rerank_prefetch_source = None |
| 669 | 459 | if in_rerank_window: |
| 670 | - rerank_prefetch_source = self._resolve_rerank_source_filter(effective_doc_template) | |
| 460 | + rerank_prefetch_source = self._resolve_rerank_source_filter( | |
| 461 | + effective_doc_template, | |
| 462 | + parsed_query=parsed_query, | |
| 463 | + ) | |
| 671 | 464 | es_query_for_fetch = dict(es_query) |
| 672 | 465 | es_query_for_fetch["_source"] = rerank_prefetch_source |
| 673 | 466 | |
| ... | ... | @@ -751,6 +544,20 @@ class Searcher: |
| 751 | 544 | finally: |
| 752 | 545 | context.end_stage(RequestContextStage.ELASTICSEARCH_SEARCH_PRIMARY) |
| 753 | 546 | |
| 547 | + style_intent_decisions: Dict[str, SkuSelectionDecision] = {} | |
| 548 | + if self._has_style_intent(parsed_query) and in_rerank_window: | |
| 549 | + style_intent_decisions = self._apply_style_intent_to_hits( | |
| 550 | + es_response.get("hits", {}).get("hits") or [], | |
| 551 | + parsed_query, | |
| 552 | + context=context, | |
| 553 | + ) | |
| 554 | + if style_intent_decisions: | |
| 555 | + context.logger.info( | |
| 556 | + "款式意图 SKU 预筛选完成 | hits=%s", | |
| 557 | + len(style_intent_decisions), | |
| 558 | + extra={'reqid': context.reqid, 'uid': context.uid} | |
| 559 | + ) | |
| 560 | + | |
| 754 | 561 | # Optional Step 4.5: AI reranking(仅当请求范围在重排窗口内时执行) |
| 755 | 562 | if do_rerank and in_rerank_window: |
| 756 | 563 | context.start_stage(RequestContextStage.RERANKING) |
| ... | ... | @@ -841,6 +648,11 @@ class Searcher: |
| 841 | 648 | if "_source" in detail_hit: |
| 842 | 649 | hit["_source"] = detail_hit.get("_source") or {} |
| 843 | 650 | filled += 1 |
| 651 | + if style_intent_decisions: | |
| 652 | + self.style_sku_selector.apply_precomputed_decisions( | |
| 653 | + sliced, | |
| 654 | + style_intent_decisions, | |
| 655 | + ) | |
| 844 | 656 | if fill_took: |
| 845 | 657 | es_response["took"] = int((es_response.get("took", 0) or 0) + fill_took) |
| 846 | 658 | context.logger.info( |
| ... | ... | @@ -883,7 +695,18 @@ class Searcher: |
| 883 | 695 | continue |
| 884 | 696 | rerank_debug_by_doc[str(doc_id)] = item |
| 885 | 697 | |
| 886 | - self._apply_sku_sorting_for_page_hits(es_hits, parsed_query, context=context) | |
| 698 | + if self._has_style_intent(parsed_query): | |
| 699 | + if in_rerank_window and style_intent_decisions: | |
| 700 | + self.style_sku_selector.apply_precomputed_decisions( | |
| 701 | + es_hits, | |
| 702 | + style_intent_decisions, | |
| 703 | + ) | |
| 704 | + elif not in_rerank_window: | |
| 705 | + style_intent_decisions = self._apply_style_intent_to_hits( | |
| 706 | + es_hits, | |
| 707 | + parsed_query, | |
| 708 | + context=context, | |
| 709 | + ) | |
| 887 | 710 | |
| 888 | 711 | # Format results using ResultFormatter |
| 889 | 712 | formatted_results = ResultFormatter.format_search_results( |
| ... | ... | @@ -902,6 +725,11 @@ class Searcher: |
| 902 | 725 | rerank_debug = None |
| 903 | 726 | if doc_id is not None: |
| 904 | 727 | rerank_debug = rerank_debug_by_doc.get(str(doc_id)) |
| 728 | + style_intent_debug = None | |
| 729 | + if doc_id is not None and style_intent_decisions: | |
| 730 | + decision = style_intent_decisions.get(str(doc_id)) | |
| 731 | + if decision is not None: | |
| 732 | + style_intent_debug = decision.to_dict() | |
| 905 | 733 | |
| 906 | 734 | raw_score = hit.get("_score") |
| 907 | 735 | try: |
| ... | ... | @@ -940,6 +768,9 @@ class Searcher: |
| 940 | 768 | debug_entry["fused_score"] = rerank_debug.get("fused_score") |
| 941 | 769 | debug_entry["matched_queries"] = rerank_debug.get("matched_queries") |
| 942 | 770 | |
| 771 | + if style_intent_debug: | |
| 772 | + debug_entry["style_intent_sku"] = style_intent_debug | |
| 773 | + | |
| 943 | 774 | per_result_debug.append(debug_entry) |
| 944 | 775 | |
| 945 | 776 | # Format facets |
| ... | ... | @@ -987,7 +818,8 @@ class Searcher: |
| 987 | 818 | "translations": context.query_analysis.translations, |
| 988 | 819 | "has_vector": context.query_analysis.query_vector is not None, |
| 989 | 820 | "is_simple_query": context.query_analysis.is_simple_query, |
| 990 | - "domain": context.query_analysis.domain | |
| 821 | + "domain": context.query_analysis.domain, | |
| 822 | + "style_intent_profile": context.get_intermediate_result("style_intent_profile"), | |
| 991 | 823 | }, |
| 992 | 824 | "es_query": context.get_intermediate_result('es_query', {}), |
| 993 | 825 | "es_response": { | ... | ... |
| ... | ... | @@ -0,0 +1,405 @@ |
| 1 | +""" | |
| 2 | +SKU selection for style-intent-aware search results. | |
| 3 | +""" | |
| 4 | + | |
| 5 | +from __future__ import annotations | |
| 6 | + | |
| 7 | +from dataclasses import dataclass, field | |
| 8 | +from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Tuple | |
| 9 | + | |
| 10 | +import numpy as np | |
| 11 | + | |
| 12 | +from query.style_intent import StyleIntentProfile, StyleIntentRegistry | |
| 13 | +from query.tokenization import normalize_query_text | |
| 14 | + | |
| 15 | + | |
@dataclass(frozen=True)
class SkuSelectionDecision:
    """Outcome of picking one SKU for an SPU under a detected style intent."""

    selected_sku_id: Optional[str]
    rerank_suffix: str
    selected_text: str
    matched_stage: str
    similarity_score: Optional[float] = None
    resolved_dimensions: Dict[str, Optional[str]] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize for debug payloads; copies the dimension map defensively."""
        payload: Dict[str, Any] = {
            "selected_sku_id": self.selected_sku_id,
            "rerank_suffix": self.rerank_suffix,
            "selected_text": self.selected_text,
            "matched_stage": self.matched_stage,
            "similarity_score": self.similarity_score,
        }
        payload["resolved_dimensions"] = dict(self.resolved_dimensions)
        return payload
| 34 | + | |
| 35 | + | |
@dataclass
class _SkuCandidate:
    """Internal per-SKU matching context built from an SPU's sku list."""

    index: int  # position within the SPU's original skus array
    sku_id: str  # stringified sku_id ("" when missing)
    sku: Dict[str, Any]  # raw SKU document from _source
    selection_text: str  # deduped option text used for embedding-based selection
    intent_texts: Dict[str, str]  # intent_type -> option text resolved for that dimension
| 43 | + | |
| 44 | + | |
| 45 | +class StyleSkuSelector: | |
| 46 | + """Selects the best SKU for an SPU based on detected style intent.""" | |
| 47 | + | |
def __init__(
    self,
    registry: StyleIntentRegistry,
    *,
    text_encoder_getter: Optional[Callable[[], Any]] = None,
    tokenizer_getter: Optional[Callable[[], Any]] = None,
) -> None:
    """Store the intent registry plus lazy getters for encoder/tokenizer.

    Getters (rather than direct references) are used so each call resolves
    the query parser's *current* text encoder / tokenizer.
    """
    self.registry = registry
    self._text_encoder_getter = text_encoder_getter
    self._tokenizer_getter = tokenizer_getter
| 58 | + | |
def prepare_hits(
    self,
    es_hits: List[Dict[str, Any]],
    parsed_query: Any,
) -> Dict[str, SkuSelectionDecision]:
    """Select the best-matching SKU for each hit under the active style intent.

    No-op (returns {}) unless parsed_query carries an active
    StyleIntentProfile. For each hit with a dict _source, a decision is
    computed, applied to the source (via _apply_decision_to_source), and
    the rerank suffix is attached as hit["_style_rerank_suffix"] for the
    reranker to append to the title.

    Returns doc_id -> decision for every hit that received one.
    """
    decisions: Dict[str, SkuSelectionDecision] = {}
    style_profile = getattr(parsed_query, "style_intent_profile", None)
    if not isinstance(style_profile, StyleIntentProfile) or not style_profile.is_active:
        return decisions

    # Resolve per-query inputs once, outside the hit loop.
    query_texts = self._build_query_texts(parsed_query, style_profile)
    query_vector = self._get_query_vector(parsed_query)
    tokenizer = self._get_tokenizer()

    for hit in es_hits:
        source = hit.get("_source")
        if not isinstance(source, dict):
            continue

        decision = self._select_for_source(
            source,
            style_profile=style_profile,
            query_texts=query_texts,
            query_vector=query_vector,
            tokenizer=tokenizer,
        )
        if decision is None:
            continue

        self._apply_decision_to_source(source, decision)
        if decision.rerank_suffix:
            hit["_style_rerank_suffix"] = decision.rerank_suffix

        doc_id = hit.get("_id")
        if doc_id is not None:
            decisions[str(doc_id)] = decision

    return decisions
| 97 | + | |
def apply_precomputed_decisions(
    self,
    es_hits: List[Dict[str, Any]],
    decisions: Dict[str, SkuSelectionDecision],
) -> None:
    """Re-apply earlier SKU decisions to a (possibly re-fetched) hit list.

    Hits without an _id, without a matching decision, or without a dict
    _source are left untouched.
    """
    if not es_hits or not decisions:
        return

    for hit in es_hits:
        hit_id = hit.get("_id")
        if hit_id is None:
            continue
        decision = decisions.get(str(hit_id))
        if decision is None:
            continue
        source = hit.get("_source")
        if not isinstance(source, dict):
            continue
        self._apply_decision_to_source(source, decision)
        if decision.rerank_suffix:
            hit["_style_rerank_suffix"] = decision.rerank_suffix
| 119 | + | |
def _build_query_texts(
    self,
    parsed_query: Any,
    style_profile: StyleIntentProfile,
) -> List[str]:
    """Collect normalized query texts, preferring the style-intent variants.

    Falls back to the original/normalized/rewritten queries plus any
    translations when the profile has no variants. Order-preserving dedupe.
    """
    variant_texts = [
        variant.normalized_text
        for variant in style_profile.query_variants
        if variant.normalized_text
    ]
    if variant_texts:
        return list(dict.fromkeys(variant_texts))

    # No variants: fall back to the query forms and translations.
    raw_candidates: List[Any] = [
        getattr(parsed_query, "original_query", None),
        getattr(parsed_query, "query_normalized", None),
        getattr(parsed_query, "rewritten_query", None),
    ]
    translations = getattr(parsed_query, "translations", {}) or {}
    if isinstance(translations, dict):
        raw_candidates.extend(translations.values())

    fallbacks: List[str] = []
    for raw in raw_candidates:
        normalized = normalize_query_text(raw)
        if normalized:
            fallbacks.append(normalized)
    return list(dict.fromkeys(fallbacks))
| 145 | + | |
def _get_query_vector(self, parsed_query: Any) -> Optional[np.ndarray]:
    """Return a float32 query embedding for SKU similarity scoring.

    Reuses parsed_query.query_vector when present; otherwise encodes the
    rewritten/normalized/original query text on demand. Returns None when
    no encoder or text is available, or when encoding fails — style-intent
    SKU selection is best-effort and must never break the search request
    (the pre-refactor helper `_encode_query_vector_for_sku_matching` gave
    the same guarantee).
    """
    query_vector = getattr(parsed_query, "query_vector", None)
    if query_vector is not None:
        return np.asarray(query_vector, dtype=np.float32)

    text_encoder = self._get_text_encoder()
    if text_encoder is None:
        return None

    # Prefer the rewritten query, then the normalized/original forms.
    query_text = (
        getattr(parsed_query, "rewritten_query", None)
        or getattr(parsed_query, "query_normalized", None)
        or getattr(parsed_query, "original_query", None)
    )
    if not query_text:
        return None

    try:
        vectors = text_encoder.encode([query_text], priority=1)
    except Exception:
        # Remote embedding backend may be down; degrade gracefully.
        import logging

        logging.getLogger(__name__).warning(
            "Style-intent query embedding failed", exc_info=True
        )
        return None

    if vectors is None or len(vectors) == 0 or vectors[0] is None:
        return None
    return np.asarray(vectors[0], dtype=np.float32)
| 167 | + | |
def _get_text_encoder(self) -> Any:
    """Resolve the text encoder lazily; None when no getter was supplied."""
    getter = self._text_encoder_getter
    return getter() if getter is not None else None
| 172 | + | |
def _get_tokenizer(self) -> Any:
    """Resolve the tokenizer lazily; None when no getter was supplied."""
    getter = self._tokenizer_getter
    return getter() if getter is not None else None
| 177 | + | |
@staticmethod
def _fallback_sku_text(sku: Dict[str, Any]) -> str:
    """Join all non-empty option values into a generic SKU description string."""
    values = (
        str(sku.get(key) or "").strip()
        for key in ("option1_value", "option2_value", "option3_value")
    )
    return " ".join(value for value in values if value)
| 186 | + | |
def _resolve_dimensions(
    self,
    source: Dict[str, Any],
    style_profile: StyleIntentProfile,
) -> Dict[str, Optional[str]]:
    """Map each detected intent type to the SKU option field that encodes it.

    Compares the SPU's option{1,2,3}_name against the intent's dimension
    aliases (e.g. "color"/"颜色" from config). Returns
    intent_type -> "optionN_value" field name, or None when no option
    dimension of this SPU matches the intent.
    """
    option_names = {
        "option1_value": normalize_query_text(source.get("option1_name")),
        "option2_value": normalize_query_text(source.get("option2_name")),
        "option3_value": normalize_query_text(source.get("option3_name")),
    }
    resolved: Dict[str, Optional[str]] = {}
    for intent in style_profile.intents:
        # First intent of a given type wins; later duplicates are skipped.
        if intent.intent_type in resolved:
            continue
        # Intent-specific aliases take precedence over registry defaults.
        aliases = set(intent.dimension_aliases or self.registry.get_dimension_aliases(intent.intent_type))
        matched_field = None
        for field_name, option_name in option_names.items():
            if option_name and option_name in aliases:
                matched_field = field_name
                break
        resolved[intent.intent_type] = matched_field
    return resolved
| 209 | + | |
def _build_candidates(
    self,
    skus: List[Dict[str, Any]],
    resolved_dimensions: Dict[str, Optional[str]],
) -> List[_SkuCandidate]:
    """Build per-SKU matching candidates from the resolved intent dimensions.

    For each SKU, the text for an intent is the matching optionN_value when
    that dimension was resolved, otherwise the concatenated fallback of all
    option values. selection_text joins the distinct intent texts
    (first-seen order, deduped by normalized form) and itself falls back to
    the generic option text when empty.
    """
    candidates: List[_SkuCandidate] = []
    for index, sku in enumerate(skus):
        fallback_text = self._fallback_sku_text(sku)
        intent_texts: Dict[str, str] = {}
        for intent_type, field_name in resolved_dimensions.items():
            if field_name:
                value = str(sku.get(field_name) or "").strip()
                intent_texts[intent_type] = value or fallback_text
            else:
                intent_texts[intent_type] = fallback_text

        # Dedupe by normalized form but keep the original casing/spacing.
        selection_parts: List[str] = []
        seen = set()
        for value in intent_texts.values():
            normalized = normalize_query_text(value)
            if not normalized or normalized in seen:
                continue
            seen.add(normalized)
            selection_parts.append(str(value).strip())

        selection_text = " ".join(selection_parts).strip() or fallback_text
        candidates.append(
            _SkuCandidate(
                index=index,
                sku_id=str(sku.get("sku_id") or ""),
                sku=sku,
                selection_text=selection_text,
                intent_texts=intent_texts,
            )
        )
    return candidates
| 246 | + | |
| 247 | + @staticmethod | |
| 248 | + def _is_direct_match( | |
| 249 | + candidate: _SkuCandidate, | |
| 250 | + query_texts: Sequence[str], | |
| 251 | + ) -> bool: | |
| 252 | + if not candidate.intent_texts or not query_texts: | |
| 253 | + return False | |
| 254 | + for value in candidate.intent_texts.values(): | |
| 255 | + normalized_value = normalize_query_text(value) | |
| 256 | + if not normalized_value: | |
| 257 | + return False | |
| 258 | + if not any(normalized_value in query_text for query_text in query_texts): | |
| 259 | + return False | |
| 260 | + return True | |
| 261 | + | |
| 262 | + def _is_generalized_match( | |
| 263 | + self, | |
| 264 | + candidate: _SkuCandidate, | |
| 265 | + style_profile: StyleIntentProfile, | |
| 266 | + tokenizer: Any, | |
| 267 | + ) -> bool: | |
| 268 | + if not candidate.intent_texts: | |
| 269 | + return False | |
| 270 | + | |
| 271 | + for intent_type, value in candidate.intent_texts.items(): | |
| 272 | + definition = self.registry.get_definition(intent_type) | |
| 273 | + if definition is None: | |
| 274 | + return False | |
| 275 | + matched_canonicals = definition.match_text(value, tokenizer=tokenizer) | |
| 276 | + if not matched_canonicals.intersection(style_profile.get_canonical_values(intent_type)): | |
| 277 | + return False | |
| 278 | + return True | |
| 279 | + | |
| 280 | + def _select_by_embedding( | |
| 281 | + self, | |
| 282 | + candidates: Sequence[_SkuCandidate], | |
| 283 | + query_vector: Optional[np.ndarray], | |
| 284 | + ) -> Tuple[Optional[_SkuCandidate], Optional[float]]: | |
| 285 | + if not candidates: | |
| 286 | + return None, None | |
| 287 | + text_encoder = self._get_text_encoder() | |
| 288 | + if query_vector is None or text_encoder is None: | |
| 289 | + return candidates[0], None | |
| 290 | + | |
| 291 | + unique_texts = list( | |
| 292 | + dict.fromkeys( | |
| 293 | + normalize_query_text(candidate.selection_text) | |
| 294 | + for candidate in candidates | |
| 295 | + if normalize_query_text(candidate.selection_text) | |
| 296 | + ) | |
| 297 | + ) | |
| 298 | + if not unique_texts: | |
| 299 | + return candidates[0], None | |
| 300 | + | |
| 301 | + vectors = text_encoder.encode(unique_texts, priority=1) | |
| 302 | + vector_map: Dict[str, np.ndarray] = {} | |
| 303 | + for key, vector in zip(unique_texts, vectors): | |
| 304 | + if vector is None: | |
| 305 | + continue | |
| 306 | + vector_map[key] = np.asarray(vector, dtype=np.float32) | |
| 307 | + | |
| 308 | + best_candidate: Optional[_SkuCandidate] = None | |
| 309 | + best_score: Optional[float] = None | |
| 310 | + query_vector_array = np.asarray(query_vector, dtype=np.float32) | |
| 311 | + for candidate in candidates: | |
| 312 | + normalized_text = normalize_query_text(candidate.selection_text) | |
| 313 | + candidate_vector = vector_map.get(normalized_text) | |
| 314 | + if candidate_vector is None: | |
| 315 | + continue | |
| 316 | + score = float(np.inner(query_vector_array, candidate_vector)) | |
| 317 | + if best_score is None or score > best_score: | |
| 318 | + best_candidate = candidate | |
| 319 | + best_score = score | |
| 320 | + | |
| 321 | + return best_candidate or candidates[0], best_score | |
| 322 | + | |
| 323 | + def _select_for_source( | |
| 324 | + self, | |
| 325 | + source: Dict[str, Any], | |
| 326 | + *, | |
| 327 | + style_profile: StyleIntentProfile, | |
| 328 | + query_texts: Sequence[str], | |
| 329 | + query_vector: Optional[np.ndarray], | |
| 330 | + tokenizer: Any, | |
| 331 | + ) -> Optional[SkuSelectionDecision]: | |
| 332 | + skus = source.get("skus") | |
| 333 | + if not isinstance(skus, list) or not skus: | |
| 334 | + return None | |
| 335 | + | |
| 336 | + resolved_dimensions = self._resolve_dimensions(source, style_profile) | |
| 337 | + candidates = self._build_candidates(skus, resolved_dimensions) | |
| 338 | + if not candidates: | |
| 339 | + return None | |
| 340 | + | |
| 341 | + direct_matches = [candidate for candidate in candidates if self._is_direct_match(candidate, query_texts)] | |
| 342 | + if len(direct_matches) == 1: | |
| 343 | + chosen = direct_matches[0] | |
| 344 | + return self._build_decision(chosen, resolved_dimensions, matched_stage="direct") | |
| 345 | + | |
| 346 | + generalized_matches: List[_SkuCandidate] = [] | |
| 347 | + if not direct_matches: | |
| 348 | + generalized_matches = [ | |
| 349 | + candidate | |
| 350 | + for candidate in candidates | |
| 351 | + if self._is_generalized_match(candidate, style_profile, tokenizer) | |
| 352 | + ] | |
| 353 | + if len(generalized_matches) == 1: | |
| 354 | + chosen = generalized_matches[0] | |
| 355 | + return self._build_decision(chosen, resolved_dimensions, matched_stage="generalized") | |
| 356 | + | |
| 357 | + embedding_pool = direct_matches or generalized_matches or candidates | |
| 358 | + chosen, similarity_score = self._select_by_embedding(embedding_pool, query_vector) | |
| 359 | + if chosen is None: | |
| 360 | + return None | |
| 361 | + stage = "embedding_from_matches" if direct_matches or generalized_matches else "embedding_from_all" | |
| 362 | + return self._build_decision( | |
| 363 | + chosen, | |
| 364 | + resolved_dimensions, | |
| 365 | + matched_stage=stage, | |
| 366 | + similarity_score=similarity_score, | |
| 367 | + ) | |
| 368 | + | |
| 369 | + @staticmethod | |
| 370 | + def _build_decision( | |
| 371 | + candidate: _SkuCandidate, | |
| 372 | + resolved_dimensions: Dict[str, Optional[str]], | |
| 373 | + *, | |
| 374 | + matched_stage: str, | |
| 375 | + similarity_score: Optional[float] = None, | |
| 376 | + ) -> SkuSelectionDecision: | |
| 377 | + return SkuSelectionDecision( | |
| 378 | + selected_sku_id=candidate.sku_id or None, | |
| 379 | + rerank_suffix=str(candidate.selection_text or "").strip(), | |
| 380 | + selected_text=str(candidate.selection_text or "").strip(), | |
| 381 | + matched_stage=matched_stage, | |
| 382 | + similarity_score=similarity_score, | |
| 383 | + resolved_dimensions=dict(resolved_dimensions), | |
| 384 | + ) | |
| 385 | + | |
| 386 | + @staticmethod | |
| 387 | + def _apply_decision_to_source(source: Dict[str, Any], decision: SkuSelectionDecision) -> None: | |
| 388 | + skus = source.get("skus") | |
| 389 | + if not isinstance(skus, list) or not skus or not decision.selected_sku_id: | |
| 390 | + return | |
| 391 | + | |
| 392 | + selected_index = None | |
| 393 | + for index, sku in enumerate(skus): | |
| 394 | + if str(sku.get("sku_id") or "") == decision.selected_sku_id: | |
| 395 | + selected_index = index | |
| 396 | + break | |
| 397 | + if selected_index is None: | |
| 398 | + return | |
| 399 | + | |
| 400 | + selected_sku = skus.pop(selected_index) | |
| 401 | + skus.insert(0, selected_sku) | |
| 402 | + | |
| 403 | + image_src = selected_sku.get("image_src") or selected_sku.get("imageSrc") | |
| 404 | + if image_src: | |
| 405 | + source["image_url"] = image_src | ... | ... |
tests/test_search_rerank_window.py
| ... | ... | @@ -18,6 +18,7 @@ from config import ( |
| 18 | 18 | SearchConfig, |
| 19 | 19 | ) |
| 20 | 20 | from context import create_request_context |
| 21 | +from query.style_intent import DetectedStyleIntent, StyleIntentProfile | |
| 21 | 22 | from search.searcher import Searcher |
| 22 | 23 | |
| 23 | 24 | |
| ... | ... | @@ -30,6 +31,7 @@ class _FakeParsedQuery: |
| 30 | 31 | translations: Dict[str, str] = None |
| 31 | 32 | query_vector: Any = None |
| 32 | 33 | domain: str = "default" |
| 34 | + style_intent_profile: Any = None | |
| 33 | 35 | |
| 34 | 36 | def to_dict(self) -> Dict[str, Any]: |
| 35 | 37 | return { |
| ... | ... | @@ -39,9 +41,27 @@ class _FakeParsedQuery: |
| 39 | 41 | "detected_language": self.detected_language, |
| 40 | 42 | "translations": self.translations or {}, |
| 41 | 43 | "domain": self.domain, |
| 44 | + "style_intent_profile": ( | |
| 45 | + self.style_intent_profile.to_dict() if self.style_intent_profile is not None else None | |
| 46 | + ), | |
| 42 | 47 | } |
| 43 | 48 | |
| 44 | 49 | |
def _build_style_intent_profile(intent_type: str, canonical_value: str, *dimension_aliases: str) -> StyleIntentProfile:
    """Build a single-intent profile for tests; aliases default to the intent type."""
    alias_tuple = tuple(dimension_aliases) if dimension_aliases else (intent_type,)
    intent = DetectedStyleIntent(
        intent_type=intent_type,
        canonical_value=canonical_value,
        matched_term=canonical_value,
        matched_query_text=canonical_value,
        dimension_aliases=alias_tuple,
    )
    return StyleIntentProfile(intents=(intent,))
| 64 | + | |
| 45 | 65 | class _FakeQueryParser: |
| 46 | 66 | def parse( |
| 47 | 67 | self, |
| ... | ... | @@ -340,6 +360,57 @@ def test_searcher_rerank_prefetch_source_follows_doc_template(monkeypatch): |
| 340 | 360 | assert es_client.calls[0]["body"]["_source"] == {"includes": ["brief", "title", "vendor"]} |
| 341 | 361 | |
| 342 | 362 | |
def test_searcher_rerank_prefetch_source_includes_sku_fields_when_style_intent_active(monkeypatch):
    """With an active style-intent profile, the rerank prefetch must request
    the SKU/option fields on top of the doc-template fields."""
    fake_es = _FakeESClient()
    searcher = _build_searcher(_build_search_config(rerank_enabled=True), fake_es)
    context = create_request_context(reqid="t1c", uid="u1c")

    monkeypatch.setattr(
        "search.searcher.get_tenant_config_loader",
        lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en"]}),
    )
    # Pass the ES response straight through so only the prefetch body matters.
    monkeypatch.setattr(
        "search.rerank_client.run_rerank",
        lambda **kwargs: (kwargs["es_response"], None, []),
    )

    class _IntentQueryParser:
        text_encoder = None

        def parse(self, query, tenant_id, generate_vector, context, target_languages=None):
            return _FakeParsedQuery(
                original_query=query,
                query_normalized=query,
                rewritten_query=query,
                translations={},
                style_intent_profile=_build_style_intent_profile(
                    "color", "black", "color", "colors", "颜色"
                ),
            )

    searcher.query_parser = _IntentQueryParser()

    searcher.search(
        query="black dress",
        tenant_id="162",
        from_=0,
        size=5,
        context=context,
        enable_rerank=None,
    )

    assert fake_es.calls[0]["body"]["_source"] == {
        "includes": ["option1_name", "option2_name", "option3_name", "skus", "title"]
    }
| 413 | + | |
| 343 | 414 | def test_searcher_skips_rerank_when_request_explicitly_false(monkeypatch): |
| 344 | 415 | es_client = _FakeESClient() |
| 345 | 416 | searcher = _build_searcher(_build_search_config(rerank_enabled=True), es_client) |
| ... | ... | @@ -434,6 +505,9 @@ def test_searcher_promotes_sku_when_option1_matches_translated_query(monkeypatch |
| 434 | 505 | query_normalized=query, |
| 435 | 506 | rewritten_query=query, |
| 436 | 507 | translations={"en": "black dress"}, |
| 508 | + style_intent_profile=_build_style_intent_profile( | |
| 509 | + "color", "black", "color", "colors", "颜色" | |
| 510 | + ), | |
| 437 | 511 | ) |
| 438 | 512 | |
| 439 | 513 | searcher.query_parser = _TranslatedQueryParser() |
| ... | ... | @@ -481,8 +555,8 @@ def test_searcher_promotes_sku_by_embedding_when_query_has_no_direct_option_matc |
| 481 | 555 | encoder = _FakeTextEncoder( |
| 482 | 556 | { |
| 483 | 557 | "linen summer dress": [0.8, 0.2], |
| 484 | - "color:red": [1.0, 0.0], | |
| 485 | - "color:blue": [0.0, 1.0], | |
| 558 | + "red": [1.0, 0.0], | |
| 559 | + "blue": [0.0, 1.0], | |
| 486 | 560 | } |
| 487 | 561 | ) |
| 488 | 562 | |
| ... | ... | @@ -503,6 +577,9 @@ def test_searcher_promotes_sku_by_embedding_when_query_has_no_direct_option_matc |
| 503 | 577 | rewritten_query=query, |
| 504 | 578 | translations={}, |
| 505 | 579 | query_vector=np.array([0.0, 1.0], dtype=np.float32), |
| 580 | + style_intent_profile=_build_style_intent_profile( | |
| 581 | + "color", "blue", "color", "colors", "颜色" | |
| 582 | + ), | |
| 506 | 583 | ) |
| 507 | 584 | |
| 508 | 585 | searcher.query_parser = _EmbeddingQueryParser() | ... | ... |
| ... | ... | @@ -0,0 +1,35 @@ |
| 1 | +from types import SimpleNamespace | |
| 2 | + | |
| 3 | +from config import QueryConfig | |
| 4 | +from query.style_intent import StyleIntentDetector, StyleIntentRegistry | |
| 5 | + | |
| 6 | + | |
def test_style_intent_detector_matches_original_and_translated_queries():
    """The detector should pick up intents from the original (zh) query and
    from its English translation, yielding one canonical value per dimension
    and one query variant per distinct query text."""
    config = QueryConfig(
        style_intent_terms={
            "color": [["black", "黑色", "black"]],
            "size": [["xl", "x-large", "加大码"]],
        },
        style_intent_dimension_aliases={
            "color": ["color", "颜色"],
            "size": ["size", "尺码"],
        },
    )
    detector = StyleIntentDetector(
        StyleIntentRegistry.from_query_config(config),
        tokenizer=str.split,  # whitespace tokenizer, same as lambda t: t.split()
    )

    parsed = SimpleNamespace(
        original_query="黑色 连衣裙",
        query_normalized="黑色 连衣裙",
        rewritten_query="黑色 连衣裙",
        translations={"en": "black dress xl"},
    )

    profile = detector.detect(parsed)

    assert profile.is_active is True
    assert profile.get_canonical_values("color") == {"black"}
    assert profile.get_canonical_values("size") == {"xl"}
    assert len(profile.query_variants) == 2