Blame view

search/sku_intent_selector.py 31.1 KB
cda1cd62   tangwang   意图分析&应用 baseline
1
  """
5c9baf91   tangwang   feat(search): 款式意...
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
  SKU selection for style-intent-aware and image-aware search results.
  
  Unified algorithm (one pass per hit, no cascading fallback stages):
  
  1. Per active style intent, a SKU's attribute value for that dimension comes
     from ONE of two sources, in priority order:
     - ``option``: the SKU's own ``optionN_value`` on the slot resolved by the
       intent's dimension aliases — authoritative whenever non-empty.
     - ``taxonomy``: the SPU-level ``enriched_taxonomy_attributes`` value on the
       same dimension — used only when the SKU has no own value (slot unresolved
       or value empty). Never overrides a contradicting SKU-level value.
  2. A SKU is "text-matched" iff every active intent finds a match on its
     selected value source (tokens of zh/en/attribute synonyms; values are first
     passed through ``_with_segment_boundaries_for_matching`` so brackets and
     common separators split segments; pure-CJK terms still use a substring
     fallback when the value is one undivided CJK run, e.g. ``卡其色棉``). We
     remember the matching source and the raw matched
     text per intent so the final decision can surface it.
  3. The image-pick comes straight from the nested ``image_embedding`` inner_hits
     (``exact_image_knn_query_hits`` preferred, ``image_knn_query_hits``
     otherwise): the SKU whose ``image_src`` equals the top-scoring url.
  4. Unified selection:
     - if the text-matched set is non-empty → pick image_pick when it lies in
       that set (visual tie-break among text-matched), otherwise the first
       text-matched SKU;
     - else → pick image_pick if any;
     - else → no decision (``final_source == "none"``).
  
  ``final_source`` values (ordered from strongest to weakest text evidence):
    ``option`` > ``taxonomy`` > ``image`` > ``none``. If any intent was satisfied
    only via taxonomy the overall source degrades to ``taxonomy`` so downstream
    callers can decide whether to differentiate the SPU-level signal from a
    true SKU-level option match.
  
  No embedding fallback, no stage cascade, no score thresholds.
cda1cd62   tangwang   意图分析&应用 baseline
37
38
39
40
41
  """
  
  from __future__ import annotations
  
  from dataclasses import dataclass, field
b712a831   tangwang   意图识别策略和性能优化
42
  from typing import Any, Callable, Dict, List, Optional, Tuple
99b72698   tangwang   测试回归钩子梳理
43
44
  import posixpath
  from urllib.parse import unquote, urlsplit
5c9baf91   tangwang   feat(search): 款式意...
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
  
  from query.style_intent import (
      DetectedStyleIntent,
      StyleIntentProfile,
      StyleIntentRegistry,
  )
  from query.tokenization import (
      contains_han_text,
      normalize_query_text,
      simple_tokenize_query,
  )
  
  import re
  
  _NON_HAN_RE = re.compile(r"[^\u4e00-\u9fff]")
  # Zero-width / BOM (often pasted from Excel or CMS).
  _ZW_AND_BOM_RE = re.compile(r"[\u200b-\u200d\ufeff\u2060]")
  # Brackets, slashes, and common commerce/list punctuation → segment boundaries so
  # tokenization can align intent terms (e.g. 卡其色) with the leading segment of
  # 卡其色(无内衬) / 卡其色/常规 / 卡其色·麻 等,without relying only on substring.
  _ATTRIBUTE_BOUNDARY_RE = re.compile(
      r"[\s\u3000]"  # ASCII / ideographic space
      r"|[\(\)\[\]\{\}()【】{}〈〉《》「」『』[]「」]"
      r"|[/\\||/\︱丨]"
      r"|[,,、;;::.。]"
      r"|[·•・]"
      r"|[~~]"
      r"|[+\=#%&*×※]"
      r"|[\u2010-\u2015\u2212]"  # hyphen, en dash, minus, etc.
  )
  
  
  def _is_pure_han(value: str) -> bool:
      """Return True when ``value`` is non-empty and made only of CJK Unified Ideographs."""
      if not value:
          return False
      # No character outside U+4E00-U+9FFF means the whole string is Han.
      return _NON_HAN_RE.search(value) is None
  
  
  def _with_segment_boundaries_for_matching(normalized_value: str) -> str:
      """Prepare a commerce-style option/taxonomy string for token matching.

      Zero-width/BOM characters are dropped, every bracket or separator matched
      by ``_ATTRIBUTE_BOUNDARY_RE`` becomes a space, and whitespace runs are
      collapsed, so that ``simple_tokenize_query`` can yield segments such as
      ``['卡其色', '无内衬']`` rather than one undifferentiated CJK blob.

      Returns ``""`` for an empty input.
      """
      if not normalized_value:
          return ""
      without_invisibles = _ZW_AND_BOM_RE.sub("", normalized_value)
      spaced = _ATTRIBUTE_BOUNDARY_RE.sub(" ", without_invisibles)
      segments = spaced.split()
      return " ".join(segments)
  
  
  # ``inner_hits`` keys that may carry nested image KNN results, in priority
  # order: keys are probed in this order and the first one that yields a usable
  # entry wins, so exact-image hits are preferred over plain image KNN hits.
  _IMAGE_INNER_HITS_KEYS: Tuple[str, ...] = (
      "exact_image_knn_query_hits",
      "image_knn_query_hits",
  )
cda1cd62   tangwang   意图分析&应用 baseline
100
  
5c9baf91   tangwang   feat(search): 款式意...
101
102
103
104
105
106
  
  @dataclass(frozen=True)
  class ImagePick:
      """SKU chosen for one ES hit from its nested image KNN inner_hits."""

      # ``sku_id`` of the SKU whose image URL aligned with the winning inner hit
      # ("" when that SKU has no ``sku_id``).
      sku_id: str
      # The inner-hit URL candidate that aligned with the SKU's image URL.
      url: str
      # ``_score`` of the winning inner hit (0.0 when missing or not numeric).
      score: float
cda1cd62   tangwang   意图分析&应用 baseline
107
108
109
110
111
112
113
  
  
  @dataclass(frozen=True)
  class SkuSelectionDecision:
      """Immutable outcome of SKU selection for a single search hit.

      ``final_source`` names the evidence that produced the decision and
      ``matched_sources`` breaks it down per intent. The ``image_pick_*`` fields
      record the image-based pick that accompanied the decision, if any.
      """

      selected_sku_id: Optional[str]
      rerank_suffix: str
      selected_text: str
      # "option" | "taxonomy" | "image" | "none"
      final_source: str
      # intent_type -> resolved option field name (or None when unresolved).
      resolved_dimensions: Dict[str, Optional[str]] = field(default_factory=dict)
      # Per-intent matching-source breakdown, e.g. {"color": "option", "size": "taxonomy"}.
      matched_sources: Dict[str, str] = field(default_factory=dict)
      image_pick_sku_id: Optional[str] = None
      image_pick_url: Optional[str] = None
      image_pick_score: Optional[float] = None

      # Backward-compat alias; some older callers/tests look at ``matched_stage``.
      @property
      def matched_stage(self) -> str:
          """Alias of ``final_source`` kept for older callers."""
          return self.final_source

      def to_dict(self) -> Dict[str, Any]:
          """Return a plain-dict view of the decision.

          The dict-valued fields are shallow-copied so callers cannot mutate the
          frozen instance through the result. ``image_pick`` is ``None`` unless
          an image-pick SKU id or URL is present.
          """
          return {
              "selected_sku_id": self.selected_sku_id,
              "rerank_suffix": self.rerank_suffix,
              "selected_text": self.selected_text,
              "final_source": self.final_source,
              "matched_sources": dict(self.matched_sources),
              "resolved_dimensions": dict(self.resolved_dimensions),
              "image_pick": (
                  {
                      "sku_id": self.image_pick_sku_id,
                      "url": self.image_pick_url,
                      "score": self.image_pick_score,
                  }
                  if self.image_pick_sku_id or self.image_pick_url
                  else None
              ),
          }
  
  
  @dataclass
2efad04b   tangwang   意图匹配的性能优化:
149
  class _SelectionContext:
5c9baf91   tangwang   feat(search): 款式意...
150
151
152
      """Request-scoped memo for term tokenization and substring match probes."""
  
      terms_by_intent: Dict[str, Tuple[str, ...]]
b712a831   tangwang   意图识别策略和性能优化
153
      normalized_text_cache: Dict[str, str] = field(default_factory=dict)
837d5d76   tangwang   sku筛选匹配规则优化,按 tok...
154
      tokenized_text_cache: Dict[str, Tuple[str, ...]] = field(default_factory=dict)
2efad04b   tangwang   意图匹配的性能优化:
155
      text_match_cache: Dict[Tuple[str, str], bool] = field(default_factory=dict)
cda1cd62   tangwang   意图分析&应用 baseline
156
157
158
  
  
  class StyleSkuSelector:
5c9baf91   tangwang   feat(search): 款式意...
159
      """Selects the best SKU per hit from style-intent text match + image KNN."""
cda1cd62   tangwang   意图分析&应用 baseline
160
161
162
163
164
165
  
      def __init__(
          self,
          registry: StyleIntentRegistry,
          *,
          text_encoder_getter: Optional[Callable[[], Any]] = None,
cda1cd62   tangwang   意图分析&应用 baseline
166
167
      ) -> None:
          self.registry = registry
5c9baf91   tangwang   feat(search): 款式意...
168
          # Retained for API back-compat; no longer used now that embedding fallback is gone.
cda1cd62   tangwang   意图分析&应用 baseline
169
          self._text_encoder_getter = text_encoder_getter
cda1cd62   tangwang   意图分析&应用 baseline
170
  
5c9baf91   tangwang   feat(search): 款式意...
171
172
173
      # ------------------------------------------------------------------
      # Public entry points
      # ------------------------------------------------------------------
cda1cd62   tangwang   意图分析&应用 baseline
174
175
176
177
178
      def prepare_hits(
          self,
          es_hits: List[Dict[str, Any]],
          parsed_query: Any,
      ) -> Dict[str, SkuSelectionDecision]:
5c9baf91   tangwang   feat(search): 款式意...
179
180
181
182
183
184
          """Compute selection decisions (without mutating ``_source``).
  
          Runs if either a style intent is active OR any hit carries image
          inner_hits. Decisions are keyed by ES ``_id`` and meant to be applied
          later via :meth:`apply_precomputed_decisions` (after page fill).
          """
cda1cd62   tangwang   意图分析&应用 baseline
185
186
          decisions: Dict[str, SkuSelectionDecision] = {}
          style_profile = getattr(parsed_query, "style_intent_profile", None)
5c9baf91   tangwang   feat(search): 款式意...
187
188
189
190
191
192
          style_active = (
              isinstance(style_profile, StyleIntentProfile) and style_profile.is_active
          )
          selection_context = (
              self._build_selection_context(style_profile) if style_active else None
          )
cda1cd62   tangwang   意图分析&应用 baseline
193
194
195
196
197
198
  
          for hit in es_hits:
              source = hit.get("_source")
              if not isinstance(source, dict):
                  continue
  
5c9baf91   tangwang   feat(search): 款式意...
199
200
201
202
203
204
205
206
              image_pick = self._pick_sku_by_image(hit, source)
              if not style_active and image_pick is None:
                  # Nothing to do for this hit.
                  continue
  
              decision = self._select(
                  source=source,
                  style_profile=style_profile if style_active else None,
2efad04b   tangwang   意图匹配的性能优化:
207
                  selection_context=selection_context,
5c9baf91   tangwang   feat(search): 款式意...
208
                  image_pick=image_pick,
cda1cd62   tangwang   意图分析&应用 baseline
209
210
211
212
              )
              if decision is None:
                  continue
  
cda1cd62   tangwang   意图分析&应用 baseline
213
214
              if decision.rerank_suffix:
                  hit["_style_rerank_suffix"] = decision.rerank_suffix
2efad04b   tangwang   意图匹配的性能优化:
215
216
              else:
                  hit.pop("_style_rerank_suffix", None)
cda1cd62   tangwang   意图分析&应用 baseline
217
218
219
220
221
222
223
224
225
226
227
228
229
230
  
              doc_id = hit.get("_id")
              if doc_id is not None:
                  decisions[str(doc_id)] = decision
  
          return decisions
  
      def apply_precomputed_decisions(
          self,
          es_hits: List[Dict[str, Any]],
          decisions: Dict[str, SkuSelectionDecision],
      ) -> None:
          if not es_hits or not decisions:
              return
cda1cd62   tangwang   意图分析&应用 baseline
231
232
233
234
235
236
237
238
239
240
241
242
243
          for hit in es_hits:
              doc_id = hit.get("_id")
              if doc_id is None:
                  continue
              decision = decisions.get(str(doc_id))
              if decision is None:
                  continue
              source = hit.get("_source")
              if not isinstance(source, dict):
                  continue
              self._apply_decision_to_source(source, decision)
              if decision.rerank_suffix:
                  hit["_style_rerank_suffix"] = decision.rerank_suffix
2efad04b   tangwang   意图匹配的性能优化:
244
245
              else:
                  hit.pop("_style_rerank_suffix", None)
cda1cd62   tangwang   意图分析&应用 baseline
246
  
5c9baf91   tangwang   feat(search): 款式意...
247
248
249
      # ------------------------------------------------------------------
      # Selection context & text matching
      # ------------------------------------------------------------------
2efad04b   tangwang   意图匹配的性能优化:
250
251
      def _build_selection_context(
          self,
          style_profile: StyleIntentProfile,
      ) -> _SelectionContext:
          """Build the request-scoped context holding normalized terms per intent.

          Terms of intents sharing an ``intent_type`` are merged. Each term is
          normalized, empty results are dropped, and duplicates are removed while
          keeping first-seen order.
          """
          # A dict doubles as an insertion-ordered set: O(1) de-dup, stable order.
          terms_by_intent: Dict[str, Dict[str, None]] = {}
          for intent in style_profile.intents:
              terms = terms_by_intent.setdefault(intent.intent_type, {})
              for raw_term in intent.matching_terms:
                  normalized_term = normalize_query_text(raw_term)
                  if normalized_term:
                      terms.setdefault(normalized_term, None)
          return _SelectionContext(
              terms_by_intent={
                  intent_type: tuple(terms)
                  for intent_type, terms in terms_by_intent.items()
              },
          )
  
5c9baf91   tangwang   feat(search): 款式意...
268
      def _normalize_cached(self, ctx: _SelectionContext, value: Any) -> str:
b712a831   tangwang   意图识别策略和性能优化
269
270
271
          raw = str(value or "").strip()
          if not raw:
              return ""
5c9baf91   tangwang   feat(search): 款式意...
272
          cached = ctx.normalized_text_cache.get(raw)
b712a831   tangwang   意图识别策略和性能优化
273
274
275
          if cached is not None:
              return cached
          normalized = normalize_query_text(raw)
5c9baf91   tangwang   feat(search): 款式意...
276
          ctx.normalized_text_cache[raw] = normalized
b712a831   tangwang   意图识别策略和性能优化
277
          return normalized
cda1cd62   tangwang   意图分析&应用 baseline
278
  
5c9baf91   tangwang   feat(search): 款式意...
279
280
281
282
283
284
285
286
287
288
289
      def _tokenize_cached(self, ctx: _SelectionContext, value: str) -> Tuple[str, ...]:
          """Tokenize ``value`` into normalized tokens, memoized on ``ctx``.

          The value is normalized first and that normalized form is the cache
          key. An empty normalized value yields ``()``. Each non-empty token from
          ``simple_tokenize_query`` is normalized again before being stored.
          """
          normalized_value = normalize_query_text(value)
          if not normalized_value:
              return ()
          cached = ctx.tokenized_text_cache.get(normalized_value)
          if cached is not None:
              return cached
          tokens = tuple(
              normalize_query_text(token)
              for token in simple_tokenize_query(normalized_value)
              if token
          )
          ctx.tokenized_text_cache[normalized_value] = tokens
          return tokens
2efad04b   tangwang   意图匹配的性能优化:
293
294
  
      def _is_text_match(
cda1cd62   tangwang   意图分析&应用 baseline
295
          self,
2efad04b   tangwang   意图匹配的性能优化:
296
          intent_type: str,
5c9baf91   tangwang   feat(search): 款式意...
297
          ctx: _SelectionContext,
6adbf18a   tangwang   reranker提示词优化
298
          *,
b712a831   tangwang   意图识别策略和性能优化
299
          normalized_value: str,
cda1cd62   tangwang   意图分析&应用 baseline
300
      ) -> bool:
5c9baf91   tangwang   feat(search): 款式意...
301
          """True iff any intent term token-boundary matches the given value."""
2efad04b   tangwang   意图匹配的性能优化:
302
          if not normalized_value:
cda1cd62   tangwang   意图分析&应用 baseline
303
              return False
2efad04b   tangwang   意图匹配的性能优化:
304
          cache_key = (intent_type, normalized_value)
5c9baf91   tangwang   feat(search): 款式意...
305
          cached = ctx.text_match_cache.get(cache_key)
2efad04b   tangwang   意图匹配的性能优化:
306
307
308
          if cached is not None:
              return cached
  
5c9baf91   tangwang   feat(search): 款式意...
309
310
311
          terms = ctx.terms_by_intent.get(intent_type, ())
          segmented = _with_segment_boundaries_for_matching(normalized_value)
          value_tokens = self._tokenize_cached(ctx, segmented)
837d5d76   tangwang   sku筛选匹配规则优化,按 tok...
312
313
314
315
          matched = any(
              self._matches_term_tokens(
                  term=term,
                  value_tokens=value_tokens,
5c9baf91   tangwang   feat(search): 款式意...
316
                  ctx=ctx,
837d5d76   tangwang   sku筛选匹配规则优化,按 tok...
317
318
                  normalized_value=normalized_value,
              )
5c9baf91   tangwang   feat(search): 款式意...
319
              for term in terms
837d5d76   tangwang   sku筛选匹配规则优化,按 tok...
320
321
              if term
          )
5c9baf91   tangwang   feat(search): 款式意...
322
          ctx.text_match_cache[cache_key] = matched
2efad04b   tangwang   意图匹配的性能优化:
323
324
          return matched
  
837d5d76   tangwang   sku筛选匹配规则优化,按 tok...
325
326
327
328
329
      def _matches_term_tokens(
          self,
          *,
          term: str,
          value_tokens: Tuple[str, ...],
          ctx: _SelectionContext,
          normalized_value: str,
      ) -> bool:
          """Decide whether one intent term matches a value.

          Checks, in order: exact equality of normalized term and value;
          substring containment when the term is pure Han and the value contains
          Han text; substring containment when either side has no tokens;
          otherwise the term's tokens must appear as a contiguous run inside
          ``value_tokens``.
          """
          normalized_term = normalize_query_text(term)
          if not normalized_term:
              return False
          if normalized_term == normalized_value:
              return True
          # Pure-CJK terms can't be split further by the whitespace/regex tokenizer
          # ("卡其色棉" is one token), so sliding-window token match would miss the prefix.
          # Fall back to normalized substring containment — safe because this branch
          # never triggers for Latin tokens where substring would cause "l" ⊂ "xl" issues.
          if _is_pure_han(normalized_term) and contains_han_text(normalized_value):
              return normalized_term in normalized_value
          term_tokens = self._tokenize_cached(ctx, normalized_term)
          if not term_tokens or not value_tokens:
              return normalized_term in normalized_value

          term_length = len(term_tokens)
          value_length = len(value_tokens)
          if term_length > value_length:
              return False
          # Slide a window of the term's length across the value tokens.
          for start in range(value_length - term_length + 1):
              if value_tokens[start : start + term_length] == term_tokens:
                  return True
          return False
  
5c9baf91   tangwang   feat(search): 款式意...
357
358
359
360
      # ------------------------------------------------------------------
      # Dimension resolution (option slot + taxonomy values)
      # ------------------------------------------------------------------
      def _resolve_dimensions(
          self,
          source: Dict[str, Any],
          style_profile: StyleIntentProfile,
      ) -> Dict[str, Optional[str]]:
          """Map each intent type to the option value field that carries it.

          The normalized ``option1_name``..``option3_name`` values are compared
          with the intent's dimension aliases (the intent's own, else the
          registry's). The first option slot whose name is an alias wins.

          Returns:
              ``intent_type -> "optionN_value"``, or ``None`` when no slot name
              matches. Only the first intent of each type is considered.
          """
          option_fields = (
              ("option1_value", source.get("option1_name")),
              ("option2_value", source.get("option2_name")),
              ("option3_value", source.get("option3_name")),
          )
          option_aliases = [
              (field_name, normalize_query_text(name))
              for field_name, name in option_fields
          ]
          resolved: Dict[str, Optional[str]] = {}
          for intent in style_profile.intents:
              if intent.intent_type in resolved:
                  continue
              aliases = set(
                  intent.dimension_aliases
                  or self.registry.get_dimension_aliases(intent.intent_type)
              )
              matched_field: Optional[str] = None
              for field_name, option_name in option_aliases:
                  if option_name and option_name in aliases:
                      matched_field = field_name
                      break
              resolved[intent.intent_type] = matched_field
          return resolved
2efad04b   tangwang   意图匹配的性能优化:
389
  
5c9baf91   tangwang   feat(search): 款式意...
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
      def _collect_taxonomy_values(
          self,
          source: Dict[str, Any],
          style_profile: StyleIntentProfile,
      ) -> Dict[str, Tuple[Tuple[str, str], ...]]:
          """Extract SPU-level enriched_taxonomy_attributes values per intent dimension.
  
          Returns a mapping ``intent_type -> ((normalized, raw), ...)`` so the
          selection layer can (a) match against ``normalized`` and (b) surface
          the human-readable ``raw`` form in ``selected_text``.
          """
          attrs = source.get("enriched_taxonomy_attributes")
          if not isinstance(attrs, list) or not attrs:
              return {}
          aliases_by_intent = {
              intent.intent_type: set(
                  intent.dimension_aliases
                  or self.registry.get_dimension_aliases(intent.intent_type)
              )
              for intent in style_profile.intents
          }
          values_by_intent: Dict[str, List[Tuple[str, str]]] = {
              t: [] for t in aliases_by_intent
          }
          for attr in attrs:
              if not isinstance(attr, dict):
                  continue
              attr_name = normalize_query_text(attr.get("name"))
              if not attr_name:
                  continue
              matching_intents = [
                  t for t, aliases in aliases_by_intent.items() if attr_name in aliases
              ]
              if not matching_intents:
                  continue
              for raw_text in _iter_multilingual_texts(attr.get("value")):
                  raw = str(raw_text).strip()
                  if not raw:
                      continue
                  normalized = normalize_query_text(raw)
                  if not normalized:
                      continue
                  for intent_type in matching_intents:
                      bucket = values_by_intent[intent_type]
                      if not any(existing_norm == normalized for existing_norm, _ in bucket):
                          bucket.append((normalized, raw))
          return {t: tuple(v) for t, v in values_by_intent.items() if v}
  
      # ------------------------------------------------------------------
      # Image pick
      # ------------------------------------------------------------------
      @staticmethod
      def _normalize_url(url: Any) -> str:
99b72698   tangwang   测试回归钩子梳理
443
          """host + path, no query/fragment; casefolded — primary equality key."""
5c9baf91   tangwang   feat(search): 款式意...
444
445
446
447
448
449
450
451
452
          raw = str(url or "").strip()
          if not raw:
              return ""
          # Accept protocol-relative URLs like "//cdn/..." or full URLs.
          if raw.startswith("//"):
              raw = "https:" + raw
          try:
              parts = urlsplit(raw)
          except ValueError:
99b72698   tangwang   测试回归钩子梳理
453
              return str(url).strip().casefold()
5c9baf91   tangwang   feat(search): 款式意...
454
          host = (parts.netloc or "").casefold()
99b72698   tangwang   测试回归钩子梳理
455
          path = unquote(parts.path or "")
5c9baf91   tangwang   feat(search): 款式意...
456
457
          return f"{host}{path}".casefold()
  
99b72698   tangwang   测试回归钩子梳理
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
      @staticmethod
      def _normalize_path_only(url: Any) -> str:
          """Path-only key for cross-CDN / host-alias cases."""
          raw = str(url or "").strip()
          if not raw:
              return ""
          if raw.startswith("//"):
              raw = "https:" + raw
          try:
              parts = urlsplit(raw)
              path = unquote(parts.path or "")
          except ValueError:
              return ""
          return path.casefold().rstrip("/")
  
      @classmethod
      def _url_filename(cls, url: Any) -> str:
          p = cls._normalize_path_only(url)
          if not p:
              return ""
          return posixpath.basename(p).casefold()
  
      @classmethod
      def _urls_equivalent(cls, a: Any, b: Any) -> bool:
          if not a or not b:
              return False
          na, nb = cls._normalize_url(a), cls._normalize_url(b)
          if na and nb and na == nb:
              return True
          pa, pb = cls._normalize_path_only(a), cls._normalize_path_only(b)
          if pa and pb and pa == pb:
              return True
          fa, fb = cls._url_filename(a), cls._url_filename(b)
          if fa and fb and fa == fb and len(fa) > 4:
              return True
          return False
  
      @staticmethod
      def _inner_hit_url_candidates(entry: Dict[str, Any], source: Dict[str, Any]) -> List[str]:
          """URLs to try for this inner_hit: _source.url plus image_embedding[offset].url."""
          out: List[str] = []
          src = entry.get("_source") or {}
          u = src.get("url")
          if u:
              out.append(str(u).strip())
          nested = entry.get("_nested")
          if not isinstance(nested, dict):
              return out
          off = nested.get("offset")
          if not isinstance(off, int):
              return out
          embs = source.get("image_embedding")
          if not isinstance(embs, list) or not (0 <= off < len(embs)):
              return out
          emb = embs[off]
          if isinstance(emb, dict) and emb.get("url"):
              u2 = str(emb.get("url")).strip()
              if u2 and u2 not in out:
                  out.append(u2)
          return out
  
5c9baf91   tangwang   feat(search): 款式意...
519
520
521
522
523
      def _pick_sku_by_image(
          self,
          hit: Dict[str, Any],
          source: Dict[str, Any],
      ) -> Optional[ImagePick]:
          """Map ES nested image KNN inner_hits to a SKU via image URL alignment.

          ``image_pick`` is empty when:
          - ES did not return ``inner_hits`` for this hit (e.g. doc outside
            ``rescore.window_size`` so no exact-image rescore inner_hits; or the
            nested image clause did not match this document).
          - The winning nested ``url`` cannot be aligned to any ``skus[].image_src``
            even after path/filename normalization (rare CDN / encoding edge cases).

          We try ``_source.url``, ``_nested.offset`` + ``image_embedding[offset].url``,
          and loose path/filename matching to reduce false negatives.

          Only the single top-scoring inner hit (within the first inner_hits key
          that yields one) is aligned; lower-scoring hits are not tried when it
          fails to align. Entries of ``skus`` that are not dicts are skipped.
          """
          inner_hits = hit.get("inner_hits")
          if not isinstance(inner_hits, dict):
              return None
          best_entry: Optional[Dict[str, Any]] = None
          top_score: Optional[float] = None
          for key in _IMAGE_INNER_HITS_KEYS:
              payload = inner_hits.get(key)
              if not isinstance(payload, dict):
                  continue
              hits_block = payload.get("hits")
              inner_list = hits_block.get("hits") if isinstance(hits_block, dict) else None
              if not isinstance(inner_list, list) or not inner_list:
                  continue
              for entry in inner_list:
                  if not isinstance(entry, dict):
                      continue
                  # An entry without any URL can never be aligned to a SKU.
                  if not self._inner_hit_url_candidates(entry, source):
                      continue
                  try:
                      score = float(entry.get("_score") or 0.0)
                  except (TypeError, ValueError):
                      score = 0.0
                  if top_score is None or score > top_score:
                      best_entry = entry
                      top_score = score
              if best_entry is not None:
                  break  # Prefer exact_image_knn_query_hits over image_knn_query_hits.
          if best_entry is None:
              return None

          candidates = self._inner_hit_url_candidates(best_entry, source)
          if not candidates:
              return None

          skus = source.get("skus")
          if not isinstance(skus, list):
              return None
          for sku in skus:
              if not isinstance(sku, dict):
                  # Malformed SKU entry: skip it instead of raising on ``.get``.
                  continue
              sku_raw = sku.get("image_src") or sku.get("imageSrc")
              for cand in candidates:
                  if self._urls_equivalent(cand, sku_raw):
                      return ImagePick(
                          sku_id=str(sku.get("sku_id") or ""),
                          url=cand,
                          score=float(top_score or 0.0),
                      )
          return None
cda1cd62   tangwang   意图分析&应用 baseline
583
  
5c9baf91   tangwang   feat(search): 款式意...
584
585
586
587
      # ------------------------------------------------------------------
      # Unified per-hit selection
      # ------------------------------------------------------------------
      def _select(
cda1cd62   tangwang   意图分析&应用 baseline
588
          self,
cda1cd62   tangwang   意图分析&应用 baseline
589
          *,
5c9baf91   tangwang   feat(search): 款式意...
590
591
592
593
          source: Dict[str, Any],
          style_profile: Optional[StyleIntentProfile],
          selection_context: Optional[_SelectionContext],
          image_pick: Optional[ImagePick],
cda1cd62   tangwang   意图分析&应用 baseline
594
595
596
597
598
      ) -> Optional[SkuSelectionDecision]:
          skus = source.get("skus")
          if not isinstance(skus, list) or not skus:
              return None
  
5c9baf91   tangwang   feat(search): 款式意...
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
          resolved_dimensions: Dict[str, Optional[str]] = {}
          text_matched: List[Tuple[Dict[str, Any], Dict[str, Tuple[str, str]]]] = []
  
          if style_profile is not None and selection_context is not None:
              resolved_dimensions = self._resolve_dimensions(source, style_profile)
              taxonomy_values = self._collect_taxonomy_values(source, style_profile)
              # Only attempt text match when there is at least one value source
              # per intent (SKU option or SPU taxonomy).
              if all(
                  resolved_dimensions.get(intent.intent_type) is not None
                  or taxonomy_values.get(intent.intent_type)
                  for intent in style_profile.intents
              ):
                  text_matched = self._find_text_matched_skus(
                      skus=skus,
                      style_profile=style_profile,
                      resolved_dimensions=resolved_dimensions,
                      taxonomy_values=taxonomy_values,
                      ctx=selection_context,
                  )
  
          selected_sku_id: Optional[str] = None
          selected_text = ""
          final_source = "none"
          matched_sources: Dict[str, str] = {}
  
          if text_matched:
              chosen_sku, per_intent = self._choose_among_text_matched(
                  text_matched, image_pick
              )
              selected_sku_id = str(chosen_sku.get("sku_id") or "") or None
              selected_text = self._text_from_matches(per_intent)
              matched_sources = {
                  intent_type: src for intent_type, (src, _) in per_intent.items()
              }
              final_source = (
                  "taxonomy" if "taxonomy" in matched_sources.values() else "option"
              )
          elif image_pick is not None:
              image_sku = self._find_sku_by_id(skus, image_pick.sku_id)
              if image_sku is not None:
                  selected_sku_id = image_pick.sku_id or None
                  selected_text = self._build_selected_text(image_sku, resolved_dimensions)
                  final_source = "image"
2efad04b   tangwang   意图匹配的性能优化:
643
  
5c9baf91   tangwang   feat(search): 款式意...
644
645
646
647
648
          return SkuSelectionDecision(
              selected_sku_id=selected_sku_id,
              rerank_suffix=selected_text,
              selected_text=selected_text,
              final_source=final_source,
b712a831   tangwang   意图识别策略和性能优化
649
              resolved_dimensions=resolved_dimensions,
5c9baf91   tangwang   feat(search): 款式意...
650
651
652
653
              matched_sources=matched_sources,
              image_pick_sku_id=(image_pick.sku_id or None) if image_pick else None,
              image_pick_url=image_pick.url if image_pick else None,
              image_pick_score=image_pick.score if image_pick else None,
cda1cd62   tangwang   意图分析&应用 baseline
654
655
          )
  
5c9baf91   tangwang   feat(search): 款式意...
656
657
      def _find_text_matched_skus(
          self,
cda1cd62   tangwang   意图分析&应用 baseline
658
          *,
5c9baf91   tangwang   feat(search): 款式意...
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
          skus: List[Dict[str, Any]],
          style_profile: StyleIntentProfile,
          resolved_dimensions: Dict[str, Optional[str]],
          taxonomy_values: Dict[str, Tuple[Tuple[str, str], ...]],
          ctx: _SelectionContext,
      ) -> List[Tuple[Dict[str, Any], Dict[str, Tuple[str, str]]]]:
          """Return every SKU that satisfies every active intent, with match meta.
  
          Authority rule per intent:
            - If the SKU has a non-empty value on the resolved option slot, that
              value ALONE decides the match (source = ``option``). Taxonomy cannot
              override a contradicting SKU-level value.
            - Only when the SKU has no own value on the dimension (slot unresolved
              or value empty) does the SPU-level taxonomy serve as the fallback
              value source (source = ``taxonomy``).
  
          For each matched SKU we also return a per-intent dict mapping
          ``intent_type -> (source, raw_matched_text)`` so the final decision can
          surface the genuinely matched string in ``selected_text`` /
          ``rerank_suffix`` rather than, e.g., a SKU's unrelated option value.
          """
          matched: List[Tuple[Dict[str, Any], Dict[str, Tuple[str, str]]]] = []
          for sku in skus:
              per_intent: Dict[str, Tuple[str, str]] = {}
              all_ok = True
              for intent in style_profile.intents:
                  slot = resolved_dimensions.get(intent.intent_type)
                  sku_raw = str(sku.get(slot) or "").strip() if slot else ""
                  sku_norm = normalize_query_text(sku_raw) if sku_raw else ""
  
                  if sku_norm:
                      if self._is_text_match(
                          intent.intent_type, ctx, normalized_value=sku_norm
                      ):
                          per_intent[intent.intent_type] = ("option", sku_raw)
                      else:
                          all_ok = False
                          break
                  else:
                      matched_raw: Optional[str] = None
                      for tax_norm, tax_raw in taxonomy_values.get(
                          intent.intent_type, ()
                      ):
                          if self._is_text_match(
                              intent.intent_type, ctx, normalized_value=tax_norm
                          ):
                              matched_raw = tax_raw
                              break
                      if matched_raw is None:
                          all_ok = False
                          break
                      per_intent[intent.intent_type] = ("taxonomy", matched_raw)
              if all_ok:
                  matched.append((sku, per_intent))
          return matched
  
      @staticmethod
      def _choose_among_text_matched(
          text_matched: List[Tuple[Dict[str, Any], Dict[str, Tuple[str, str]]]],
          image_pick: Optional[ImagePick],
      ) -> Tuple[Dict[str, Any], Dict[str, Tuple[str, str]]]:
          """Image-visual tie-break inside the text-matched set; else first match."""
          if image_pick and image_pick.sku_id:
              for sku, per_intent in text_matched:
                  if str(sku.get("sku_id") or "") == image_pick.sku_id:
                      return sku, per_intent
          return text_matched[0]
  
      @staticmethod
      def _text_from_matches(per_intent: Dict[str, Tuple[str, str]]) -> str:
          """Join the genuinely matched raw strings in intent declaration order."""
          parts: List[str] = []
          seen: set[str] = set()
          for _, raw in per_intent.values():
              if raw and raw not in seen:
                  seen.add(raw)
                  parts.append(raw)
          return " ".join(parts).strip()
cda1cd62   tangwang   意图分析&应用 baseline
737
738
  
      @staticmethod
5c9baf91   tangwang   feat(search): 款式意...
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
      def _find_sku_by_id(
          skus: List[Dict[str, Any]], sku_id: Optional[str]
      ) -> Optional[Dict[str, Any]]:
          if not sku_id:
              return None
          for sku in skus:
              if str(sku.get("sku_id") or "") == sku_id:
                  return sku
          return None
  
      @staticmethod
      def _build_selected_text(
          sku: Dict[str, Any],
          resolved_dimensions: Dict[str, Optional[str]],
      ) -> str:
          """Text carried into rerank doc suffix: joined raw values on the resolved slots."""
          parts: List[str] = []
          seen: set[str] = set()
          for slot in resolved_dimensions.values():
              if not slot:
                  continue
              raw = str(sku.get(slot) or "").strip()
              if raw and raw not in seen:
                  seen.add(raw)
                  parts.append(raw)
          return " ".join(parts).strip()
  
      # ------------------------------------------------------------------
      # Source mutation (applied after page fill)
      # ------------------------------------------------------------------
      @staticmethod
      def _apply_decision_to_source(
          source: Dict[str, Any], decision: SkuSelectionDecision
      ) -> None:
          if not decision.selected_sku_id:
              return
cda1cd62   tangwang   意图分析&应用 baseline
775
          skus = source.get("skus")
5c9baf91   tangwang   feat(search): 款式意...
776
          if not isinstance(skus, list) or not skus:
cda1cd62   tangwang   意图分析&应用 baseline
777
              return
5c9baf91   tangwang   feat(search): 款式意...
778
          selected_index: Optional[int] = None
cda1cd62   tangwang   意图分析&应用 baseline
779
780
781
782
783
784
          for index, sku in enumerate(skus):
              if str(sku.get("sku_id") or "") == decision.selected_sku_id:
                  selected_index = index
                  break
          if selected_index is None:
              return
cda1cd62   tangwang   意图分析&应用 baseline
785
786
          selected_sku = skus.pop(selected_index)
          skus.insert(0, selected_sku)
cda1cd62   tangwang   意图分析&应用 baseline
787
788
789
          image_src = selected_sku.get("image_src") or selected_sku.get("imageSrc")
          if image_src:
              source["image_url"] = image_src
5c9baf91   tangwang   feat(search): 款式意...
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
  
  
  def _iter_multilingual_texts(value: Any) -> List[str]:
      """Flatten a value that may be str, list, or multilingual dict {zh, en, ...}."""
      if value is None:
          return []
      if isinstance(value, str):
          return [value] if value.strip() else []
      if isinstance(value, dict):
          out: List[str] = []
          for v in value.values():
              out.extend(_iter_multilingual_texts(v))
          return out
      if isinstance(value, (list, tuple)):
          out = []
          for v in value:
              out.extend(_iter_multilingual_texts(v))
          return out
      return []