Blame view

search/sku_intent_selector.py 12.2 KB
cda1cd62   tangwang   意图分析&应用 baseline
1
2
3
4
5
6
7
  """
  SKU selection for style-intent-aware search results.
  """
  
  from __future__ import annotations
  
  from dataclasses import dataclass, field
b712a831   tangwang   意图识别策略和性能优化
8
  from typing import Any, Callable, Dict, List, Optional, Tuple
cda1cd62   tangwang   意图分析&应用 baseline
9
10
  
  from query.style_intent import StyleIntentProfile, StyleIntentRegistry
837d5d76   tangwang   sku筛选匹配规则优化,按 tok...
11
  from query.tokenization import normalize_query_text, simple_tokenize_query
cda1cd62   tangwang   意图分析&应用 baseline
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
  
  
  @dataclass(frozen=True)
  class SkuSelectionDecision:
      """Immutable result of trying to pick one SKU for a single hit.

      ``selected_sku_id`` is ``None`` when nothing was picked,
      ``matched_stage`` labels how the decision was reached, and
      ``resolved_dimensions`` maps each intent type to the SKU option field
      it was resolved to (``None`` when unresolved).
      """

      selected_sku_id: Optional[str]
      rerank_suffix: str
      selected_text: str
      matched_stage: str
      similarity_score: Optional[float] = None
      resolved_dimensions: Dict[str, Optional[str]] = field(default_factory=dict)

      def to_dict(self) -> Dict[str, Any]:
          """Return the decision as a plain dict; the dimension map is copied."""
          payload: Dict[str, Any] = {
              name: getattr(self, name)
              for name in (
                  "selected_sku_id",
                  "rerank_suffix",
                  "selected_text",
                  "matched_stage",
                  "similarity_score",
              )
          }
          # Shallow copy so callers cannot mutate the map held by this instance.
          payload["resolved_dimensions"] = dict(self.resolved_dimensions)
          return payload
  
  
  @dataclass
2efad04b   tangwang   意图匹配的性能优化:
35
  class _SelectionContext:
b712a831   tangwang   意图识别策略和性能优化
36
37
      attribute_terms_by_intent: Dict[str, Tuple[str, ...]]
      normalized_text_cache: Dict[str, str] = field(default_factory=dict)
837d5d76   tangwang   sku筛选匹配规则优化,按 tok...
38
      tokenized_text_cache: Dict[str, Tuple[str, ...]] = field(default_factory=dict)
2efad04b   tangwang   意图匹配的性能优化:
39
      text_match_cache: Dict[Tuple[str, str], bool] = field(default_factory=dict)
cda1cd62   tangwang   意图分析&应用 baseline
40
41
42
43
44
45
46
47
48
49
  
  
  class StyleSkuSelector:
      """Selects the best SKU for an SPU based on detected style intent."""
  
      def __init__(
          self,
          registry: StyleIntentRegistry,
          *,
          text_encoder_getter: Optional[Callable[[], Any]] = None,
cda1cd62   tangwang   意图分析&应用 baseline
50
51
52
      ) -> None:
          self.registry = registry
          self._text_encoder_getter = text_encoder_getter
cda1cd62   tangwang   意图分析&应用 baseline
53
54
55
56
57
58
59
60
61
62
63
  
      def prepare_hits(
          self,
          es_hits: List[Dict[str, Any]],
          parsed_query: Any,
      ) -> Dict[str, SkuSelectionDecision]:
          """Compute a SKU decision per hit and tag hits with a rerank suffix.

          Args:
              es_hits: Search hits; each is read via ``_source`` and ``_id``.
              parsed_query: Object that may expose ``style_intent_profile``.

          Returns:
              Decisions keyed by ``str(hit["_id"])``. The mapping is empty when
              the query carries no active ``StyleIntentProfile``. Hits whose
              ``_source`` is not a dict, or that yield no decision, are skipped.

          Side effects:
              Sets ``hit["_style_rerank_suffix"]`` when the decision has a
              non-empty suffix and removes that key otherwise.
          """
          profile = getattr(parsed_query, "style_intent_profile", None)
          if not (isinstance(profile, StyleIntentProfile) and profile.is_active):
              return {}

          context = self._build_selection_context(profile)
          by_doc_id: Dict[str, SkuSelectionDecision] = {}

          for hit in es_hits:
              source = hit.get("_source")
              if not isinstance(source, dict):
                  continue

              decision = self._select_for_source(
                  source,
                  style_profile=profile,
                  selection_context=context,
              )
              if decision is None:
                  continue

              suffix = decision.rerank_suffix
              if suffix:
                  hit["_style_rerank_suffix"] = suffix
              else:
                  # Drop any stale suffix so the hit reflects this decision.
                  hit.pop("_style_rerank_suffix", None)

              hit_id = hit.get("_id")
              if hit_id is not None:
                  by_doc_id[str(hit_id)] = decision

          return by_doc_id
  
      def apply_precomputed_decisions(
          self,
          es_hits: List[Dict[str, Any]],
          decisions: Dict[str, SkuSelectionDecision],
      ) -> None:
          if not es_hits or not decisions:
              return
  
          for hit in es_hits:
              doc_id = hit.get("_id")
              if doc_id is None:
                  continue
              decision = decisions.get(str(doc_id))
              if decision is None:
                  continue
              source = hit.get("_source")
              if not isinstance(source, dict):
                  continue
              self._apply_decision_to_source(source, decision)
              if decision.rerank_suffix:
                  hit["_style_rerank_suffix"] = decision.rerank_suffix
2efad04b   tangwang   意图匹配的性能优化:
111
112
              else:
                  hit.pop("_style_rerank_suffix", None)
cda1cd62   tangwang   意图分析&应用 baseline
113
  
2efad04b   tangwang   意图匹配的性能优化:
114
115
      def _build_selection_context(
          self,
          style_profile: StyleIntentProfile,
      ) -> _SelectionContext:
          """Build the selection context for one style profile.

          Attribute terms of all intents sharing an intent type are normalized,
          merged, and de-duplicated in first-seen order; terms that normalize
          to an empty value are dropped. An intent type with no usable terms
          still gets an (empty) entry.
          """
          # Insertion-ordered dict keys act as an ordered set of terms.
          ordered_terms: Dict[str, Dict[str, None]] = {}
          for intent in style_profile.intents:
              bucket = ordered_terms.setdefault(intent.intent_type, {})
              for raw_term in intent.attribute_terms:
                  term = normalize_query_text(raw_term)
                  if term:
                      bucket.setdefault(term, None)

          terms_by_intent = {
              intent_type: tuple(bucket)
              for intent_type, bucket in ordered_terms.items()
          }
          return _SelectionContext(attribute_terms_by_intent=terms_by_intent)
  
b712a831   tangwang   意图识别策略和性能优化
134
135
136
137
138
139
140
141
142
143
144
      @staticmethod
      def _normalize_cached(selection_context: _SelectionContext, value: Any) -> str:
          raw = str(value or "").strip()
          if not raw:
              return ""
          cached = selection_context.normalized_text_cache.get(raw)
          if cached is not None:
              return cached
          normalized = normalize_query_text(raw)
          selection_context.normalized_text_cache[raw] = normalized
          return normalized
cda1cd62   tangwang   意图分析&应用 baseline
145
  
cda1cd62   tangwang   意图分析&应用 baseline
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
      def _resolve_dimensions(
          self,
          source: Dict[str, Any],
          style_profile: StyleIntentProfile,
      ) -> Dict[str, Optional[str]]:
          """Map each intent type to the SKU option field that carries it.

          The source's ``option{1,2,3}_name`` values are normalized and compared
          against the intent's dimension aliases (falling back to the registry's
          aliases for that intent type). The first option slot whose name is a
          known alias wins; ``None`` is recorded when no slot matches. Only the
          first intent of each type is considered.
          """
          slots = (
              ("option1_value", normalize_query_text(source.get("option1_name"))),
              ("option2_value", normalize_query_text(source.get("option2_name"))),
              ("option3_value", normalize_query_text(source.get("option3_name"))),
          )

          result: Dict[str, Optional[str]] = {}
          for intent in style_profile.intents:
              if intent.intent_type in result:
                  continue
              alias_set = set(
                  intent.dimension_aliases
                  or self.registry.get_dimension_aliases(intent.intent_type)
              )
              result[intent.intent_type] = next(
                  (
                      value_field
                      for value_field, option_name in slots
                      if option_name and option_name in alias_set
                  ),
                  None,
              )
          return result
  
cda1cd62   tangwang   意图分析&应用 baseline
169
      @staticmethod
      def _empty_decision(
          resolved_dimensions: Dict[str, Optional[str]],
          matched_stage: str,
      ) -> SkuSelectionDecision:
          """Build a decision that selects no SKU.

          The decision has no SKU id, empty suffix and text, the given
          ``matched_stage``, and a copy of ``resolved_dimensions``.
          """
          dimensions_copy = dict(resolved_dimensions)
          return SkuSelectionDecision(
              matched_stage=matched_stage,
              resolved_dimensions=dimensions_copy,
              selected_sku_id=None,
              selected_text="",
              rerank_suffix="",
          )
  
      def _is_text_match(
cda1cd62   tangwang   意图分析&应用 baseline
183
          self,
2efad04b   tangwang   意图匹配的性能优化:
184
          intent_type: str,
2efad04b   tangwang   意图匹配的性能优化:
185
          selection_context: _SelectionContext,
6adbf18a   tangwang   reranker提示词优化
186
          *,
b712a831   tangwang   意图识别策略和性能优化
187
          normalized_value: str,
cda1cd62   tangwang   意图分析&应用 baseline
188
      ) -> bool:
2efad04b   tangwang   意图匹配的性能优化:
189
          if not normalized_value:
cda1cd62   tangwang   意图分析&应用 baseline
190
191
              return False
  
2efad04b   tangwang   意图匹配的性能优化:
192
193
194
195
196
          cache_key = (intent_type, normalized_value)
          cached = selection_context.text_match_cache.get(cache_key)
          if cached is not None:
              return cached
  
b712a831   tangwang   意图识别策略和性能优化
197
          attribute_terms = selection_context.attribute_terms_by_intent.get(intent_type, ())
837d5d76   tangwang   sku筛选匹配规则优化,按 tok...
198
199
200
201
202
203
204
205
206
207
208
          value_tokens = self._tokenize_cached(selection_context, normalized_value)
          matched = any(
              self._matches_term_tokens(
                  term=term,
                  value_tokens=value_tokens,
                  selection_context=selection_context,
                  normalized_value=normalized_value,
              )
              for term in attribute_terms
              if term
          )
2efad04b   tangwang   意图匹配的性能优化:
209
210
211
          selection_context.text_match_cache[cache_key] = matched
          return matched
  
837d5d76   tangwang   sku筛选匹配规则优化,按 tok...
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
      @staticmethod
      def _tokenize_cached(selection_context: _SelectionContext, value: str) -> Tuple[str, ...]:
          """Tokenize ``value`` into normalized tokens, memoized on the context.

          ``value`` is normalized first and that normalized form is the cache
          key. A value that normalizes to nothing yields ``()`` and is not
          cached. Empty tokens from the tokenizer are discarded before each
          remaining token is normalized.
          """
          key = normalize_query_text(value)
          if not key:
              return ()

          cache = selection_context.tokenized_text_cache
          tokens = cache.get(key)
          if tokens is None:
              tokens = tuple(
                  normalize_query_text(piece)
                  for piece in simple_tokenize_query(key)
                  if piece
              )
              cache[key] = tokens
          return tokens
  
      def _matches_term_tokens(
          self,
          *,
          term: str,
          value_tokens: Tuple[str, ...],
          selection_context: _SelectionContext,
          normalized_value: str,
      ) -> bool:
          """Check whether ``term`` occurs in a value as a run of whole tokens.

          An exact match of the normalized term against ``normalized_value``
          always succeeds. When either side has no tokens, the check falls back
          to a plain substring test. Otherwise the term's tokens must appear as
          a contiguous slice of ``value_tokens``.
          """
          wanted = normalize_query_text(term)
          if not wanted:
              return False
          if wanted == normalized_value:
              return True

          wanted_tokens = self._tokenize_cached(selection_context, wanted)
          if not (wanted_tokens and value_tokens):
              return wanted in normalized_value

          width = len(wanted_tokens)
          # A negative bound (term longer than value) gives an empty range,
          # so ``any`` returns False without comparing anything.
          last_start = len(value_tokens) - width
          return any(
              value_tokens[offset:offset + width] == wanted_tokens
              for offset in range(last_start + 1)
          )
  
2efad04b   tangwang   意图匹配的性能优化:
252
253
      def _find_first_text_match(
          self,
b712a831   tangwang   意图识别策略和性能优化
254
255
          skus: List[Dict[str, Any]],
          resolved_dimensions: Dict[str, Optional[str]],
2efad04b   tangwang   意图匹配的性能优化:
256
          selection_context: _SelectionContext,
b712a831   tangwang   意图识别策略和性能优化
257
258
259
260
261
      ) -> Optional[Tuple[str, str]]:
          for sku in skus:
              selection_parts: List[str] = []
              seen_parts: set[str] = set()
              matched = True
cda1cd62   tangwang   意图分析&应用 baseline
262
  
b712a831   tangwang   意图识别策略和性能优化
263
264
265
266
              for intent_type, field_name in resolved_dimensions.items():
                  if not field_name:
                      matched = False
                      break
2efad04b   tangwang   意图匹配的性能优化:
267
  
b712a831   tangwang   意图识别策略和性能优化
268
269
270
271
272
273
274
275
276
                  raw_value = str(sku.get(field_name) or "").strip()
                  normalized_value = self._normalize_cached(selection_context, raw_value)
                  if not self._is_text_match(
                      intent_type,
                      selection_context,
                      normalized_value=normalized_value,
                  ):
                      matched = False
                      break
2efad04b   tangwang   意图匹配的性能优化:
277
  
b712a831   tangwang   意图识别策略和性能优化
278
279
280
                  if raw_value and normalized_value not in seen_parts:
                      seen_parts.add(normalized_value)
                      selection_parts.append(raw_value)
cda1cd62   tangwang   意图分析&应用 baseline
281
  
b712a831   tangwang   意图识别策略和性能优化
282
283
284
              if matched:
                  return str(sku.get("sku_id") or ""), " ".join(selection_parts).strip()
          return None
cda1cd62   tangwang   意图分析&应用 baseline
285
286
287
288
289
290
  
      def _select_for_source(
          self,
          source: Dict[str, Any],
          *,
          style_profile: StyleIntentProfile,
2efad04b   tangwang   意图匹配的性能优化:
291
          selection_context: _SelectionContext,
cda1cd62   tangwang   意图分析&应用 baseline
292
293
294
295
296
297
      ) -> Optional[SkuSelectionDecision]:
          skus = source.get("skus")
          if not isinstance(skus, list) or not skus:
              return None
  
          resolved_dimensions = self._resolve_dimensions(source, style_profile)
2efad04b   tangwang   意图匹配的性能优化:
298
299
300
          if not resolved_dimensions or any(not field_name for field_name in resolved_dimensions.values()):
              return self._empty_decision(resolved_dimensions, matched_stage="unresolved")
  
b712a831   tangwang   意图识别策略和性能优化
301
302
          text_match = self._find_first_text_match(skus, resolved_dimensions, selection_context)
          if text_match is None:
2efad04b   tangwang   意图匹配的性能优化:
303
              return self._empty_decision(resolved_dimensions, matched_stage="no_match")
cda1cd62   tangwang   意图分析&应用 baseline
304
          return self._build_decision(
b712a831   tangwang   意图识别策略和性能优化
305
306
307
308
              selected_sku_id=text_match[0],
              selected_text=text_match[1],
              resolved_dimensions=resolved_dimensions,
              matched_stage="text",
cda1cd62   tangwang   意图分析&应用 baseline
309
310
311
312
          )
  
      @staticmethod
      def _build_decision(
          selected_sku_id: str,
          selected_text: str,
          resolved_dimensions: Dict[str, Optional[str]],
          *,
          matched_stage: str,
          similarity_score: Optional[float] = None,
      ) -> SkuSelectionDecision:
          """Assemble a decision for a selected SKU.

          An empty ``selected_sku_id`` is stored as ``None``. The stripped
          ``selected_text`` is used both as the selected text and as the rerank
          suffix, and ``resolved_dimensions`` is copied.
          """
          text = str(selected_text or "").strip()
          sku_id = selected_sku_id or None
          return SkuSelectionDecision(
              selected_sku_id=sku_id,
              rerank_suffix=text,
              selected_text=text,
              matched_stage=matched_stage,
              similarity_score=similarity_score,
              resolved_dimensions=dict(resolved_dimensions),
          )
  
      @staticmethod
      def _apply_decision_to_source(source: Dict[str, Any], decision: SkuSelectionDecision) -> None:
          skus = source.get("skus")
          if not isinstance(skus, list) or not skus or not decision.selected_sku_id:
              return
  
          selected_index = None
          for index, sku in enumerate(skus):
              if str(sku.get("sku_id") or "") == decision.selected_sku_id:
                  selected_index = index
                  break
          if selected_index is None:
              return
  
          selected_sku = skus.pop(selected_index)
          skus.insert(0, selected_sku)
  
          image_src = selected_sku.get("image_src") or selected_sku.get("imageSrc")
          if image_src:
              source["image_url"] = image_src