Blame view

search/sku_intent_selector.py 10.3 KB
cda1cd62   tangwang   意图分析&应用 baseline
1
2
3
4
5
6
7
  """
  SKU selection for style-intent-aware search results.
  """
  
  from __future__ import annotations
  
  from dataclasses import dataclass, field
b712a831   tangwang   意图识别策略和性能优化
8
  from typing import Any, Callable, Dict, List, Optional, Tuple
cda1cd62   tangwang   意图分析&应用 baseline
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
  
  from query.style_intent import StyleIntentProfile, StyleIntentRegistry
  from query.tokenization import normalize_query_text
  
  
  @dataclass(frozen=True)
  class SkuSelectionDecision:
      """Immutable outcome of SKU selection for a single search hit.

      ``selected_sku_id`` is ``None`` when no SKU was chosen.
      ``resolved_dimensions`` maps an intent type to the SKU option field
      that carries it, or to ``None`` when no field was resolved.
      """

      selected_sku_id: Optional[str]
      rerank_suffix: str
      selected_text: str
      matched_stage: str
      similarity_score: Optional[float] = None
      resolved_dimensions: Dict[str, Optional[str]] = field(default_factory=dict)

      def to_dict(self) -> Dict[str, Any]:
          """Return the decision as a plain dict.

          ``resolved_dimensions`` is shallow-copied so the caller cannot
          mutate the mapping held by this frozen instance.
          """
          scalar_fields = (
              "selected_sku_id",
              "rerank_suffix",
              "selected_text",
              "matched_stage",
              "similarity_score",
          )
          payload: Dict[str, Any] = {name: getattr(self, name) for name in scalar_fields}
          payload["resolved_dimensions"] = dict(self.resolved_dimensions)
          return payload
  
  
  @dataclass
  class _SelectionContext:
      """Mutable scratch state built once per ``prepare_hits`` call and shared by all hits."""

      # Normalized attribute terms, de-duplicated and grouped by intent type.
      attribute_terms_by_intent: Dict[str, Tuple[str, ...]]

      # Memo: stripped raw text -> its normalized form.
      normalized_text_cache: Dict[str, str] = field(default_factory=dict)

      # Memo: (intent type, normalized value) -> whether any attribute term matched.
      text_match_cache: Dict[Tuple[str, str], bool] = field(default_factory=dict)
cda1cd62   tangwang   意图分析&应用 baseline
39
40
41
42
43
44
45
46
47
48
  
  
  class StyleSkuSelector:
      """Selects the best SKU for an SPU based on detected style intent."""
  
      def __init__(
          self,
          registry: StyleIntentRegistry,
          *,
          text_encoder_getter: Optional[Callable[[], Any]] = None,
cda1cd62   tangwang   意图分析&应用 baseline
49
50
51
      ) -> None:
          self.registry = registry
          self._text_encoder_getter = text_encoder_getter
cda1cd62   tangwang   意图分析&应用 baseline
52
53
54
55
56
57
58
59
60
61
62
  
      def prepare_hits(
          self,
          es_hits: List[Dict[str, Any]],
          parsed_query: Any,
      ) -> Dict[str, SkuSelectionDecision]:
          """Compute a SKU decision per hit and tag each hit with its rerank suffix.

          Nothing is done (and an empty dict is returned) unless
          ``parsed_query.style_intent_profile`` is an active
          ``StyleIntentProfile``.

          For each hit whose ``_source`` is a dict and that yields a
          decision, ``hit["_style_rerank_suffix"]`` is set to the decision's
          suffix, or removed when the suffix is empty. The SKU list itself
          is not reordered here.

          Returns:
              Decisions keyed by ``str(hit["_id"])``; hits without an ``_id``
              are tagged but not recorded.
          """
          profile = getattr(parsed_query, "style_intent_profile", None)
          if not (isinstance(profile, StyleIntentProfile) and profile.is_active):
              return {}

          # One context for the whole batch so the memo caches are shared across hits.
          context = self._build_selection_context(profile)
          decisions_by_id: Dict[str, SkuSelectionDecision] = {}

          for hit in es_hits:
              source = hit.get("_source")
              if not isinstance(source, dict):
                  continue

              decision = self._select_for_source(
                  source,
                  style_profile=profile,
                  selection_context=context,
              )
              if decision is None:
                  continue

              suffix = decision.rerank_suffix
              if suffix:
                  hit["_style_rerank_suffix"] = suffix
              else:
                  # Drop any stale suffix left on the hit.
                  hit.pop("_style_rerank_suffix", None)

              doc_id = hit.get("_id")
              if doc_id is None:
                  continue
              decisions_by_id[str(doc_id)] = decision

          return decisions_by_id
  
      def apply_precomputed_decisions(
          self,
          es_hits: List[Dict[str, Any]],
          decisions: Dict[str, SkuSelectionDecision],
      ) -> None:
          if not es_hits or not decisions:
              return
  
          for hit in es_hits:
              doc_id = hit.get("_id")
              if doc_id is None:
                  continue
              decision = decisions.get(str(doc_id))
              if decision is None:
                  continue
              source = hit.get("_source")
              if not isinstance(source, dict):
                  continue
              self._apply_decision_to_source(source, decision)
              if decision.rerank_suffix:
                  hit["_style_rerank_suffix"] = decision.rerank_suffix
2efad04b   tangwang   意图匹配的性能优化:
110
111
              else:
                  hit.pop("_style_rerank_suffix", None)
cda1cd62   tangwang   意图分析&应用 baseline
112
  
2efad04b   tangwang   意图匹配的性能优化:
113
114
      def _build_selection_context(
          self,
          style_profile: StyleIntentProfile,
      ) -> _SelectionContext:
          """Build the per-batch context holding normalized attribute terms.

          Terms from every intent are normalized, empty results are dropped,
          and duplicates are removed per intent type while keeping
          first-seen order. An intent type with no usable terms still gets
          an (empty) entry.
          """
          # A dict is used as an insertion-ordered set for each intent type.
          ordered_terms: Dict[str, Dict[str, None]] = {}
          for intent in style_profile.intents:
              bucket = ordered_terms.setdefault(intent.intent_type, {})
              for raw_term in intent.attribute_terms:
                  term = normalize_query_text(raw_term)
                  if term:
                      bucket.setdefault(term, None)

          frozen_terms = {
              intent_type: tuple(bucket)
              for intent_type, bucket in ordered_terms.items()
          }
          return _SelectionContext(attribute_terms_by_intent=frozen_terms)
  
b712a831   tangwang   意图识别策略和性能优化
133
134
135
136
137
138
139
140
141
142
143
      @staticmethod
      def _normalize_cached(selection_context: _SelectionContext, value: Any) -> str:
          raw = str(value or "").strip()
          if not raw:
              return ""
          cached = selection_context.normalized_text_cache.get(raw)
          if cached is not None:
              return cached
          normalized = normalize_query_text(raw)
          selection_context.normalized_text_cache[raw] = normalized
          return normalized
cda1cd62   tangwang   意图分析&应用 baseline
144
  
cda1cd62   tangwang   意图分析&应用 baseline
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
      def _resolve_dimensions(
          self,
          source: Dict[str, Any],
          style_profile: StyleIntentProfile,
      ) -> Dict[str, Optional[str]]:
          """Map each intent type to the SKU option field that represents it.

          The normalized ``option{1,2,3}_name`` of the source is compared
          against the intent's dimension aliases (falling back to the
          registry's aliases when the intent has none). The first option
          slot whose name is an alias wins.

          Returns:
              ``{intent_type: "optionN_value" or None}``; only the first
              intent of each type is considered.
          """
          name_to_value_field = (
              ("option1_name", "option1_value"),
              ("option2_name", "option2_value"),
              ("option3_name", "option3_value"),
          )
          candidates = [
              (value_field, normalize_query_text(source.get(name_field)))
              for name_field, value_field in name_to_value_field
          ]

          resolved: Dict[str, Optional[str]] = {}
          for intent in style_profile.intents:
              intent_type = intent.intent_type
              if intent_type in resolved:
                  continue
              aliases = set(intent.dimension_aliases or self.registry.get_dimension_aliases(intent_type))
              resolved[intent_type] = next(
                  (
                      value_field
                      for value_field, option_name in candidates
                      if option_name and option_name in aliases
                  ),
                  None,
              )
          return resolved
  
cda1cd62   tangwang   意图分析&应用 baseline
168
      @staticmethod
      def _empty_decision(
          resolved_dimensions: Dict[str, Optional[str]],
          matched_stage: str,
      ) -> SkuSelectionDecision:
          """Build a decision that selects no SKU.

          The result has no SKU id, empty suffix and text, the given
          ``matched_stage``, and a shallow copy of ``resolved_dimensions``.
          """
          dimensions_snapshot = dict(resolved_dimensions)
          return SkuSelectionDecision(
              matched_stage=matched_stage,
              resolved_dimensions=dimensions_snapshot,
              selected_sku_id=None,
              selected_text="",
              rerank_suffix="",
          )
  
      def _is_text_match(
cda1cd62   tangwang   意图分析&应用 baseline
182
          self,
2efad04b   tangwang   意图匹配的性能优化:
183
          intent_type: str,
2efad04b   tangwang   意图匹配的性能优化:
184
          selection_context: _SelectionContext,
6adbf18a   tangwang   reranker提示词优化
185
          *,
b712a831   tangwang   意图识别策略和性能优化
186
          normalized_value: str,
cda1cd62   tangwang   意图分析&应用 baseline
187
      ) -> bool:
2efad04b   tangwang   意图匹配的性能优化:
188
          if not normalized_value:
cda1cd62   tangwang   意图分析&应用 baseline
189
190
              return False
  
2efad04b   tangwang   意图匹配的性能优化:
191
192
193
194
195
          cache_key = (intent_type, normalized_value)
          cached = selection_context.text_match_cache.get(cache_key)
          if cached is not None:
              return cached
  
b712a831   tangwang   意图识别策略和性能优化
196
197
          attribute_terms = selection_context.attribute_terms_by_intent.get(intent_type, ())
          matched = any(term in normalized_value for term in attribute_terms if term)
2efad04b   tangwang   意图匹配的性能优化:
198
199
200
201
202
          selection_context.text_match_cache[cache_key] = matched
          return matched
  
      def _find_first_text_match(
          self,
b712a831   tangwang   意图识别策略和性能优化
203
204
          skus: List[Dict[str, Any]],
          resolved_dimensions: Dict[str, Optional[str]],
2efad04b   tangwang   意图匹配的性能优化:
205
          selection_context: _SelectionContext,
b712a831   tangwang   意图识别策略和性能优化
206
207
208
209
210
      ) -> Optional[Tuple[str, str]]:
          for sku in skus:
              selection_parts: List[str] = []
              seen_parts: set[str] = set()
              matched = True
cda1cd62   tangwang   意图分析&应用 baseline
211
  
b712a831   tangwang   意图识别策略和性能优化
212
213
214
215
              for intent_type, field_name in resolved_dimensions.items():
                  if not field_name:
                      matched = False
                      break
2efad04b   tangwang   意图匹配的性能优化:
216
  
b712a831   tangwang   意图识别策略和性能优化
217
218
219
220
221
222
223
224
225
                  raw_value = str(sku.get(field_name) or "").strip()
                  normalized_value = self._normalize_cached(selection_context, raw_value)
                  if not self._is_text_match(
                      intent_type,
                      selection_context,
                      normalized_value=normalized_value,
                  ):
                      matched = False
                      break
2efad04b   tangwang   意图匹配的性能优化:
226
  
b712a831   tangwang   意图识别策略和性能优化
227
228
229
                  if raw_value and normalized_value not in seen_parts:
                      seen_parts.add(normalized_value)
                      selection_parts.append(raw_value)
cda1cd62   tangwang   意图分析&应用 baseline
230
  
b712a831   tangwang   意图识别策略和性能优化
231
232
233
              if matched:
                  return str(sku.get("sku_id") or ""), " ".join(selection_parts).strip()
          return None
cda1cd62   tangwang   意图分析&应用 baseline
234
235
236
237
238
239
  
      def _select_for_source(
          self,
          source: Dict[str, Any],
          *,
          style_profile: StyleIntentProfile,
2efad04b   tangwang   意图匹配的性能优化:
240
          selection_context: _SelectionContext,
cda1cd62   tangwang   意图分析&应用 baseline
241
242
243
244
245
246
      ) -> Optional[SkuSelectionDecision]:
          skus = source.get("skus")
          if not isinstance(skus, list) or not skus:
              return None
  
          resolved_dimensions = self._resolve_dimensions(source, style_profile)
2efad04b   tangwang   意图匹配的性能优化:
247
248
249
          if not resolved_dimensions or any(not field_name for field_name in resolved_dimensions.values()):
              return self._empty_decision(resolved_dimensions, matched_stage="unresolved")
  
b712a831   tangwang   意图识别策略和性能优化
250
251
          text_match = self._find_first_text_match(skus, resolved_dimensions, selection_context)
          if text_match is None:
2efad04b   tangwang   意图匹配的性能优化:
252
              return self._empty_decision(resolved_dimensions, matched_stage="no_match")
cda1cd62   tangwang   意图分析&应用 baseline
253
          return self._build_decision(
b712a831   tangwang   意图识别策略和性能优化
254
255
256
257
              selected_sku_id=text_match[0],
              selected_text=text_match[1],
              resolved_dimensions=resolved_dimensions,
              matched_stage="text",
cda1cd62   tangwang   意图分析&应用 baseline
258
259
260
261
          )
  
      @staticmethod
      def _build_decision(
          selected_sku_id: str,
          selected_text: str,
          resolved_dimensions: Dict[str, Optional[str]],
          *,
          matched_stage: str,
          similarity_score: Optional[float] = None,
      ) -> SkuSelectionDecision:
          """Build a decision for a selected SKU.

          The stripped ``selected_text`` is used for both ``selected_text``
          and ``rerank_suffix``. An empty ``selected_sku_id`` is stored as
          ``None``, and ``resolved_dimensions`` is shallow-copied.
          """
          cleaned_text = str(selected_text or "").strip()
          return SkuSelectionDecision(
              selected_sku_id=selected_sku_id or None,
              rerank_suffix=cleaned_text,
              selected_text=cleaned_text,
              matched_stage=matched_stage,
              similarity_score=similarity_score,
              resolved_dimensions=dict(resolved_dimensions),
          )
  
      @staticmethod
      def _apply_decision_to_source(source: Dict[str, Any], decision: SkuSelectionDecision) -> None:
          skus = source.get("skus")
          if not isinstance(skus, list) or not skus or not decision.selected_sku_id:
              return
  
          selected_index = None
          for index, sku in enumerate(skus):
              if str(sku.get("sku_id") or "") == decision.selected_sku_id:
                  selected_index = index
                  break
          if selected_index is None:
              return
  
          selected_sku = skus.pop(selected_index)
          skus.insert(0, selected_sku)
  
          image_src = selected_sku.get("image_src") or selected_sku.get("imageSrc")
          if image_src:
              source["image_url"] = image_src