Commit ed13851cd0575d1033bbf5b8e66840031a869660

Authored by tangwang
1 parent 1681a135

图片文本两个knn召回相关参数配置

config/config.yaml
... ... @@ -206,8 +206,18 @@ query_config:
206 206 - specifications
207 207 - skus
208 208  
209   - # KNN boost配置(向量召回的boost值)
210   - knn_boost: 2.0 # Lower boost for embedding recall
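  209 + # KNN: separate boost and recall size (k / num_candidates) for the text-vector and multimodal (image) vector clauses; the *_long values apply to text queries with >= 5 query tokens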
  210 + knn_text_boost: 20
  211 + knn_image_boost: 20
  212 +
  213 + knn_text_k: 150
  214 + knn_text_num_candidates: 400
  215 +
  216 + knn_text_k_long: 300
  217 + knn_text_num_candidates_long: 720
  218 +
  219 + knn_image_k: 300
  220 + knn_image_num_candidates: 720
211 221  
212 222 # Function Score配置(ES层打分规则)
213 223 function_score:
... ...
config/loader.py
... ... @@ -376,7 +376,20 @@ class AppConfigLoader:
376 376 text_embedding_field=query_cfg.get("text_embedding_field"),
377 377 image_embedding_field=query_cfg.get("image_embedding_field"),
378 378 source_fields=query_cfg.get("source_fields"),
379   - knn_boost=float(query_cfg.get("knn_boost", 0.25)),
  379 + knn_text_boost=float(
  380 + query_cfg.get("knn_text_boost", query_cfg.get("knn_boost", 0.25))
  381 + ),
  382 + knn_image_boost=float(
  383 + query_cfg.get("knn_image_boost", query_cfg.get("knn_boost", 0.25))
  384 + ),
  385 + knn_text_k=int(query_cfg.get("knn_text_k", 120)),
  386 + knn_text_num_candidates=int(query_cfg.get("knn_text_num_candidates", 400)),
  387 + knn_text_k_long=int(query_cfg.get("knn_text_k_long", 160)),
  388 + knn_text_num_candidates_long=int(
  389 + query_cfg.get("knn_text_num_candidates_long", 500)
  390 + ),
  391 + knn_image_k=int(query_cfg.get("knn_image_k", 120)),
  392 + knn_image_num_candidates=int(query_cfg.get("knn_image_num_candidates", 400)),
380 393 multilingual_fields=list(
381 394 search_fields.get(
382 395 "multilingual_fields",
... ...
config/schema.py
... ... @@ -34,7 +34,15 @@ class QueryConfig:
34 34 text_embedding_field: Optional[str] = "title_embedding"
35 35 image_embedding_field: Optional[str] = None
36 36 source_fields: Optional[List[str]] = None
37   - knn_boost: float = 0.25
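  37 + # Separate boosts for text-vector KNN and multimodal (image) vector KNN; when a key is absent from YAML the loader falls back to the legacy knn_boost, and to 0.25 (not the 20.0 default declared here) when that is absent too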
  38 + knn_text_boost: float = 20.0
  39 + knn_image_boost: float = 20.0
  40 + knn_text_k: int = 120
  41 + knn_text_num_candidates: int = 400
  42 + knn_text_k_long: int = 160
  43 + knn_text_num_candidates_long: int = 500
  44 + knn_image_k: int = 120
  45 + knn_image_num_candidates: int = 400
38 46 multilingual_fields: List[str] = field(
39 47 default_factory=lambda: []
40 48 )
... ...
docs/常用查询 - sql.sql
... ... @@ -584,18 +584,14 @@ SELECT 'shoplazza_product_index_increment', COUNT(*) FROM shoplazza_product_inde
584 584 "
585 585  
586 586 # 执行删除
587   -cd /data/saas-search && MYSQL_PWD='qY8tgodLoA&KT#yQ' mysql -h 10.200.16.14 -P 3316 -u root saas -N -e "
588   -SELECT 'shoplazza_sync_log', COUNT(*) FROM shoplazza_sync_log WHERE tenant_id = 163
589   -UNION ALL
590   -SELECT 'shoplazza_product_index_increment', COUNT(*) FROM shoplazza_product_index_increment WHERE tenant_id = 163;
  587 +cd /data/saas-search && MYSQL_PWD='qY8tgodLoA&KT#yQ' mysql -h 10.200.16.14 -P 3316 -u root saas -e "
  588 +SET SESSION sql_safe_updates = 0;
  589 +DELETE FROM shoplazza_sync_log WHERE tenant_id = 163;
  590 +SELECT ROW_COUNT() AS deleted_sync_log;
  591 +DELETE FROM shoplazza_product_index_increment WHERE tenant_id = 163;
  592 +SELECT ROW_COUNT() AS deleted_index_increment;
591 593 "
592 594  
593   -# 再次统计 tenant_id=163 的行数
594   -MYSQL_PWD='qY8tgodLoA&KT#yQ' mysql -h 10.200.16.14 -P 3316 -u root saas -N -e "
595   -SELECT 'shoplazza_sync_log', COUNT(*) FROM shoplazza_sync_log WHERE tenant_id = 163
596   -UNION ALL
597   -SELECT 'shoplazza_product_index_increment', COUNT(*) FROM shoplazza_product_index_increment WHERE tenant_id = 163;
598   -"
599 595 ```
600 596  
601 597 然后触发重新安装:
... ...
search/es_query_builder.py
... ... @@ -29,7 +29,14 @@ class ESQueryBuilder:
29 29 source_fields: Optional[List[str]] = None,
30 30 function_score_config: Optional[FunctionScoreConfig] = None,
31 31 default_language: str = "en",
32   - knn_boost: float = 0.25,
  32 + knn_text_boost: float = 20.0,
  33 + knn_image_boost: float = 20.0,
  34 + knn_text_k: int = 120,
  35 + knn_text_num_candidates: int = 400,
  36 + knn_text_k_long: int = 160,
  37 + knn_text_num_candidates_long: int = 500,
  38 + knn_image_k: int = 120,
  39 + knn_image_num_candidates: int = 400,
33 40 base_minimum_should_match: str = "70%",
34 41 translation_minimum_should_match: str = "70%",
35 42 translation_boost: float = 0.4,
... ... @@ -55,7 +62,8 @@ class ESQueryBuilder:
55 62 source_fields: Fields to return in search results (_source includes)
56 63 function_score_config: Function score configuration
57 64 default_language: Default language to use when detection fails or returns "unknown"
58   - knn_boost: Boost value for KNN (embedding recall)
  65 + knn_text_boost: Boost for text-embedding KNN clause
  66 + knn_image_boost: Boost for image-embedding KNN clause
59 67 """
60 68 self.match_fields = match_fields
61 69 self.field_boosts = field_boosts or {}
... ... @@ -67,7 +75,14 @@ class ESQueryBuilder:
67 75 self.source_fields = source_fields
68 76 self.function_score_config = function_score_config
69 77 self.default_language = default_language
70   - self.knn_boost = knn_boost
  78 + self.knn_text_boost = float(knn_text_boost)
  79 + self.knn_image_boost = float(knn_image_boost)
  80 + self.knn_text_k = int(knn_text_k)
  81 + self.knn_text_num_candidates = int(knn_text_num_candidates)
  82 + self.knn_text_k_long = int(knn_text_k_long)
  83 + self.knn_text_num_candidates_long = int(knn_text_num_candidates_long)
  84 + self.knn_image_k = int(knn_image_k)
  85 + self.knn_image_num_candidates = int(knn_image_num_candidates)
71 86 self.base_minimum_should_match = base_minimum_should_match
72 87 self.translation_minimum_should_match = translation_minimum_should_match
73 88 self.translation_boost = float(translation_boost)
... ... @@ -171,8 +186,6 @@ class ESQueryBuilder:
171 186 size: int = 10,
172 187 from_: int = 0,
173 188 enable_knn: bool = True,
174   - knn_k: int = 50,
175   - knn_num_candidates: int = 200,
176 189 min_score: Optional[float] = None,
177 190 parsed_query: Optional[Any] = None,
178 191 ) -> Dict[str, Any]:
... ... @@ -195,8 +208,6 @@ class ESQueryBuilder:
195 208 size: Number of results
196 209 from_: Offset for pagination
197 210 enable_knn: Whether to use KNN search
198   - knn_k: K value for KNN
199   - knn_num_candidates: Number of candidates for KNN
200 211 min_score: Minimum score threshold
201 212  
202 213 Returns:
... ... @@ -234,41 +245,37 @@ class ESQueryBuilder:
234 245 filter_clauses.append(product_title_exclusion_filter)
235 246  
236 247 # 3. Add KNN search clauses alongside lexical clauses under the same bool.should
237   - # Adjust KNN k, num_candidates, boost by query_tokens (short query: less KNN; long: more)
238   - final_knn_k, final_knn_num_candidates = knn_k, knn_num_candidates
  248 + # Text KNN: k / num_candidates from config; long queries (>= 5 query tokens) use the *_long values and a 1.4x boost
239 249 if has_embedding:
240   - knn_boost = self.knn_boost
  250 + text_knn_boost = self.knn_text_boost
  251 + final_knn_k = self.knn_text_k
  252 + final_knn_num_candidates = self.knn_text_num_candidates
241 253 if parsed_query:
242 254 query_tokens = getattr(parsed_query, 'query_tokens', None) or []
243 255 token_count = len(query_tokens)
244 256 if token_count >= 5:
245   - final_knn_k, final_knn_num_candidates = 160, 500
246   - knn_boost = self.knn_boost * 1.4 # Higher weight for long queries
247   - else:
248   - final_knn_k, final_knn_num_candidates = 120, 400
249   - else:
250   - final_knn_k, final_knn_num_candidates = 120, 400
  257 + final_knn_k = self.knn_text_k_long
  258 + final_knn_num_candidates = self.knn_text_num_candidates_long
  259 + text_knn_boost = self.knn_text_boost * 1.4
251 260 recall_clauses.append({
252 261 "knn": {
253 262 "field": self.text_embedding_field,
254 263 "query_vector": query_vector.tolist(),
255 264 "k": final_knn_k,
256 265 "num_candidates": final_knn_num_candidates,
257   - "boost": knn_boost,
  266 + "boost": text_knn_boost,
258 267 "_name": "knn_query",
259 268 }
260 269 })
261 270  
262 271 if has_image_embedding:
263   - image_knn_k = max(final_knn_k, 120)
264   - image_knn_num_candidates = max(final_knn_num_candidates, 400)
265 272 recall_clauses.append({
266 273 "knn": {
267 274 "field": self.image_embedding_field,
268 275 "query_vector": image_query_vector.tolist(),
269   - "k": image_knn_k,
270   - "num_candidates": image_knn_num_candidates,
271   - "boost": self.knn_boost,
  276 + "k": self.knn_image_k,
  277 + "num_candidates": self.knn_image_num_candidates,
  278 + "boost": self.knn_image_boost,
272 279 "_name": "image_knn_query",
273 280 }
274 281 })
... ...
search/searcher.py
... ... @@ -133,7 +133,14 @@ class Searcher:
133 133 source_fields=self.source_fields,
134 134 function_score_config=self.config.function_score,
135 135 default_language=self.config.query_config.default_language,
136   - knn_boost=self.config.query_config.knn_boost,
  136 + knn_text_boost=self.config.query_config.knn_text_boost,
  137 + knn_image_boost=self.config.query_config.knn_image_boost,
  138 + knn_text_k=self.config.query_config.knn_text_k,
  139 + knn_text_num_candidates=self.config.query_config.knn_text_num_candidates,
  140 + knn_text_k_long=self.config.query_config.knn_text_k_long,
  141 + knn_text_num_candidates_long=self.config.query_config.knn_text_num_candidates_long,
  142 + knn_image_k=self.config.query_config.knn_image_k,
  143 + knn_image_num_candidates=self.config.query_config.knn_image_num_candidates,
137 144 base_minimum_should_match=self.config.query_config.base_minimum_should_match,
138 145 translation_minimum_should_match=self.config.query_config.translation_minimum_should_match,
139 146 translation_boost=self.config.query_config.translation_boost,
... ...
tests/test_es_query_builder.py
... ... @@ -119,9 +119,12 @@ def test_text_query_skips_duplicate_translation_same_as_base():
119 119 enable_knn=False,
120 120 )
121 121  
122   - root = _recall_root(q)
123   - assert root["bool"]["_name"] == "base_query"
124   - assert [clause["multi_match"]["type"] for clause in root["bool"]["should"]] == ["best_fields", "phrase"]
  122 + query_root = q["query"]
  123 + if "function_score" in query_root:
  124 + query_root = query_root["function_score"]["query"]
  125 + base_bool = query_root["bool"]
  126 + assert base_bool["_name"] == "base_query"
  127 + assert [clause["multi_match"]["type"] for clause in base_bool["should"]] == ["best_fields", "phrase"]
125 128  
126 129  
127 130 def test_product_title_exclusion_filter_is_applied_once_on_outer_query():
... ...
tests/test_es_query_builder_text_recall_languages.py
... ... @@ -351,7 +351,10 @@ def test_text_clauses_present_alongside_knn():
351 351 parsed_query=parsed,
352 352 enable_knn=True,
353 353 )
354   - assert "knn" in q
  354 + qr = q["query"]
  355 + if "function_score" in qr:
  356 + qr = qr["function_score"]["query"]
  357 + assert any("knn" in c for c in qr["bool"]["should"])
355 358 idx = _clauses_index(q)
356 359 assert set(idx) == {"base_query", "base_query_trans_zh"}
357 360  
... ...