Commit ed13851cd0575d1033bbf5b8e66840031a869660

Authored by tangwang
1 parent 1681a135

图片文本两个knn召回相关参数配置

config/config.yaml
@@ -206,8 +206,18 @@ query_config: @@ -206,8 +206,18 @@ query_config:
206 - specifications 206 - specifications
207 - skus 207 - skus
208 208
209 - # KNN boost配置(向量召回的boost值)  
210 - knn_boost: 2.0 # Lower boost for embedding recall 209 + # KNN:文本向量与多模态(图片)向量各自 boost 与召回(k / num_candidates)
  210 + knn_text_boost: 20
  211 + knn_image_boost: 20
  212 +
  213 + knn_text_k: 150
  214 + knn_text_num_candidates: 400
  215 +
  216 + knn_text_k_long: 300
  217 + knn_text_num_candidates_long: 720
  218 +
  219 + knn_image_k: 300
  220 + knn_image_num_candidates: 720
211 221
212 # Function Score配置(ES层打分规则) 222 # Function Score配置(ES层打分规则)
213 function_score: 223 function_score:
@@ -376,7 +376,20 @@ class AppConfigLoader: @@ -376,7 +376,20 @@ class AppConfigLoader:
376 text_embedding_field=query_cfg.get("text_embedding_field"), 376 text_embedding_field=query_cfg.get("text_embedding_field"),
377 image_embedding_field=query_cfg.get("image_embedding_field"), 377 image_embedding_field=query_cfg.get("image_embedding_field"),
378 source_fields=query_cfg.get("source_fields"), 378 source_fields=query_cfg.get("source_fields"),
379 - knn_boost=float(query_cfg.get("knn_boost", 0.25)), 379 + knn_text_boost=float(
  380 + query_cfg.get("knn_text_boost", query_cfg.get("knn_boost", 0.25))
  381 + ),
  382 + knn_image_boost=float(
  383 + query_cfg.get("knn_image_boost", query_cfg.get("knn_boost", 0.25))
  384 + ),
  385 + knn_text_k=int(query_cfg.get("knn_text_k", 120)),
  386 + knn_text_num_candidates=int(query_cfg.get("knn_text_num_candidates", 400)),
  387 + knn_text_k_long=int(query_cfg.get("knn_text_k_long", 160)),
  388 + knn_text_num_candidates_long=int(
  389 + query_cfg.get("knn_text_num_candidates_long", 500)
  390 + ),
  391 + knn_image_k=int(query_cfg.get("knn_image_k", 120)),
  392 + knn_image_num_candidates=int(query_cfg.get("knn_image_num_candidates", 400)),
380 multilingual_fields=list( 393 multilingual_fields=list(
381 search_fields.get( 394 search_fields.get(
382 "multilingual_fields", 395 "multilingual_fields",
@@ -34,7 +34,15 @@ class QueryConfig: @@ -34,7 +34,15 @@ class QueryConfig:
34 text_embedding_field: Optional[str] = "title_embedding" 34 text_embedding_field: Optional[str] = "title_embedding"
35 image_embedding_field: Optional[str] = None 35 image_embedding_field: Optional[str] = None
36 source_fields: Optional[List[str]] = None 36 source_fields: Optional[List[str]] = None
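37 - knn_boost: float = 0.25 37 + # Separate boosts for the text-embedding KNN and the multimodal (image) embedding KNN; when a key is missing from YAML the loader falls back to the legacy knn_boost and then to 0.25 (not the 20.0 defaults declared here)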
  38 + knn_text_boost: float = 20.0
  39 + knn_image_boost: float = 20.0
  40 + knn_text_k: int = 120
  41 + knn_text_num_candidates: int = 400
  42 + knn_text_k_long: int = 160
  43 + knn_text_num_candidates_long: int = 500
  44 + knn_image_k: int = 120
  45 + knn_image_num_candidates: int = 400
38 multilingual_fields: List[str] = field( 46 multilingual_fields: List[str] = field(
39 default_factory=lambda: [] 47 default_factory=lambda: []
40 ) 48 )
docs/常用查询 - sql.sql
@@ -584,18 +584,14 @@ SELECT 'shoplazza_product_index_increment', COUNT(*) FROM shoplazza_product_inde @@ -584,18 +584,14 @@ SELECT 'shoplazza_product_index_increment', COUNT(*) FROM shoplazza_product_inde
584 " 584 "
585 585
586 # 执行删除 586 # 执行删除
587 -cd /data/saas-search && MYSQL_PWD='qY8tgodLoA&KT#yQ' mysql -h 10.200.16.14 -P 3316 -u root saas -N -e "  
588 -SELECT 'shoplazza_sync_log', COUNT(*) FROM shoplazza_sync_log WHERE tenant_id = 163  
589 -UNION ALL  
590 -SELECT 'shoplazza_product_index_increment', COUNT(*) FROM shoplazza_product_index_increment WHERE tenant_id = 163; 587 +cd /data/saas-search && MYSQL_PWD='qY8tgodLoA&KT#yQ' mysql -h 10.200.16.14 -P 3316 -u root saas -e "
  588 +SET SESSION sql_safe_updates = 0;
  589 +DELETE FROM shoplazza_sync_log WHERE tenant_id = 163;
  590 +SELECT ROW_COUNT() AS deleted_sync_log;
  591 +DELETE FROM shoplazza_product_index_increment WHERE tenant_id = 163;
  592 +SELECT ROW_COUNT() AS deleted_index_increment;
591 " 593 "
592 594
593 -# 再次统计 tenant_id=163 的行数  
594 -MYSQL_PWD='qY8tgodLoA&KT#yQ' mysql -h 10.200.16.14 -P 3316 -u root saas -N -e "  
595 -SELECT 'shoplazza_sync_log', COUNT(*) FROM shoplazza_sync_log WHERE tenant_id = 163  
596 -UNION ALL  
597 -SELECT 'shoplazza_product_index_increment', COUNT(*) FROM shoplazza_product_index_increment WHERE tenant_id = 163;  
598 -"  
599 ``` 595 ```
600 596
601 然后触发重新安装: 597 然后触发重新安装:
search/es_query_builder.py
@@ -29,7 +29,14 @@ class ESQueryBuilder: @@ -29,7 +29,14 @@ class ESQueryBuilder:
29 source_fields: Optional[List[str]] = None, 29 source_fields: Optional[List[str]] = None,
30 function_score_config: Optional[FunctionScoreConfig] = None, 30 function_score_config: Optional[FunctionScoreConfig] = None,
31 default_language: str = "en", 31 default_language: str = "en",
32 - knn_boost: float = 0.25, 32 + knn_text_boost: float = 20.0,
  33 + knn_image_boost: float = 20.0,
  34 + knn_text_k: int = 120,
  35 + knn_text_num_candidates: int = 400,
  36 + knn_text_k_long: int = 160,
  37 + knn_text_num_candidates_long: int = 500,
  38 + knn_image_k: int = 120,
  39 + knn_image_num_candidates: int = 400,
33 base_minimum_should_match: str = "70%", 40 base_minimum_should_match: str = "70%",
34 translation_minimum_should_match: str = "70%", 41 translation_minimum_should_match: str = "70%",
35 translation_boost: float = 0.4, 42 translation_boost: float = 0.4,
@@ -55,7 +62,8 @@ class ESQueryBuilder: @@ -55,7 +62,8 @@ class ESQueryBuilder:
55 source_fields: Fields to return in search results (_source includes) 62 source_fields: Fields to return in search results (_source includes)
56 function_score_config: Function score configuration 63 function_score_config: Function score configuration
57 default_language: Default language to use when detection fails or returns "unknown" 64 default_language: Default language to use when detection fails or returns "unknown"
58 - knn_boost: Boost value for KNN (embedding recall) 65 + knn_text_boost: Boost for text-embedding KNN clause
  66 + knn_image_boost: Boost for image-embedding KNN clause
59 """ 67 """
60 self.match_fields = match_fields 68 self.match_fields = match_fields
61 self.field_boosts = field_boosts or {} 69 self.field_boosts = field_boosts or {}
@@ -67,7 +75,14 @@ class ESQueryBuilder: @@ -67,7 +75,14 @@ class ESQueryBuilder:
67 self.source_fields = source_fields 75 self.source_fields = source_fields
68 self.function_score_config = function_score_config 76 self.function_score_config = function_score_config
69 self.default_language = default_language 77 self.default_language = default_language
70 - self.knn_boost = knn_boost 78 + self.knn_text_boost = float(knn_text_boost)
  79 + self.knn_image_boost = float(knn_image_boost)
  80 + self.knn_text_k = int(knn_text_k)
  81 + self.knn_text_num_candidates = int(knn_text_num_candidates)
  82 + self.knn_text_k_long = int(knn_text_k_long)
  83 + self.knn_text_num_candidates_long = int(knn_text_num_candidates_long)
  84 + self.knn_image_k = int(knn_image_k)
  85 + self.knn_image_num_candidates = int(knn_image_num_candidates)
71 self.base_minimum_should_match = base_minimum_should_match 86 self.base_minimum_should_match = base_minimum_should_match
72 self.translation_minimum_should_match = translation_minimum_should_match 87 self.translation_minimum_should_match = translation_minimum_should_match
73 self.translation_boost = float(translation_boost) 88 self.translation_boost = float(translation_boost)
@@ -171,8 +186,6 @@ class ESQueryBuilder: @@ -171,8 +186,6 @@ class ESQueryBuilder:
171 size: int = 10, 186 size: int = 10,
172 from_: int = 0, 187 from_: int = 0,
173 enable_knn: bool = True, 188 enable_knn: bool = True,
174 - knn_k: int = 50,  
175 - knn_num_candidates: int = 200,  
176 min_score: Optional[float] = None, 189 min_score: Optional[float] = None,
177 parsed_query: Optional[Any] = None, 190 parsed_query: Optional[Any] = None,
178 ) -> Dict[str, Any]: 191 ) -> Dict[str, Any]:
@@ -195,8 +208,6 @@ class ESQueryBuilder: @@ -195,8 +208,6 @@ class ESQueryBuilder:
195 size: Number of results 208 size: Number of results
196 from_: Offset for pagination 209 from_: Offset for pagination
197 enable_knn: Whether to use KNN search 210 enable_knn: Whether to use KNN search
198 - knn_k: K value for KNN  
199 - knn_num_candidates: Number of candidates for KNN  
200 min_score: Minimum score threshold 211 min_score: Minimum score threshold
201 212
202 Returns: 213 Returns:
@@ -234,41 +245,37 @@ class ESQueryBuilder: @@ -234,41 +245,37 @@ class ESQueryBuilder:
234 filter_clauses.append(product_title_exclusion_filter) 245 filter_clauses.append(product_title_exclusion_filter)
235 246
236 # 3. Add KNN search clauses alongside lexical clauses under the same bool.should 247 # 3. Add KNN search clauses alongside lexical clauses under the same bool.should
237 - # Adjust KNN k, num_candidates, boost by query_tokens (short query: less KNN; long: more)  
238 - final_knn_k, final_knn_num_candidates = knn_k, knn_num_candidates 248 + # Text KNN: k / num_candidates come from config; when parsed_query has >= 5 query_tokens, use the *_long values and a 1.4x boost
239 if has_embedding: 249 if has_embedding:
240 - knn_boost = self.knn_boost 250 + text_knn_boost = self.knn_text_boost
  251 + final_knn_k = self.knn_text_k
  252 + final_knn_num_candidates = self.knn_text_num_candidates
241 if parsed_query: 253 if parsed_query:
242 query_tokens = getattr(parsed_query, 'query_tokens', None) or [] 254 query_tokens = getattr(parsed_query, 'query_tokens', None) or []
243 token_count = len(query_tokens) 255 token_count = len(query_tokens)
244 if token_count >= 5: 256 if token_count >= 5:
245 - final_knn_k, final_knn_num_candidates = 160, 500  
246 - knn_boost = self.knn_boost * 1.4 # Higher weight for long queries  
247 - else:  
248 - final_knn_k, final_knn_num_candidates = 120, 400  
249 - else:  
250 - final_knn_k, final_knn_num_candidates = 120, 400 257 + final_knn_k = self.knn_text_k_long
  258 + final_knn_num_candidates = self.knn_text_num_candidates_long
  259 + text_knn_boost = self.knn_text_boost * 1.4
251 recall_clauses.append({ 260 recall_clauses.append({
252 "knn": { 261 "knn": {
253 "field": self.text_embedding_field, 262 "field": self.text_embedding_field,
254 "query_vector": query_vector.tolist(), 263 "query_vector": query_vector.tolist(),
255 "k": final_knn_k, 264 "k": final_knn_k,
256 "num_candidates": final_knn_num_candidates, 265 "num_candidates": final_knn_num_candidates,
257 - "boost": knn_boost, 266 + "boost": text_knn_boost,
258 "_name": "knn_query", 267 "_name": "knn_query",
259 } 268 }
260 }) 269 })
261 270
262 if has_image_embedding: 271 if has_image_embedding:
263 - image_knn_k = max(final_knn_k, 120)  
264 - image_knn_num_candidates = max(final_knn_num_candidates, 400)  
265 recall_clauses.append({ 272 recall_clauses.append({
266 "knn": { 273 "knn": {
267 "field": self.image_embedding_field, 274 "field": self.image_embedding_field,
268 "query_vector": image_query_vector.tolist(), 275 "query_vector": image_query_vector.tolist(),
269 - "k": image_knn_k,  
270 - "num_candidates": image_knn_num_candidates,  
271 - "boost": self.knn_boost, 276 + "k": self.knn_image_k,
  277 + "num_candidates": self.knn_image_num_candidates,
  278 + "boost": self.knn_image_boost,
272 "_name": "image_knn_query", 279 "_name": "image_knn_query",
273 } 280 }
274 }) 281 })
search/searcher.py
@@ -133,7 +133,14 @@ class Searcher: @@ -133,7 +133,14 @@ class Searcher:
133 source_fields=self.source_fields, 133 source_fields=self.source_fields,
134 function_score_config=self.config.function_score, 134 function_score_config=self.config.function_score,
135 default_language=self.config.query_config.default_language, 135 default_language=self.config.query_config.default_language,
136 - knn_boost=self.config.query_config.knn_boost, 136 + knn_text_boost=self.config.query_config.knn_text_boost,
  137 + knn_image_boost=self.config.query_config.knn_image_boost,
  138 + knn_text_k=self.config.query_config.knn_text_k,
  139 + knn_text_num_candidates=self.config.query_config.knn_text_num_candidates,
  140 + knn_text_k_long=self.config.query_config.knn_text_k_long,
  141 + knn_text_num_candidates_long=self.config.query_config.knn_text_num_candidates_long,
  142 + knn_image_k=self.config.query_config.knn_image_k,
  143 + knn_image_num_candidates=self.config.query_config.knn_image_num_candidates,
137 base_minimum_should_match=self.config.query_config.base_minimum_should_match, 144 base_minimum_should_match=self.config.query_config.base_minimum_should_match,
138 translation_minimum_should_match=self.config.query_config.translation_minimum_should_match, 145 translation_minimum_should_match=self.config.query_config.translation_minimum_should_match,
139 translation_boost=self.config.query_config.translation_boost, 146 translation_boost=self.config.query_config.translation_boost,
tests/test_es_query_builder.py
@@ -119,9 +119,12 @@ def test_text_query_skips_duplicate_translation_same_as_base(): @@ -119,9 +119,12 @@ def test_text_query_skips_duplicate_translation_same_as_base():
119 enable_knn=False, 119 enable_knn=False,
120 ) 120 )
121 121
122 - root = _recall_root(q)  
123 - assert root["bool"]["_name"] == "base_query"  
124 - assert [clause["multi_match"]["type"] for clause in root["bool"]["should"]] == ["best_fields", "phrase"] 122 + query_root = q["query"]
  123 + if "function_score" in query_root:
  124 + query_root = query_root["function_score"]["query"]
  125 + base_bool = query_root["bool"]
  126 + assert base_bool["_name"] == "base_query"
  127 + assert [clause["multi_match"]["type"] for clause in base_bool["should"]] == ["best_fields", "phrase"]
125 128
126 129
127 def test_product_title_exclusion_filter_is_applied_once_on_outer_query(): 130 def test_product_title_exclusion_filter_is_applied_once_on_outer_query():
tests/test_es_query_builder_text_recall_languages.py
@@ -351,7 +351,10 @@ def test_text_clauses_present_alongside_knn(): @@ -351,7 +351,10 @@ def test_text_clauses_present_alongside_knn():
351 parsed_query=parsed, 351 parsed_query=parsed,
352 enable_knn=True, 352 enable_knn=True,
353 ) 353 )
354 - assert "knn" in q 354 + qr = q["query"]
  355 + if "function_score" in qr:
  356 + qr = qr["function_score"]["query"]
  357 + assert any("knn" in c for c in qr["bool"]["should"])
355 idx = _clauses_index(q) 358 idx = _clauses_index(q)
356 assert set(idx) == {"base_query", "base_query_trans_zh"} 359 assert set(idx) == {"base_query", "base_query_trans_zh"}
357 360