Commit ed13851cd0575d1033bbf5b8e66840031a869660
1 parent
1681a135
图片文本两个knn召回相关参数配置
Showing
8 changed files
with
88 additions
and
41 deletions
Show diff stats
config/config.yaml
| ... | ... | @@ -206,8 +206,18 @@ query_config: |
| 206 | 206 | - specifications |
| 207 | 207 | - skus |
| 208 | 208 | |
| 209 | - # KNN boost配置(向量召回的boost值) | |
| 210 | - knn_boost: 2.0 # Lower boost for embedding recall | |
| 209 | + # KNN:文本向量与多模态(图片)向量各自 boost 与召回(k / num_candidates) | |
| 210 | + knn_text_boost: 20 | |
| 211 | + knn_image_boost: 20 | |
| 212 | + | |
| 213 | + knn_text_k: 150 | |
| 214 | + knn_text_num_candidates: 400 | |
| 215 | + | |
| 216 | + knn_text_k_long: 300 | |
| 217 | + knn_text_num_candidates_long: 720 | |
| 218 | + | |
| 219 | + knn_image_k: 300 | |
| 220 | + knn_image_num_candidates: 720 | |
| 211 | 221 | |
| 212 | 222 | # Function Score配置(ES层打分规则) |
| 213 | 223 | function_score: | ... | ... |
config/loader.py
| ... | ... | @@ -376,7 +376,20 @@ class AppConfigLoader: |
| 376 | 376 | text_embedding_field=query_cfg.get("text_embedding_field"), |
| 377 | 377 | image_embedding_field=query_cfg.get("image_embedding_field"), |
| 378 | 378 | source_fields=query_cfg.get("source_fields"), |
| 379 | - knn_boost=float(query_cfg.get("knn_boost", 0.25)), | |
| 379 | + knn_text_boost=float( | |
| 380 | + query_cfg.get("knn_text_boost", query_cfg.get("knn_boost", 0.25)) | |
| 381 | + ), | |
| 382 | + knn_image_boost=float( | |
| 383 | + query_cfg.get("knn_image_boost", query_cfg.get("knn_boost", 0.25)) | |
| 384 | + ), | |
| 385 | + knn_text_k=int(query_cfg.get("knn_text_k", 120)), | |
| 386 | + knn_text_num_candidates=int(query_cfg.get("knn_text_num_candidates", 400)), | |
| 387 | + knn_text_k_long=int(query_cfg.get("knn_text_k_long", 160)), | |
| 388 | + knn_text_num_candidates_long=int( | |
| 389 | + query_cfg.get("knn_text_num_candidates_long", 500) | |
| 390 | + ), | |
| 391 | + knn_image_k=int(query_cfg.get("knn_image_k", 120)), | |
| 392 | + knn_image_num_candidates=int(query_cfg.get("knn_image_num_candidates", 400)), | |
| 380 | 393 | multilingual_fields=list( |
| 381 | 394 | search_fields.get( |
| 382 | 395 | "multilingual_fields", | ... | ... |
config/schema.py
| ... | ... | @@ -34,7 +34,15 @@ class QueryConfig: |
| 34 | 34 | text_embedding_field: Optional[str] = "title_embedding" |
| 35 | 35 | image_embedding_field: Optional[str] = None |
| 36 | 36 | source_fields: Optional[List[str]] = None |
| 37 | - knn_boost: float = 0.25 | |
| 37 | + # 文本向量 KNN 与多模态(图片)向量 KNN 各自 boost;未在 YAML 中写时由 loader 用 legacy knn_boost 回填 | |
| 38 | + knn_text_boost: float = 20.0 | |
| 39 | + knn_image_boost: float = 20.0 | |
| 40 | + knn_text_k: int = 120 | |
| 41 | + knn_text_num_candidates: int = 400 | |
| 42 | + knn_text_k_long: int = 160 | |
| 43 | + knn_text_num_candidates_long: int = 500 | |
| 44 | + knn_image_k: int = 120 | |
| 45 | + knn_image_num_candidates: int = 400 | |
| 38 | 46 | multilingual_fields: List[str] = field( |
| 39 | 47 | default_factory=lambda: [] |
| 40 | 48 | ) | ... | ... |
docs/常用查询 - sql.sql
| ... | ... | @@ -584,18 +584,14 @@ SELECT 'shoplazza_product_index_increment', COUNT(*) FROM shoplazza_product_inde |
| 584 | 584 | " |
| 585 | 585 | |
| 586 | 586 | # 执行删除 |
| 587 | -cd /data/saas-search && MYSQL_PWD='qY8tgodLoA&KT#yQ' mysql -h 10.200.16.14 -P 3316 -u root saas -N -e " | |
| 588 | -SELECT 'shoplazza_sync_log', COUNT(*) FROM shoplazza_sync_log WHERE tenant_id = 163 | |
| 589 | -UNION ALL | |
| 590 | -SELECT 'shoplazza_product_index_increment', COUNT(*) FROM shoplazza_product_index_increment WHERE tenant_id = 163; | |
| 587 | +cd /data/saas-search && MYSQL_PWD='qY8tgodLoA&KT#yQ' mysql -h 10.200.16.14 -P 3316 -u root saas -e " | |
| 588 | +SET SESSION sql_safe_updates = 0; | |
| 589 | +DELETE FROM shoplazza_sync_log WHERE tenant_id = 163; | |
| 590 | +SELECT ROW_COUNT() AS deleted_sync_log; | |
| 591 | +DELETE FROM shoplazza_product_index_increment WHERE tenant_id = 163; | |
| 592 | +SELECT ROW_COUNT() AS deleted_index_increment; | |
| 591 | 593 | " |
| 592 | 594 | |
| 593 | -# 再次统计 tenant_id=163 的行数 | |
| 594 | -MYSQL_PWD='qY8tgodLoA&KT#yQ' mysql -h 10.200.16.14 -P 3316 -u root saas -N -e " | |
| 595 | -SELECT 'shoplazza_sync_log', COUNT(*) FROM shoplazza_sync_log WHERE tenant_id = 163 | |
| 596 | -UNION ALL | |
| 597 | -SELECT 'shoplazza_product_index_increment', COUNT(*) FROM shoplazza_product_index_increment WHERE tenant_id = 163; | |
| 598 | -" | |
| 599 | 595 | ``` |
| 600 | 596 | |
| 601 | 597 | 然后触发重新安装: | ... | ... |
search/es_query_builder.py
| ... | ... | @@ -29,7 +29,14 @@ class ESQueryBuilder: |
| 29 | 29 | source_fields: Optional[List[str]] = None, |
| 30 | 30 | function_score_config: Optional[FunctionScoreConfig] = None, |
| 31 | 31 | default_language: str = "en", |
| 32 | - knn_boost: float = 0.25, | |
| 32 | + knn_text_boost: float = 20.0, | |
| 33 | + knn_image_boost: float = 20.0, | |
| 34 | + knn_text_k: int = 120, | |
| 35 | + knn_text_num_candidates: int = 400, | |
| 36 | + knn_text_k_long: int = 160, | |
| 37 | + knn_text_num_candidates_long: int = 500, | |
| 38 | + knn_image_k: int = 120, | |
| 39 | + knn_image_num_candidates: int = 400, | |
| 33 | 40 | base_minimum_should_match: str = "70%", |
| 34 | 41 | translation_minimum_should_match: str = "70%", |
| 35 | 42 | translation_boost: float = 0.4, |
| ... | ... | @@ -55,7 +62,8 @@ class ESQueryBuilder: |
| 55 | 62 | source_fields: Fields to return in search results (_source includes) |
| 56 | 63 | function_score_config: Function score configuration |
| 57 | 64 | default_language: Default language to use when detection fails or returns "unknown" |
| 58 | - knn_boost: Boost value for KNN (embedding recall) | |
| | 65 | + knn_text_boost: Boost for text-embedding KNN clause (multiplied by 1.4 when the parsed query has >= 5 tokens) | |
| 66 | + knn_image_boost: Boost for image-embedding KNN clause | |
| 59 | 67 | """ |
| 60 | 68 | self.match_fields = match_fields |
| 61 | 69 | self.field_boosts = field_boosts or {} |
| ... | ... | @@ -67,7 +75,14 @@ class ESQueryBuilder: |
| 67 | 75 | self.source_fields = source_fields |
| 68 | 76 | self.function_score_config = function_score_config |
| 69 | 77 | self.default_language = default_language |
| 70 | - self.knn_boost = knn_boost | |
| 78 | + self.knn_text_boost = float(knn_text_boost) | |
| 79 | + self.knn_image_boost = float(knn_image_boost) | |
| 80 | + self.knn_text_k = int(knn_text_k) | |
| 81 | + self.knn_text_num_candidates = int(knn_text_num_candidates) | |
| 82 | + self.knn_text_k_long = int(knn_text_k_long) | |
| 83 | + self.knn_text_num_candidates_long = int(knn_text_num_candidates_long) | |
| 84 | + self.knn_image_k = int(knn_image_k) | |
| 85 | + self.knn_image_num_candidates = int(knn_image_num_candidates) | |
| 71 | 86 | self.base_minimum_should_match = base_minimum_should_match |
| 72 | 87 | self.translation_minimum_should_match = translation_minimum_should_match |
| 73 | 88 | self.translation_boost = float(translation_boost) |
| ... | ... | @@ -171,8 +186,6 @@ class ESQueryBuilder: |
| 171 | 186 | size: int = 10, |
| 172 | 187 | from_: int = 0, |
| 173 | 188 | enable_knn: bool = True, |
| 174 | - knn_k: int = 50, | |
| 175 | - knn_num_candidates: int = 200, | |
| 176 | 189 | min_score: Optional[float] = None, |
| 177 | 190 | parsed_query: Optional[Any] = None, |
| 178 | 191 | ) -> Dict[str, Any]: |
| ... | ... | @@ -195,8 +208,6 @@ class ESQueryBuilder: |
| 195 | 208 | size: Number of results |
| 196 | 209 | from_: Offset for pagination |
| 197 | 210 | enable_knn: Whether to use KNN search |
| 198 | - knn_k: K value for KNN | |
| 199 | - knn_num_candidates: Number of candidates for KNN | |
| 200 | 211 | min_score: Minimum score threshold |
| 201 | 212 | |
| 202 | 213 | Returns: |
| ... | ... | @@ -234,41 +245,37 @@ class ESQueryBuilder: |
| 234 | 245 | filter_clauses.append(product_title_exclusion_filter) |
| 235 | 246 | |
| 236 | 247 | # 3. Add KNN search clauses alongside lexical clauses under the same bool.should |
| 237 | - # Adjust KNN k, num_candidates, boost by query_tokens (short query: less KNN; long: more) | |
| 238 | - final_knn_k, final_knn_num_candidates = knn_k, knn_num_candidates | |
| | 248 | + # Text KNN: k / num_candidates come from config; queries with >= 5 tokens use the *_long values and a 1.4x boost | |
| 239 | 249 | if has_embedding: |
| 240 | - knn_boost = self.knn_boost | |
| 250 | + text_knn_boost = self.knn_text_boost | |
| 251 | + final_knn_k = self.knn_text_k | |
| 252 | + final_knn_num_candidates = self.knn_text_num_candidates | |
| 241 | 253 | if parsed_query: |
| 242 | 254 | query_tokens = getattr(parsed_query, 'query_tokens', None) or [] |
| 243 | 255 | token_count = len(query_tokens) |
| 244 | 256 | if token_count >= 5: |
| 245 | - final_knn_k, final_knn_num_candidates = 160, 500 | |
| 246 | - knn_boost = self.knn_boost * 1.4 # Higher weight for long queries | |
| 247 | - else: | |
| 248 | - final_knn_k, final_knn_num_candidates = 120, 400 | |
| 249 | - else: | |
| 250 | - final_knn_k, final_knn_num_candidates = 120, 400 | |
| 257 | + final_knn_k = self.knn_text_k_long | |
| 258 | + final_knn_num_candidates = self.knn_text_num_candidates_long | |
| 259 | + text_knn_boost = self.knn_text_boost * 1.4 | |
| 251 | 260 | recall_clauses.append({ |
| 252 | 261 | "knn": { |
| 253 | 262 | "field": self.text_embedding_field, |
| 254 | 263 | "query_vector": query_vector.tolist(), |
| 255 | 264 | "k": final_knn_k, |
| 256 | 265 | "num_candidates": final_knn_num_candidates, |
| 257 | - "boost": knn_boost, | |
| 266 | + "boost": text_knn_boost, | |
| 258 | 267 | "_name": "knn_query", |
| 259 | 268 | } |
| 260 | 269 | }) |
| 261 | 270 | |
| 262 | 271 | if has_image_embedding: |
| 263 | - image_knn_k = max(final_knn_k, 120) | |
| 264 | - image_knn_num_candidates = max(final_knn_num_candidates, 400) | |
| 265 | 272 | recall_clauses.append({ |
| 266 | 273 | "knn": { |
| 267 | 274 | "field": self.image_embedding_field, |
| 268 | 275 | "query_vector": image_query_vector.tolist(), |
| 269 | - "k": image_knn_k, | |
| 270 | - "num_candidates": image_knn_num_candidates, | |
| 271 | - "boost": self.knn_boost, | |
| 276 | + "k": self.knn_image_k, | |
| 277 | + "num_candidates": self.knn_image_num_candidates, | |
| 278 | + "boost": self.knn_image_boost, | |
| 272 | 279 | "_name": "image_knn_query", |
| 273 | 280 | } |
| 274 | 281 | }) | ... | ... |
search/searcher.py
| ... | ... | @@ -133,7 +133,14 @@ class Searcher: |
| 133 | 133 | source_fields=self.source_fields, |
| 134 | 134 | function_score_config=self.config.function_score, |
| 135 | 135 | default_language=self.config.query_config.default_language, |
| 136 | - knn_boost=self.config.query_config.knn_boost, | |
| 136 | + knn_text_boost=self.config.query_config.knn_text_boost, | |
| 137 | + knn_image_boost=self.config.query_config.knn_image_boost, | |
| 138 | + knn_text_k=self.config.query_config.knn_text_k, | |
| 139 | + knn_text_num_candidates=self.config.query_config.knn_text_num_candidates, | |
| 140 | + knn_text_k_long=self.config.query_config.knn_text_k_long, | |
| 141 | + knn_text_num_candidates_long=self.config.query_config.knn_text_num_candidates_long, | |
| 142 | + knn_image_k=self.config.query_config.knn_image_k, | |
| 143 | + knn_image_num_candidates=self.config.query_config.knn_image_num_candidates, | |
| 137 | 144 | base_minimum_should_match=self.config.query_config.base_minimum_should_match, |
| 138 | 145 | translation_minimum_should_match=self.config.query_config.translation_minimum_should_match, |
| 139 | 146 | translation_boost=self.config.query_config.translation_boost, | ... | ... |
tests/test_es_query_builder.py
| ... | ... | @@ -119,9 +119,12 @@ def test_text_query_skips_duplicate_translation_same_as_base(): |
| 119 | 119 | enable_knn=False, |
| 120 | 120 | ) |
| 121 | 121 | |
| 122 | - root = _recall_root(q) | |
| 123 | - assert root["bool"]["_name"] == "base_query" | |
| 124 | - assert [clause["multi_match"]["type"] for clause in root["bool"]["should"]] == ["best_fields", "phrase"] | |
| 122 | + query_root = q["query"] | |
| 123 | + if "function_score" in query_root: | |
| 124 | + query_root = query_root["function_score"]["query"] | |
| 125 | + base_bool = query_root["bool"] | |
| 126 | + assert base_bool["_name"] == "base_query" | |
| 127 | + assert [clause["multi_match"]["type"] for clause in base_bool["should"]] == ["best_fields", "phrase"] | |
| 125 | 128 | |
| 126 | 129 | |
| 127 | 130 | def test_product_title_exclusion_filter_is_applied_once_on_outer_query(): | ... | ... |
tests/test_es_query_builder_text_recall_languages.py
| ... | ... | @@ -351,7 +351,10 @@ def test_text_clauses_present_alongside_knn(): |
| 351 | 351 | parsed_query=parsed, |
| 352 | 352 | enable_knn=True, |
| 353 | 353 | ) |
| 354 | - assert "knn" in q | |
| 354 | + qr = q["query"] | |
| 355 | + if "function_score" in qr: | |
| 356 | + qr = qr["function_score"]["query"] | |
| 357 | + assert any("knn" in c for c in qr["bool"]["should"]) | |
| 355 | 358 | idx = _clauses_index(q) |
| 356 | 359 | assert set(idx) == {"base_query", "base_query_trans_zh"} |
| 357 | 360 | ... | ... |