Commit ed13851cd0575d1033bbf5b8e66840031a869660
1 parent
1681a135
图片文本两个knn召回相关参数配置
Showing
8 changed files
with
88 additions
and
41 deletions
Show diff stats
config/config.yaml
| @@ -206,8 +206,18 @@ query_config: | @@ -206,8 +206,18 @@ query_config: | ||
| 206 | - specifications | 206 | - specifications |
| 207 | - skus | 207 | - skus |
| 208 | 208 | ||
| 209 | - # KNN boost配置(向量召回的boost值) | ||
| 210 | - knn_boost: 2.0 # Lower boost for embedding recall | 209 | + # KNN:文本向量与多模态(图片)向量各自 boost 与召回(k / num_candidates) |
| 210 | + knn_text_boost: 20 | ||
| 211 | + knn_image_boost: 20 | ||
| 212 | + | ||
| 213 | + knn_text_k: 150 | ||
| 214 | + knn_text_num_candidates: 400 | ||
| 215 | + | ||
| 216 | + knn_text_k_long: 300 | ||
| 217 | + knn_text_num_candidates_long: 720 | ||
| 218 | + | ||
| 219 | + knn_image_k: 300 | ||
| 220 | + knn_image_num_candidates: 720 | ||
| 211 | 221 | ||
| 212 | # Function Score配置(ES层打分规则) | 222 | # Function Score配置(ES层打分规则) |
| 213 | function_score: | 223 | function_score: |
config/loader.py
| @@ -376,7 +376,20 @@ class AppConfigLoader: | @@ -376,7 +376,20 @@ class AppConfigLoader: | ||
| 376 | text_embedding_field=query_cfg.get("text_embedding_field"), | 376 | text_embedding_field=query_cfg.get("text_embedding_field"), |
| 377 | image_embedding_field=query_cfg.get("image_embedding_field"), | 377 | image_embedding_field=query_cfg.get("image_embedding_field"), |
| 378 | source_fields=query_cfg.get("source_fields"), | 378 | source_fields=query_cfg.get("source_fields"), |
| 379 | - knn_boost=float(query_cfg.get("knn_boost", 0.25)), | 379 | + knn_text_boost=float( |
| 380 | + query_cfg.get("knn_text_boost", query_cfg.get("knn_boost", 0.25)) | ||
| 381 | + ), | ||
| 382 | + knn_image_boost=float( | ||
| 383 | + query_cfg.get("knn_image_boost", query_cfg.get("knn_boost", 0.25)) | ||
| 384 | + ), | ||
| 385 | + knn_text_k=int(query_cfg.get("knn_text_k", 120)), | ||
| 386 | + knn_text_num_candidates=int(query_cfg.get("knn_text_num_candidates", 400)), | ||
| 387 | + knn_text_k_long=int(query_cfg.get("knn_text_k_long", 160)), | ||
| 388 | + knn_text_num_candidates_long=int( | ||
| 389 | + query_cfg.get("knn_text_num_candidates_long", 500) | ||
| 390 | + ), | ||
| 391 | + knn_image_k=int(query_cfg.get("knn_image_k", 120)), | ||
| 392 | + knn_image_num_candidates=int(query_cfg.get("knn_image_num_candidates", 400)), | ||
| 380 | multilingual_fields=list( | 393 | multilingual_fields=list( |
| 381 | search_fields.get( | 394 | search_fields.get( |
| 382 | "multilingual_fields", | 395 | "multilingual_fields", |
config/schema.py
| @@ -34,7 +34,15 @@ class QueryConfig: | @@ -34,7 +34,15 @@ class QueryConfig: | ||
| 34 | text_embedding_field: Optional[str] = "title_embedding" | 34 | text_embedding_field: Optional[str] = "title_embedding" |
| 35 | image_embedding_field: Optional[str] = None | 35 | image_embedding_field: Optional[str] = None |
| 36 | source_fields: Optional[List[str]] = None | 36 | source_fields: Optional[List[str]] = None |
| 37 | - knn_boost: float = 0.25 | 37 | + # 文本向量 KNN 与多模态(图片)向量 KNN 各自 boost;未在 YAML 中写时由 loader 用 legacy knn_boost 回填 |
| 38 | + knn_text_boost: float = 20.0 | ||
| 39 | + knn_image_boost: float = 20.0 | ||
| 40 | + knn_text_k: int = 120 | ||
| 41 | + knn_text_num_candidates: int = 400 | ||
| 42 | + knn_text_k_long: int = 160 | ||
| 43 | + knn_text_num_candidates_long: int = 500 | ||
| 44 | + knn_image_k: int = 120 | ||
| 45 | + knn_image_num_candidates: int = 400 | ||
| 38 | multilingual_fields: List[str] = field( | 46 | multilingual_fields: List[str] = field( |
| 39 | default_factory=lambda: [] | 47 | default_factory=lambda: [] |
| 40 | ) | 48 | ) |
docs/常用查询 - sql.sql
| @@ -584,18 +584,14 @@ SELECT 'shoplazza_product_index_increment', COUNT(*) FROM shoplazza_product_inde | @@ -584,18 +584,14 @@ SELECT 'shoplazza_product_index_increment', COUNT(*) FROM shoplazza_product_inde | ||
| 584 | " | 584 | " |
| 585 | 585 | ||
| 586 | # 执行删除 | 586 | # 执行删除 |
| 587 | -cd /data/saas-search && MYSQL_PWD='qY8tgodLoA&KT#yQ' mysql -h 10.200.16.14 -P 3316 -u root saas -N -e " | ||
| 588 | -SELECT 'shoplazza_sync_log', COUNT(*) FROM shoplazza_sync_log WHERE tenant_id = 163 | ||
| 589 | -UNION ALL | ||
| 590 | -SELECT 'shoplazza_product_index_increment', COUNT(*) FROM shoplazza_product_index_increment WHERE tenant_id = 163; | 587 | +cd /data/saas-search && MYSQL_PWD='qY8tgodLoA&KT#yQ' mysql -h 10.200.16.14 -P 3316 -u root saas -e " |
| 588 | +SET SESSION sql_safe_updates = 0; | ||
| 589 | +DELETE FROM shoplazza_sync_log WHERE tenant_id = 163; | ||
| 590 | +SELECT ROW_COUNT() AS deleted_sync_log; | ||
| 591 | +DELETE FROM shoplazza_product_index_increment WHERE tenant_id = 163; | ||
| 592 | +SELECT ROW_COUNT() AS deleted_index_increment; | ||
| 591 | " | 593 | " |
| 592 | 594 | ||
| 593 | -# 再次统计 tenant_id=163 的行数 | ||
| 594 | -MYSQL_PWD='qY8tgodLoA&KT#yQ' mysql -h 10.200.16.14 -P 3316 -u root saas -N -e " | ||
| 595 | -SELECT 'shoplazza_sync_log', COUNT(*) FROM shoplazza_sync_log WHERE tenant_id = 163 | ||
| 596 | -UNION ALL | ||
| 597 | -SELECT 'shoplazza_product_index_increment', COUNT(*) FROM shoplazza_product_index_increment WHERE tenant_id = 163; | ||
| 598 | -" | ||
| 599 | ``` | 595 | ``` |
| 600 | 596 | ||
| 601 | 然后触发重新安装: | 597 | 然后触发重新安装: |
search/es_query_builder.py
| @@ -29,7 +29,14 @@ class ESQueryBuilder: | @@ -29,7 +29,14 @@ class ESQueryBuilder: | ||
| 29 | source_fields: Optional[List[str]] = None, | 29 | source_fields: Optional[List[str]] = None, |
| 30 | function_score_config: Optional[FunctionScoreConfig] = None, | 30 | function_score_config: Optional[FunctionScoreConfig] = None, |
| 31 | default_language: str = "en", | 31 | default_language: str = "en", |
| 32 | - knn_boost: float = 0.25, | 32 | + knn_text_boost: float = 20.0, |
| 33 | + knn_image_boost: float = 20.0, | ||
| 34 | + knn_text_k: int = 120, | ||
| 35 | + knn_text_num_candidates: int = 400, | ||
| 36 | + knn_text_k_long: int = 160, | ||
| 37 | + knn_text_num_candidates_long: int = 500, | ||
| 38 | + knn_image_k: int = 120, | ||
| 39 | + knn_image_num_candidates: int = 400, | ||
| 33 | base_minimum_should_match: str = "70%", | 40 | base_minimum_should_match: str = "70%", |
| 34 | translation_minimum_should_match: str = "70%", | 41 | translation_minimum_should_match: str = "70%", |
| 35 | translation_boost: float = 0.4, | 42 | translation_boost: float = 0.4, |
| @@ -55,7 +62,8 @@ class ESQueryBuilder: | @@ -55,7 +62,8 @@ class ESQueryBuilder: | ||
| 55 | source_fields: Fields to return in search results (_source includes) | 62 | source_fields: Fields to return in search results (_source includes) |
| 56 | function_score_config: Function score configuration | 63 | function_score_config: Function score configuration |
| 57 | default_language: Default language to use when detection fails or returns "unknown" | 64 | default_language: Default language to use when detection fails or returns "unknown" |
| 58 | - knn_boost: Boost value for KNN (embedding recall) | 65 | + knn_text_boost: Boost for text-embedding KNN clause |
| 66 | + knn_image_boost: Boost for image-embedding KNN clause | ||
| 59 | """ | 67 | """ |
| 60 | self.match_fields = match_fields | 68 | self.match_fields = match_fields |
| 61 | self.field_boosts = field_boosts or {} | 69 | self.field_boosts = field_boosts or {} |
| @@ -67,7 +75,14 @@ class ESQueryBuilder: | @@ -67,7 +75,14 @@ class ESQueryBuilder: | ||
| 67 | self.source_fields = source_fields | 75 | self.source_fields = source_fields |
| 68 | self.function_score_config = function_score_config | 76 | self.function_score_config = function_score_config |
| 69 | self.default_language = default_language | 77 | self.default_language = default_language |
| 70 | - self.knn_boost = knn_boost | 78 | + self.knn_text_boost = float(knn_text_boost) |
| 79 | + self.knn_image_boost = float(knn_image_boost) | ||
| 80 | + self.knn_text_k = int(knn_text_k) | ||
| 81 | + self.knn_text_num_candidates = int(knn_text_num_candidates) | ||
| 82 | + self.knn_text_k_long = int(knn_text_k_long) | ||
| 83 | + self.knn_text_num_candidates_long = int(knn_text_num_candidates_long) | ||
| 84 | + self.knn_image_k = int(knn_image_k) | ||
| 85 | + self.knn_image_num_candidates = int(knn_image_num_candidates) | ||
| 71 | self.base_minimum_should_match = base_minimum_should_match | 86 | self.base_minimum_should_match = base_minimum_should_match |
| 72 | self.translation_minimum_should_match = translation_minimum_should_match | 87 | self.translation_minimum_should_match = translation_minimum_should_match |
| 73 | self.translation_boost = float(translation_boost) | 88 | self.translation_boost = float(translation_boost) |
| @@ -171,8 +186,6 @@ class ESQueryBuilder: | @@ -171,8 +186,6 @@ class ESQueryBuilder: | ||
| 171 | size: int = 10, | 186 | size: int = 10, |
| 172 | from_: int = 0, | 187 | from_: int = 0, |
| 173 | enable_knn: bool = True, | 188 | enable_knn: bool = True, |
| 174 | - knn_k: int = 50, | ||
| 175 | - knn_num_candidates: int = 200, | ||
| 176 | min_score: Optional[float] = None, | 189 | min_score: Optional[float] = None, |
| 177 | parsed_query: Optional[Any] = None, | 190 | parsed_query: Optional[Any] = None, |
| 178 | ) -> Dict[str, Any]: | 191 | ) -> Dict[str, Any]: |
| @@ -195,8 +208,6 @@ class ESQueryBuilder: | @@ -195,8 +208,6 @@ class ESQueryBuilder: | ||
| 195 | size: Number of results | 208 | size: Number of results |
| 196 | from_: Offset for pagination | 209 | from_: Offset for pagination |
| 197 | enable_knn: Whether to use KNN search | 210 | enable_knn: Whether to use KNN search |
| 198 | - knn_k: K value for KNN | ||
| 199 | - knn_num_candidates: Number of candidates for KNN | ||
| 200 | min_score: Minimum score threshold | 211 | min_score: Minimum score threshold |
| 201 | 212 | ||
| 202 | Returns: | 213 | Returns: |
| @@ -234,41 +245,37 @@ class ESQueryBuilder: | @@ -234,41 +245,37 @@ class ESQueryBuilder: | ||
| 234 | filter_clauses.append(product_title_exclusion_filter) | 245 | filter_clauses.append(product_title_exclusion_filter) |
| 235 | 246 | ||
| 236 | # 3. Add KNN search clauses alongside lexical clauses under the same bool.should | 247 | # 3. Add KNN search clauses alongside lexical clauses under the same bool.should |
| 237 | - # Adjust KNN k, num_candidates, boost by query_tokens (short query: less KNN; long: more) | ||
| 238 | - final_knn_k, final_knn_num_candidates = knn_k, knn_num_candidates | 248 | + # Text KNN: k / num_candidates from config; long queries (>= 5 tokens) use the *_long values and a 1.4x boost |
| 239 | if has_embedding: | 249 | if has_embedding: |
| 240 | - knn_boost = self.knn_boost | 250 | + text_knn_boost = self.knn_text_boost |
| 251 | + final_knn_k = self.knn_text_k | ||
| 252 | + final_knn_num_candidates = self.knn_text_num_candidates | ||
| 241 | if parsed_query: | 253 | if parsed_query: |
| 242 | query_tokens = getattr(parsed_query, 'query_tokens', None) or [] | 254 | query_tokens = getattr(parsed_query, 'query_tokens', None) or [] |
| 243 | token_count = len(query_tokens) | 255 | token_count = len(query_tokens) |
| 244 | if token_count >= 5: | 256 | if token_count >= 5: |
| 245 | - final_knn_k, final_knn_num_candidates = 160, 500 | ||
| 246 | - knn_boost = self.knn_boost * 1.4 # Higher weight for long queries | ||
| 247 | - else: | ||
| 248 | - final_knn_k, final_knn_num_candidates = 120, 400 | ||
| 249 | - else: | ||
| 250 | - final_knn_k, final_knn_num_candidates = 120, 400 | 257 | + final_knn_k = self.knn_text_k_long |
| 258 | + final_knn_num_candidates = self.knn_text_num_candidates_long | ||
| 259 | + text_knn_boost = self.knn_text_boost * 1.4 | ||
| 251 | recall_clauses.append({ | 260 | recall_clauses.append({ |
| 252 | "knn": { | 261 | "knn": { |
| 253 | "field": self.text_embedding_field, | 262 | "field": self.text_embedding_field, |
| 254 | "query_vector": query_vector.tolist(), | 263 | "query_vector": query_vector.tolist(), |
| 255 | "k": final_knn_k, | 264 | "k": final_knn_k, |
| 256 | "num_candidates": final_knn_num_candidates, | 265 | "num_candidates": final_knn_num_candidates, |
| 257 | - "boost": knn_boost, | 266 | + "boost": text_knn_boost, |
| 258 | "_name": "knn_query", | 267 | "_name": "knn_query", |
| 259 | } | 268 | } |
| 260 | }) | 269 | }) |
| 261 | 270 | ||
| 262 | if has_image_embedding: | 271 | if has_image_embedding: |
| 263 | - image_knn_k = max(final_knn_k, 120) | ||
| 264 | - image_knn_num_candidates = max(final_knn_num_candidates, 400) | ||
| 265 | recall_clauses.append({ | 272 | recall_clauses.append({ |
| 266 | "knn": { | 273 | "knn": { |
| 267 | "field": self.image_embedding_field, | 274 | "field": self.image_embedding_field, |
| 268 | "query_vector": image_query_vector.tolist(), | 275 | "query_vector": image_query_vector.tolist(), |
| 269 | - "k": image_knn_k, | ||
| 270 | - "num_candidates": image_knn_num_candidates, | ||
| 271 | - "boost": self.knn_boost, | 276 | + "k": self.knn_image_k, |
| 277 | + "num_candidates": self.knn_image_num_candidates, | ||
| 278 | + "boost": self.knn_image_boost, | ||
| 272 | "_name": "image_knn_query", | 279 | "_name": "image_knn_query", |
| 273 | } | 280 | } |
| 274 | }) | 281 | }) |
search/searcher.py
| @@ -133,7 +133,14 @@ class Searcher: | @@ -133,7 +133,14 @@ class Searcher: | ||
| 133 | source_fields=self.source_fields, | 133 | source_fields=self.source_fields, |
| 134 | function_score_config=self.config.function_score, | 134 | function_score_config=self.config.function_score, |
| 135 | default_language=self.config.query_config.default_language, | 135 | default_language=self.config.query_config.default_language, |
| 136 | - knn_boost=self.config.query_config.knn_boost, | 136 | + knn_text_boost=self.config.query_config.knn_text_boost, |
| 137 | + knn_image_boost=self.config.query_config.knn_image_boost, | ||
| 138 | + knn_text_k=self.config.query_config.knn_text_k, | ||
| 139 | + knn_text_num_candidates=self.config.query_config.knn_text_num_candidates, | ||
| 140 | + knn_text_k_long=self.config.query_config.knn_text_k_long, | ||
| 141 | + knn_text_num_candidates_long=self.config.query_config.knn_text_num_candidates_long, | ||
| 142 | + knn_image_k=self.config.query_config.knn_image_k, | ||
| 143 | + knn_image_num_candidates=self.config.query_config.knn_image_num_candidates, | ||
| 137 | base_minimum_should_match=self.config.query_config.base_minimum_should_match, | 144 | base_minimum_should_match=self.config.query_config.base_minimum_should_match, |
| 138 | translation_minimum_should_match=self.config.query_config.translation_minimum_should_match, | 145 | translation_minimum_should_match=self.config.query_config.translation_minimum_should_match, |
| 139 | translation_boost=self.config.query_config.translation_boost, | 146 | translation_boost=self.config.query_config.translation_boost, |
tests/test_es_query_builder.py
| @@ -119,9 +119,12 @@ def test_text_query_skips_duplicate_translation_same_as_base(): | @@ -119,9 +119,12 @@ def test_text_query_skips_duplicate_translation_same_as_base(): | ||
| 119 | enable_knn=False, | 119 | enable_knn=False, |
| 120 | ) | 120 | ) |
| 121 | 121 | ||
| 122 | - root = _recall_root(q) | ||
| 123 | - assert root["bool"]["_name"] == "base_query" | ||
| 124 | - assert [clause["multi_match"]["type"] for clause in root["bool"]["should"]] == ["best_fields", "phrase"] | 122 | + query_root = q["query"] |
| 123 | + if "function_score" in query_root: | ||
| 124 | + query_root = query_root["function_score"]["query"] | ||
| 125 | + base_bool = query_root["bool"] | ||
| 126 | + assert base_bool["_name"] == "base_query" | ||
| 127 | + assert [clause["multi_match"]["type"] for clause in base_bool["should"]] == ["best_fields", "phrase"] | ||
| 125 | 128 | ||
| 126 | 129 | ||
| 127 | def test_product_title_exclusion_filter_is_applied_once_on_outer_query(): | 130 | def test_product_title_exclusion_filter_is_applied_once_on_outer_query(): |
tests/test_es_query_builder_text_recall_languages.py
| @@ -351,7 +351,10 @@ def test_text_clauses_present_alongside_knn(): | @@ -351,7 +351,10 @@ def test_text_clauses_present_alongside_knn(): | ||
| 351 | parsed_query=parsed, | 351 | parsed_query=parsed, |
| 352 | enable_knn=True, | 352 | enable_knn=True, |
| 353 | ) | 353 | ) |
| 354 | - assert "knn" in q | 354 | + qr = q["query"] |
| 355 | + if "function_score" in qr: | ||
| 356 | + qr = qr["function_score"]["query"] | ||
| 357 | + assert any("knn" in c for c in qr["bool"]["should"]) | ||
| 355 | idx = _clauses_index(q) | 358 | idx = _clauses_index(q) |
| 356 | assert set(idx) == {"base_query", "base_query_trans_zh"} | 359 | assert set(idx) == {"base_query", "base_query_trans_zh"} |
| 357 | 360 |