From ed13851cd0575d1033bbf5b8e66840031a869660 Mon Sep 17 00:00:00 2001 From: tangwang Date: Fri, 27 Mar 2026 11:53:45 +0800 Subject: [PATCH] 图片文本两个knn召回相关参数配置 --- config/config.yaml | 14 ++++++++++++-- config/loader.py | 15 ++++++++++++++- config/schema.py | 10 +++++++++- docs/常用查询 - sql.sql | 16 ++++++---------- search/es_query_builder.py | 51 +++++++++++++++++++++++++++++---------------------- search/searcher.py | 9 ++++++++- tests/test_es_query_builder.py | 9 ++++++--- tests/test_es_query_builder_text_recall_languages.py | 5 ++++- 8 files changed, 88 insertions(+), 41 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index ba46bcf..ca42b90 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -206,8 +206,18 @@ query_config: - specifications - skus - # KNN boost配置(向量召回的boost值) - knn_boost: 2.0 # Lower boost for embedding recall + # KNN:文本向量与多模态(图片)向量各自 boost 与召回(k / num_candidates) + knn_text_boost: 20 + knn_image_boost: 20 + + knn_text_k: 150 + knn_text_num_candidates: 400 + + knn_text_k_long: 300 + knn_text_num_candidates_long: 720 + + knn_image_k: 300 + knn_image_num_candidates: 720 # Function Score配置(ES层打分规则) function_score: diff --git a/config/loader.py b/config/loader.py index 79bf3e2..674c93c 100644 --- a/config/loader.py +++ b/config/loader.py @@ -376,7 +376,20 @@ class AppConfigLoader: text_embedding_field=query_cfg.get("text_embedding_field"), image_embedding_field=query_cfg.get("image_embedding_field"), source_fields=query_cfg.get("source_fields"), - knn_boost=float(query_cfg.get("knn_boost", 0.25)), + knn_text_boost=float( + query_cfg.get("knn_text_boost", query_cfg.get("knn_boost", 0.25)) + ), + knn_image_boost=float( + query_cfg.get("knn_image_boost", query_cfg.get("knn_boost", 0.25)) + ), + knn_text_k=int(query_cfg.get("knn_text_k", 120)), + knn_text_num_candidates=int(query_cfg.get("knn_text_num_candidates", 400)), + knn_text_k_long=int(query_cfg.get("knn_text_k_long", 160)), + knn_text_num_candidates_long=int( + query_cfg.get("knn_text_num_candidates_long", 500) + ), + knn_image_k=int(query_cfg.get("knn_image_k", 120)), + knn_image_num_candidates=int(query_cfg.get("knn_image_num_candidates", 400)), multilingual_fields=list( search_fields.get( "multilingual_fields", diff --git a/config/schema.py b/config/schema.py index fd2148b..226e9ea 100644 --- a/config/schema.py +++ b/config/schema.py @@ -34,7 +34,15 @@ class QueryConfig: text_embedding_field: Optional[str] = "title_embedding" image_embedding_field: Optional[str] = None source_fields: Optional[List[str]] = None - knn_boost: float = 0.25 + # 文本向量 KNN 与多模态(图片)向量 KNN 各自 boost;未在 YAML 中写时由 loader 用 legacy knn_boost 回填 + knn_text_boost: float = 20.0 + knn_image_boost: float = 20.0 + knn_text_k: int = 120 + knn_text_num_candidates: int = 400 + knn_text_k_long: int = 160 + knn_text_num_candidates_long: int = 500 + knn_image_k: int = 120 + knn_image_num_candidates: int = 400 multilingual_fields: List[str] = field( default_factory=lambda: [] ) diff --git a/docs/常用查询 - sql.sql b/docs/常用查询 - sql.sql index 74229e7..fa09c13 100644 --- a/docs/常用查询 - sql.sql +++ b/docs/常用查询 - sql.sql @@ -584,18 +584,14 @@ SELECT 'shoplazza_product_index_increment', COUNT(*) FROM shoplazza_product_inde " # 执行删除 -cd /data/saas-search && MYSQL_PWD='qY8tgodLoA&KT#yQ' mysql -h 10.200.16.14 -P 3316 -u root saas -N -e " -SELECT 'shoplazza_sync_log', COUNT(*) FROM shoplazza_sync_log WHERE tenant_id = 163 -UNION ALL -SELECT 'shoplazza_product_index_increment', COUNT(*) FROM shoplazza_product_index_increment WHERE tenant_id = 163; +cd /data/saas-search && MYSQL_PWD='qY8tgodLoA&KT#yQ' mysql -h 10.200.16.14 -P 3316 -u root saas -e " +SET SESSION sql_safe_updates = 0; +DELETE FROM shoplazza_sync_log WHERE tenant_id = 163; +SELECT ROW_COUNT() AS deleted_sync_log; +DELETE FROM shoplazza_product_index_increment WHERE tenant_id = 163; +SELECT ROW_COUNT() AS deleted_index_increment; " -# 再次统计 tenant_id=163 的行数 -MYSQL_PWD='qY8tgodLoA&KT#yQ' mysql -h 10.200.16.14 -P 3316 -u root saas -N -e " -SELECT 'shoplazza_sync_log', COUNT(*) FROM shoplazza_sync_log WHERE tenant_id = 163 -UNION ALL -SELECT 'shoplazza_product_index_increment', COUNT(*) FROM shoplazza_product_index_increment WHERE tenant_id = 163; -" ``` 然后触发重新安装: diff --git a/search/es_query_builder.py b/search/es_query_builder.py index 25778b3..09f6bfe 100644 --- a/search/es_query_builder.py +++ b/search/es_query_builder.py @@ -29,7 +29,14 @@ class ESQueryBuilder: source_fields: Optional[List[str]] = None, function_score_config: Optional[FunctionScoreConfig] = None, default_language: str = "en", - knn_boost: float = 0.25, + knn_text_boost: float = 20.0, + knn_image_boost: float = 20.0, + knn_text_k: int = 120, + knn_text_num_candidates: int = 400, + knn_text_k_long: int = 160, + knn_text_num_candidates_long: int = 500, + knn_image_k: int = 120, + knn_image_num_candidates: int = 400, base_minimum_should_match: str = "70%", translation_minimum_should_match: str = "70%", translation_boost: float = 0.4, @@ -55,7 +62,8 @@ class ESQueryBuilder: source_fields: Fields to return in search results (_source includes) function_score_config: Function score configuration default_language: Default language to use when detection fails or returns "unknown" - knn_boost: Boost value for KNN (embedding recall) + knn_text_boost: Boost for text-embedding KNN clause + knn_image_boost: Boost for image-embedding KNN clause """ self.match_fields = match_fields self.field_boosts = field_boosts or {} @@ -67,7 +75,14 @@ class ESQueryBuilder: self.source_fields = source_fields self.function_score_config = function_score_config self.default_language = default_language - self.knn_boost = knn_boost + self.knn_text_boost = float(knn_text_boost) + self.knn_image_boost = float(knn_image_boost) + self.knn_text_k = int(knn_text_k) + self.knn_text_num_candidates = int(knn_text_num_candidates) + self.knn_text_k_long = int(knn_text_k_long) + self.knn_text_num_candidates_long = int(knn_text_num_candidates_long) + self.knn_image_k = int(knn_image_k) + self.knn_image_num_candidates = int(knn_image_num_candidates) self.base_minimum_should_match = base_minimum_should_match self.translation_minimum_should_match = translation_minimum_should_match self.translation_boost = float(translation_boost) @@ -171,8 +186,6 @@ class ESQueryBuilder: size: int = 10, from_: int = 0, enable_knn: bool = True, - knn_k: int = 50, - knn_num_candidates: int = 200, min_score: Optional[float] = None, parsed_query: Optional[Any] = None, ) -> Dict[str, Any]: @@ -195,8 +208,6 @@ class ESQueryBuilder: size: Number of results from_: Offset for pagination enable_knn: Whether to use KNN search - knn_k: K value for KNN - knn_num_candidates: Number of candidates for KNN min_score: Minimum score threshold Returns: @@ -234,41 +245,37 @@ class ESQueryBuilder: filter_clauses.append(product_title_exclusion_filter) # 3. Add KNN search clauses alongside lexical clauses under the same bool.should - # Adjust KNN k, num_candidates, boost by query_tokens (short query: less KNN; long: more) - final_knn_k, final_knn_num_candidates = knn_k, knn_num_candidates + # Text KNN: k / num_candidates from config; long queries use *_long and higher boost if has_embedding: - knn_boost = self.knn_boost + text_knn_boost = self.knn_text_boost + final_knn_k = self.knn_text_k + final_knn_num_candidates = self.knn_text_num_candidates if parsed_query: query_tokens = getattr(parsed_query, 'query_tokens', None) or [] token_count = len(query_tokens) if token_count >= 5: - final_knn_k, final_knn_num_candidates = 160, 500 - knn_boost = self.knn_boost * 1.4 # Higher weight for long queries - else: - final_knn_k, final_knn_num_candidates = 120, 400 - else: - final_knn_k, final_knn_num_candidates = 120, 400 + final_knn_k = self.knn_text_k_long + final_knn_num_candidates = self.knn_text_num_candidates_long + text_knn_boost = self.knn_text_boost * 1.4 recall_clauses.append({ "knn": { "field": self.text_embedding_field, "query_vector": query_vector.tolist(), "k": final_knn_k, "num_candidates": final_knn_num_candidates, - "boost": knn_boost, + "boost": text_knn_boost, "_name": "knn_query", } }) if has_image_embedding: - image_knn_k = max(final_knn_k, 120) - image_knn_num_candidates = max(final_knn_num_candidates, 400) recall_clauses.append({ "knn": { "field": self.image_embedding_field, "query_vector": image_query_vector.tolist(), - "k": image_knn_k, - "num_candidates": image_knn_num_candidates, - "boost": self.knn_boost, + "k": self.knn_image_k, + "num_candidates": self.knn_image_num_candidates, + "boost": self.knn_image_boost, "_name": "image_knn_query", } }) diff --git a/search/searcher.py b/search/searcher.py index 66b78ab..81a4e04 100644 --- a/search/searcher.py +++ b/search/searcher.py @@ -133,7 +133,14 @@ class Searcher: source_fields=self.source_fields, function_score_config=self.config.function_score, default_language=self.config.query_config.default_language, - knn_boost=self.config.query_config.knn_boost, + knn_text_boost=self.config.query_config.knn_text_boost, + knn_image_boost=self.config.query_config.knn_image_boost, + knn_text_k=self.config.query_config.knn_text_k, + knn_text_num_candidates=self.config.query_config.knn_text_num_candidates, + knn_text_k_long=self.config.query_config.knn_text_k_long, + knn_text_num_candidates_long=self.config.query_config.knn_text_num_candidates_long, + knn_image_k=self.config.query_config.knn_image_k, + knn_image_num_candidates=self.config.query_config.knn_image_num_candidates, base_minimum_should_match=self.config.query_config.base_minimum_should_match, translation_minimum_should_match=self.config.query_config.translation_minimum_should_match, translation_boost=self.config.query_config.translation_boost, diff --git a/tests/test_es_query_builder.py b/tests/test_es_query_builder.py index 1e5789f..a6cca7a 100644 --- a/tests/test_es_query_builder.py +++ b/tests/test_es_query_builder.py @@ -119,9 +119,12 @@ def test_text_query_skips_duplicate_translation_same_as_base(): enable_knn=False, ) - root = _recall_root(q) - assert root["bool"]["_name"] == "base_query" - assert [clause["multi_match"]["type"] for clause in root["bool"]["should"]] == ["best_fields", "phrase"] + query_root = q["query"] + if "function_score" in query_root: + query_root = query_root["function_score"]["query"] + base_bool = query_root["bool"] + assert base_bool["_name"] == "base_query" + assert [clause["multi_match"]["type"] for clause in base_bool["should"]] == ["best_fields", "phrase"] def test_product_title_exclusion_filter_is_applied_once_on_outer_query(): diff --git a/tests/test_es_query_builder_text_recall_languages.py b/tests/test_es_query_builder_text_recall_languages.py index 6db0c1e..ff98e64 100644 --- a/tests/test_es_query_builder_text_recall_languages.py +++ b/tests/test_es_query_builder_text_recall_languages.py @@ -351,7 +351,10 @@ def test_text_clauses_present_alongside_knn(): parsed_query=parsed, enable_knn=True, ) - assert "knn" in q + qr = q["query"] + if "function_score" in qr: + qr = qr["function_score"]["query"] + assert any("knn" in c for c in qr["bool"]["should"]) idx = _clauses_index(q) assert set(idx) == {"base_query", "base_query_trans_zh"} -- libgit2 0.21.2