Commit 24e921414ea12e1e6fa123c0b33eb5333bcdecd2 (1 parent: 26b910bd)
delete enable_multilang_search
Showing 5 changed files with 29 additions and 37 deletions.
Show diff stats
config/config.yaml
| @@ -88,7 +88,6 @@ query_config: | @@ -88,7 +88,6 @@ query_config: | ||
| 88 | # 功能开关(翻译开关由tenant_config控制) | 88 | # 功能开关(翻译开关由tenant_config控制) |
| 89 | enable_text_embedding: true | 89 | enable_text_embedding: true |
| 90 | enable_query_rewrite: true | 90 | enable_query_rewrite: true |
| 91 | - enable_multilang_search: true # 启用多语言搜索(使用翻译进行跨语言检索) | ||
| 92 | 91 | ||
| 93 | # Embedding字段名称 | 92 | # Embedding字段名称 |
| 94 | text_embedding_field: "title_embedding" | 93 | text_embedding_field: "title_embedding" |
config/config_loader.py
| @@ -33,8 +33,7 @@ class QueryConfig: | @@ -33,8 +33,7 @@ class QueryConfig: | ||
| 33 | # Feature flags | 33 | # Feature flags |
| 34 | enable_text_embedding: bool = True | 34 | enable_text_embedding: bool = True |
| 35 | enable_query_rewrite: bool = True | 35 | enable_query_rewrite: bool = True |
| 36 | - enable_multilang_search: bool = True # Enable multi-language search using translations | ||
| 37 | - | 36 | + |
| 38 | # Query rewrite dictionary (loaded from external file) | 37 | # Query rewrite dictionary (loaded from external file) |
| 39 | rewrite_dictionary: Dict[str, str] = field(default_factory=dict) | 38 | rewrite_dictionary: Dict[str, str] = field(default_factory=dict) |
| 40 | 39 |
search/es_query_builder.py
| @@ -24,20 +24,21 @@ class ESQueryBuilder: | @@ -24,20 +24,21 @@ class ESQueryBuilder: | ||
| 24 | image_embedding_field: Optional[str] = None, | 24 | image_embedding_field: Optional[str] = None, |
| 25 | source_fields: Optional[List[str]] = None, | 25 | source_fields: Optional[List[str]] = None, |
| 26 | function_score_config: Optional[FunctionScoreConfig] = None, | 26 | function_score_config: Optional[FunctionScoreConfig] = None, |
| 27 | - enable_multilang_search: bool = True, | ||
| 28 | default_language: str = "en", | 27 | default_language: str = "en", |
| 29 | knn_boost: float = 0.25 | 28 | knn_boost: float = 0.25 |
| 30 | ): | 29 | ): |
| 31 | """ | 30 | """ |
| 32 | Initialize query builder. | 31 | Initialize query builder. |
| 33 | 32 | ||
| 33 | + Multi-language search (translation-based cross-language recall) is always enabled: | ||
| 34 | + queries are matched against both detected-language and translated zh/en clauses. | ||
| 35 | + | ||
| 34 | Args: | 36 | Args: |
| 35 | match_fields: Fields to search for text matching | 37 | match_fields: Fields to search for text matching |
| 36 | text_embedding_field: Field name for text embeddings | 38 | text_embedding_field: Field name for text embeddings |
| 37 | image_embedding_field: Field name for image embeddings | 39 | image_embedding_field: Field name for image embeddings |
| 38 | source_fields: Fields to return in search results (_source includes) | 40 | source_fields: Fields to return in search results (_source includes) |
| 39 | function_score_config: Function score configuration | 41 | function_score_config: Function score configuration |
| 40 | - enable_multilang_search: Enable multi-language search using translations | ||
| 41 | default_language: Default language to use when detection fails or returns "unknown" | 42 | default_language: Default language to use when detection fails or returns "unknown" |
| 42 | knn_boost: Boost value for KNN (embedding recall) | 43 | knn_boost: Boost value for KNN (embedding recall) |
| 43 | """ | 44 | """ |
| @@ -46,7 +47,6 @@ class ESQueryBuilder: | @@ -46,7 +47,6 @@ class ESQueryBuilder: | ||
| 46 | self.image_embedding_field = image_embedding_field | 47 | self.image_embedding_field = image_embedding_field |
| 47 | self.source_fields = source_fields | 48 | self.source_fields = source_fields |
| 48 | self.function_score_config = function_score_config | 49 | self.function_score_config = function_score_config |
| 49 | - self.enable_multilang_search = enable_multilang_search | ||
| 50 | self.default_language = default_language | 50 | self.default_language = default_language |
| 51 | self.knn_boost = knn_boost | 51 | self.knn_boost = knn_boost |
| 52 | 52 | ||
| @@ -488,35 +488,31 @@ class ESQueryBuilder: | @@ -488,35 +488,31 @@ class ESQueryBuilder: | ||
| 488 | } | 488 | } |
| 489 | }) | 489 | }) |
| 490 | 490 | ||
| 491 | - # 2. Translation queries - lower boost (0.4) for other languages | ||
| 492 | - if self.enable_multilang_search: | ||
| 493 | - if language != 'zh' and translations.get('zh'): | ||
| 494 | - zh_fields, _ = self._get_match_fields('zh') | ||
| 495 | - should_clauses.append({ | ||
| 496 | - "multi_match": { | ||
| 497 | - "query": translations['zh'], | ||
| 498 | - "fields": zh_fields, | ||
| 499 | - # "operator": "AND", | ||
| 500 | - "minimum_should_match": "75%", | ||
| 501 | - "tie_breaker": tie_breaker_base_query, | ||
| 502 | - "boost": 0.4, | ||
| 503 | - "_name": "base_query_trans_zh" | ||
| 504 | - } | ||
| 505 | - }) | ||
| 506 | - | ||
| 507 | - if language != 'en' and translations.get('en'): | ||
| 508 | - en_fields, _ = self._get_match_fields('en') | ||
| 509 | - should_clauses.append({ | ||
| 510 | - "multi_match": { | ||
| 511 | - "query": translations['en'], | ||
| 512 | - "fields": en_fields, | ||
| 513 | - # "operator": "AND", | ||
| 514 | - "minimum_should_match": "75%", | ||
| 515 | - "tie_breaker": tie_breaker_base_query, | ||
| 516 | - "boost": 0.4, | ||
| 517 | - "_name": "base_query_trans_en" | ||
| 518 | - } | ||
| 519 | - }) | 491 | + # 2. Translation queries - lower boost (0.4) for other languages (multi-language search always on) |
| 492 | + if language != 'zh' and translations.get('zh'): | ||
| 493 | + zh_fields, _ = self._get_match_fields('zh') | ||
| 494 | + should_clauses.append({ | ||
| 495 | + "multi_match": { | ||
| 496 | + "query": translations['zh'], | ||
| 497 | + "fields": zh_fields, | ||
| 498 | + "minimum_should_match": "75%", | ||
| 499 | + "tie_breaker": tie_breaker_base_query, | ||
| 500 | + "boost": 0.4, | ||
| 501 | + "_name": "base_query_trans_zh" | ||
| 502 | + } | ||
| 503 | + }) | ||
| 504 | + if language != 'en' and translations.get('en'): | ||
| 505 | + en_fields, _ = self._get_match_fields('en') | ||
| 506 | + should_clauses.append({ | ||
| 507 | + "multi_match": { | ||
| 508 | + "query": translations['en'], | ||
| 509 | + "fields": en_fields, | ||
| 510 | + "minimum_should_match": "75%", | ||
| 511 | + "tie_breaker": tie_breaker_base_query, | ||
| 512 | + "boost": 0.4, | ||
| 513 | + "_name": "base_query_trans_en" | ||
| 514 | + } | ||
| 515 | + }) | ||
| 520 | 516 | ||
| 521 | if False and is_long_query: | 517 | if False and is_long_query: |
| 522 | boost = 0.5 * pow(min(1.0, token_count / 10.0), 0.9) | 518 | boost = 0.5 * pow(min(1.0, token_count / 10.0), 0.9) |
search/searcher.py
| @@ -119,7 +119,6 @@ class Searcher: | @@ -119,7 +119,6 @@ class Searcher: | ||
| 119 | image_embedding_field=self.image_embedding_field, | 119 | image_embedding_field=self.image_embedding_field, |
| 120 | source_fields=self.source_fields, | 120 | source_fields=self.source_fields, |
| 121 | function_score_config=self.config.function_score, | 121 | function_score_config=self.config.function_score, |
| 122 | - enable_multilang_search=self.config.query_config.enable_multilang_search, | ||
| 123 | default_language=self.config.query_config.default_language, | 122 | default_language=self.config.query_config.default_language, |
| 124 | knn_boost=self.config.query_config.knn_boost | 123 | knn_boost=self.config.query_config.knn_boost |
| 125 | ) | 124 | ) |
tests/test_embedding_pipeline.py
| @@ -77,7 +77,6 @@ def _build_test_config() -> SearchConfig: | @@ -77,7 +77,6 @@ def _build_test_config() -> SearchConfig: | ||
| 77 | default_language="en", | 77 | default_language="en", |
| 78 | enable_text_embedding=True, | 78 | enable_text_embedding=True, |
| 79 | enable_query_rewrite=False, | 79 | enable_query_rewrite=False, |
| 80 | - enable_multilang_search=True, | ||
| 81 | rewrite_dictionary={}, | 80 | rewrite_dictionary={}, |
| 82 | translation_prompts={"query_zh": "e-commerce domain", "query_en": "e-commerce domain"}, | 81 | translation_prompts={"query_zh": "e-commerce domain", "query_en": "e-commerce domain"}, |
| 83 | text_embedding_field="title_embedding", | 82 | text_embedding_field="title_embedding", |