Commit bd96ceadef76dc327afcd2d07a023f4902d2f9f5

Authored by tangwang
1 parent 24e92141

1. 动态多语言字段与统一策略配置

- 配置改为“字段基名 + 动态语言后缀”方案,已不再依赖旧 `indexes`。
[config.yaml](/data/saas-search/config/config.yaml#L17)
- `search_fields` / `text_query_strategy` 已进入强校验与解析流程。
[config_loader.py](/data/saas-search/config/config_loader.py#L254)

2. 查询语言计划与翻译等待策略
- `QueryParser` 现在产出
  `query_text_by_lang`、`search_langs`、`source_in_index_languages`。
[query_parser.py](/data/saas-search/query/query_parser.py#L41)
- 你要求的两种翻译路径都在:
  - 源语言不在店铺 `index_languages`:`translate_multi_async` + 等待
    future
  - 源语言在 `index_languages`:`translate_multi(...,
    async_mode=True)`,尽量走缓存
[query_parser.py](/data/saas-search/query/query_parser.py#L284)

3. ES 查询统一文本策略(无 AST 分支)
- 主召回按 `search_langs` 动态拼 `field.{lang}`,翻译语种做次权重
  `should`。
[es_query_builder.py](/data/saas-search/search/es_query_builder.py#L454)
- 布尔 AST 路径已删除,仅保留统一文本策略。
[es_query_builder.py](/data/saas-search/search/es_query_builder.py#L185)

4. LanguageDetector 优化
- 从“拉丁字母默认英文”升级为:脚本优先 +
  拉丁语系打分(词典/变音/后缀)。
[language_detector.py](/data/saas-search/query/language_detector.py#L68)

5. 布尔能力清理(补充)
- 已删除废弃模块:
[boolean_parser.py](/data/saas-search/search/boolean_parser.py)
- `search/__init__` 已无相关导出。
[search/__init__.py](/data/saas-search/search/__init__.py)

6. `indexes` 过时收口(补充)
- 兼容函数改为基于动态字段生成,不再依赖 `config.indexes`。
[utils.py](/data/saas-search/config/utils.py#L24)
- Admin 配置接口改为返回动态字段配置,不再暴露 `num_indexes`。
[admin.py](/data/saas-search/api/routes/admin.py#L52)

7. suggest(此条目未写完;注意:紧随其后的 diff 为 `.env` ES 配置改动,与 suggest 无关 —— 建议补全说明)
1 # Elasticsearch Configuration 1 # Elasticsearch Configuration
2 -ES_HOST=http://localhost:9200  
3 -ES_USERNAME=saas 2 +ES_HOST=http://120.76.41.98:9200
  3 +ES_USERNAME=essa
4 ES_PASSWORD=4hOaLaf41y2VuI8y 4 ES_PASSWORD=4hOaLaf41y2VuI8y
5 5
6 # Redis Configuration (Optional) - AI 生产 10.200.16.14:6479 6 # Redis Configuration (Optional) - AI 生产 10.200.16.14:6479
@@ -70,12 +70,12 @@ class SearchRequest(BaseModel): @@ -70,12 +70,12 @@ class SearchRequest(BaseModel):
70 """搜索请求模型(重构版)""" 70 """搜索请求模型(重构版)"""
71 71
72 # 基础搜索参数 72 # 基础搜索参数
73 - query: str = Field(..., description="搜索查询字符串,支持布尔表达式(AND, OR, RANK, ANDNOT)") 73 + query: str = Field(..., description="搜索查询字符串(统一文本检索策略)")
74 size: int = Field(10, ge=1, le=1000, description="返回结果数量") 74 size: int = Field(10, ge=1, le=1000, description="返回结果数量")
75 from_: int = Field(0, ge=0, alias="from", description="分页偏移量") 75 from_: int = Field(0, ge=0, alias="from", description="分页偏移量")
76 - language: Literal["zh", "en"] = Field(  
77 - "zh",  
78 - description="响应语言:'zh'(中文)或 'en'(英文),用于选择 title/description/vendor 等多语言字段" 76 + language: str = Field(
  77 + "en",
  78 + description="响应语言代码(如 zh/en/de/fr/ar/ru),用于多语言字段返回优先级"
79 ) 79 )
80 80
81 # 过滤器 - 精确匹配和多值匹配 81 # 过滤器 - 精确匹配和多值匹配
api/result_formatter.py
@@ -27,20 +27,32 @@ class ResultFormatter: @@ -27,20 +27,32 @@ class ResultFormatter:
27 List of SpuResult objects 27 List of SpuResult objects
28 """ 28 """
29 results = [] 29 results = []
30 - lang = (language or "en").lower()  
31 - if lang not in ("zh", "en"):  
32 - lang = "en" 30 + lang = (language or "en").lower().replace("-", "_")
  31 + lang_base = lang.split("_")[0] if lang else "en"
33 32
34 def pick_lang_field(src: Dict[str, Any], base: str) -> Optional[str]: 33 def pick_lang_field(src: Dict[str, Any], base: str) -> Optional[str]:
35 """从多语言对象字段中按语言选择一个值:{base: {"zh": "...", "en": "...", ...}}""" 34 """从多语言对象字段中按语言选择一个值:{base: {"zh": "...", "en": "...", ...}}"""
36 obj = src.get(base) 35 obj = src.get(base)
37 if not isinstance(obj, dict): 36 if not isinstance(obj, dict):
38 return None 37 return None
39 - zh_val = obj.get("zh")  
40 - en_val = obj.get("en")  
41 - if lang == "zh":  
42 - return zh_val or en_val  
43 - return en_val or zh_val 38 + candidates = [
  39 + lang,
  40 + lang_base,
  41 + "en",
  42 + "zh",
  43 + ]
  44 + seen = set()
  45 + for cand in candidates:
  46 + if not cand or cand in seen:
  47 + continue
  48 + seen.add(cand)
  49 + value = obj.get(cand)
  50 + if value:
  51 + return value
  52 + for value in obj.values():
  53 + if value:
  54 + return value
  55 + return None
44 56
45 for hit in es_hits: 57 for hit in es_hits:
46 source = hit.get('_source', {}) 58 source = hit.get('_source', {})
@@ -434,4 +446,3 @@ class ResultFormatter: @@ -434,4 +446,3 @@ class ResultFormatter:
434 """ 446 """
435 # TODO: Implement related search generation logic 447 # TODO: Implement related search generation logic
436 return [] 448 return []
437 -  
api/routes/admin.py
@@ -52,7 +52,9 @@ async def get_configuration(): @@ -52,7 +52,9 @@ async def get_configuration():
52 return { 52 return {
53 "es_index_name": config.es_index_name, 53 "es_index_name": config.es_index_name,
54 "num_field_boosts": len(config.field_boosts), 54 "num_field_boosts": len(config.field_boosts),
55 - "num_indexes": len(config.indexes), 55 + "multilingual_fields": config.query_config.multilingual_fields,
  56 + "shared_fields": config.query_config.shared_fields,
  57 + "core_multilingual_fields": config.query_config.core_multilingual_fields,
56 "supported_languages": config.query_config.supported_languages, 58 "supported_languages": config.query_config.supported_languages,
57 "ranking_expression": config.ranking.expression, 59 "ranking_expression": config.ranking.expression,
58 "spu_enabled": config.spu_config.enabled 60 "spu_enabled": config.spu_config.enabled
api/routes/search.py
@@ -37,7 +37,7 @@ async def search(request: SearchRequest, http_request: Request): @@ -37,7 +37,7 @@ async def search(request: SearchRequest, http_request: Request):
37 37
38 Supports: 38 Supports:
39 - Multi-language query processing 39 - Multi-language query processing
40 - - Boolean operators (AND, OR, RANK, ANDNOT) 40 + - Unified text retrieval strategy (no boolean AST parsing)
41 - Semantic search with embeddings 41 - Semantic search with embeddings
42 - Custom ranking functions 42 - Custom ranking functions
43 - Exact match filters and range filters 43 - Exact match filters and range filters
config/config.yaml
@@ -12,71 +12,20 @@ es_settings: @@ -12,71 +12,20 @@ es_settings:
12 refresh_interval: "30s" 12 refresh_interval: "30s"
13 13
14 # 字段权重配置(用于搜索时的字段boost) 14 # 字段权重配置(用于搜索时的字段boost)
15 -# 只配置权重,不配置字段结构(字段结构由 mappings/search_products.json 定义) 15 +# 统一按“字段基名”配置;查询时按 search_langs 动态拼接 .{lang}。
  16 +# 若需要按某个语言单独调权,也可以加显式 key(例如 title.de: 3.2)。
16 field_boosts: 17 field_boosts:
17 - # 文本相关性字段  
18 - "title.zh": 3.0  
19 - "brief.zh": 1.5  
20 - "description.zh": 1.0  
21 - "vendor.zh": 1.5  
22 - "title.en": 3.0  
23 - "brief.en": 1.5  
24 - "description.en": 1.0  
25 - "vendor.en": 1.5  
26 -  
27 - # 分类相关字段  
28 - "category_path.zh": 1.5  
29 - "category_name_text.zh": 1.5  
30 - "category_path.en": 1.5  
31 - "category_name_text.en": 1.5  
32 -  
33 - # 标签和属性值字段 18 + title: 3.0
  19 + brief: 1.5
  20 + description: 1.0
  21 + vendor: 1.5
  22 + category_path: 1.5
  23 + category_name_text: 1.5
34 tags: 1.0 24 tags: 1.0
35 option1_values: 0.5 25 option1_values: 0.5
36 option2_values: 0.5 26 option2_values: 0.5
37 option3_values: 0.5 27 option3_values: 0.5
38 28
39 -# 搜索域配置(Query Domains)  
40 -# 定义不同的搜索策略,指定哪些字段组合在一起搜索  
41 -indexes:  
42 - - name: "default"  
43 - label: "默认搜索"  
44 - fields:  
45 - - "title.zh"  
46 - - "brief.zh"  
47 - - "description.zh"  
48 - - "vendor.zh"  
49 - - "tags"  
50 - - "category_path.zh"  
51 - - "category_name_text.zh"  
52 - - "option1_values"  
53 - boost: 1.0  
54 -  
55 - - name: "title"  
56 - label: "标题搜索"  
57 - fields:  
58 - - "title.zh"  
59 - boost: 2.0  
60 -  
61 - - name: "vendor"  
62 - label: "品牌搜索"  
63 - fields:  
64 - - "vendor.zh"  
65 - boost: 1.5  
66 -  
67 - - name: "category"  
68 - label: "类目搜索"  
69 - fields:  
70 - - "category_path.zh"  
71 - - "category_name_text.zh"  
72 - boost: 1.5  
73 -  
74 - - name: "tags"  
75 - label: "标签搜索"  
76 - fields:  
77 - - "tags"  
78 - boost: 1.0  
79 -  
80 # Query Configuration(查询配置) 29 # Query Configuration(查询配置)
81 query_config: 30 query_config:
82 # 支持的语言 31 # 支持的语言
@@ -89,6 +38,41 @@ query_config: @@ -89,6 +38,41 @@ query_config:
89 enable_text_embedding: true 38 enable_text_embedding: true
90 enable_query_rewrite: true 39 enable_query_rewrite: true
91 40
  41 + # 动态多语言检索字段配置
  42 + # multilingual_fields 会被拼成 title.{lang}/brief.{lang}/... 形式;
  43 + # shared_fields 为无语言后缀字段。
  44 + search_fields:
  45 + multilingual_fields:
  46 + - "title"
  47 + - "brief"
  48 + - "description"
  49 + - "vendor"
  50 + - "category_path"
  51 + - "category_name_text"
  52 + shared_fields:
  53 + - "tags"
  54 + - "option1_values"
  55 + - "option2_values"
  56 + - "option3_values"
  57 + core_multilingual_fields:
  58 + - "title"
  59 + - "brief"
  60 + - "vendor"
  61 + - "category_name_text"
  62 +
  63 + # 统一文本召回策略(主查询 + 翻译查询 + phrase/keywords)
  64 + text_query_strategy:
  65 + base_minimum_should_match: "75%"
  66 + translation_minimum_should_match: "75%"
  67 + translation_boost: 0.4
  68 + translation_boost_when_source_missing: 1.0
  69 + source_boost_when_missing: 0.6
  70 + original_query_fallback_boost_when_translation_missing: 0.2
  71 + keywords_boost: 0.1
  72 + enable_phrase_query: true
  73 + tie_breaker_base_query: 0.9
  74 + tie_breaker_keywords: 0.9
  75 +
92 # Embedding字段名称 76 # Embedding字段名称
93 text_embedding_field: "title_embedding" 77 text_embedding_field: "title_embedding"
94 image_embedding_field: null 78 image_embedding_field: null
config/config_loader.py
@@ -57,6 +57,28 @@ class QueryConfig: @@ -57,6 +57,28 @@ class QueryConfig:
57 57
58 # KNN boost configuration 58 # KNN boost configuration
59 knn_boost: float = 0.25 # Boost value for KNN (embedding recall) 59 knn_boost: float = 0.25 # Boost value for KNN (embedding recall)
  60 +
  61 + # Dynamic text fields for multi-language retrieval
  62 + multilingual_fields: List[str] = field(
  63 + default_factory=lambda: ["title", "brief", "description", "vendor", "category_path", "category_name_text"]
  64 + )
  65 + shared_fields: List[str] = field(
  66 + default_factory=lambda: ["tags", "option1_values", "option2_values", "option3_values"]
  67 + )
  68 + core_multilingual_fields: List[str] = field(
  69 + default_factory=lambda: ["title", "brief", "vendor", "category_name_text"]
  70 + )
  71 +
  72 + # Unified text strategy tuning
  73 + base_minimum_should_match: str = "75%"
  74 + translation_minimum_should_match: str = "75%"
  75 + translation_boost: float = 0.4
  76 + translation_boost_when_source_missing: float = 1.0
  77 + source_boost_when_missing: float = 0.6
  78 + keywords_boost: float = 0.1
  79 + enable_phrase_query: bool = True
  80 + tie_breaker_base_query: float = 0.9
  81 + tie_breaker_keywords: float = 0.9
60 82
61 83
62 @dataclass 84 @dataclass
@@ -102,7 +124,7 @@ class SearchConfig: @@ -102,7 +124,7 @@ class SearchConfig:
102 # 字段权重配置(用于搜索) 124 # 字段权重配置(用于搜索)
103 field_boosts: Dict[str, float] 125 field_boosts: Dict[str, float]
104 126
105 - # Index structure (query domains) 127 + # Legacy index domains (deprecated; kept for compatibility)
106 indexes: List[IndexConfig] 128 indexes: List[IndexConfig]
107 129
108 # Query processing 130 # Query processing
@@ -218,7 +240,7 @@ class ConfigLoader: @@ -218,7 +240,7 @@ class ConfigLoader:
218 if not isinstance(field_boosts, dict): 240 if not isinstance(field_boosts, dict):
219 raise ConfigurationError("field_boosts must be a dictionary") 241 raise ConfigurationError("field_boosts must be a dictionary")
220 242
221 - # Parse indexes 243 + # Parse indexes (deprecated; compatibility only)
222 indexes = [] 244 indexes = []
223 for index_data in config_data.get("indexes", []): 245 for index_data in config_data.get("indexes", []):
224 indexes.append(self._parse_index_config(index_data)) 246 indexes.append(self._parse_index_config(index_data))
@@ -228,6 +250,8 @@ class ConfigLoader: @@ -228,6 +250,8 @@ class ConfigLoader:
228 services_data = config_data.get("services", {}) if isinstance(config_data.get("services", {}), dict) else {} 250 services_data = config_data.get("services", {}) if isinstance(config_data.get("services", {}), dict) else {}
229 rewrite_dictionary = self._load_rewrite_dictionary() 251 rewrite_dictionary = self._load_rewrite_dictionary()
230 embedding_thresholds = query_config_data.get("embedding_disable_thresholds", {}) 252 embedding_thresholds = query_config_data.get("embedding_disable_thresholds", {})
  253 + search_fields_cfg = query_config_data.get("search_fields", {})
  254 + text_strategy_cfg = query_config_data.get("text_query_strategy", {})
231 255
232 query_config = QueryConfig( 256 query_config = QueryConfig(
233 supported_languages=query_config_data.get("supported_languages") or ["zh", "en"], 257 supported_languages=query_config_data.get("supported_languages") or ["zh", "en"],
@@ -245,7 +269,30 @@ class ConfigLoader: @@ -245,7 +269,30 @@ class ConfigLoader:
245 embedding_disable_chinese_char_limit=embedding_thresholds.get("chinese_char_limit", 4), 269 embedding_disable_chinese_char_limit=embedding_thresholds.get("chinese_char_limit", 4),
246 embedding_disable_english_word_limit=embedding_thresholds.get("english_word_limit", 3), 270 embedding_disable_english_word_limit=embedding_thresholds.get("english_word_limit", 3),
247 source_fields=query_config_data.get("source_fields"), 271 source_fields=query_config_data.get("source_fields"),
248 - knn_boost=query_config_data.get("knn_boost", 0.25) 272 + knn_boost=query_config_data.get("knn_boost", 0.25),
  273 + multilingual_fields=search_fields_cfg.get(
  274 + "multilingual_fields",
  275 + ["title", "brief", "description", "vendor", "category_path", "category_name_text"],
  276 + ),
  277 + shared_fields=search_fields_cfg.get(
  278 + "shared_fields",
  279 + ["tags", "option1_values", "option2_values", "option3_values"],
  280 + ),
  281 + core_multilingual_fields=search_fields_cfg.get(
  282 + "core_multilingual_fields",
  283 + ["title", "brief", "vendor", "category_name_text"],
  284 + ),
  285 + base_minimum_should_match=str(text_strategy_cfg.get("base_minimum_should_match", "75%")),
  286 + translation_minimum_should_match=str(text_strategy_cfg.get("translation_minimum_should_match", "75%")),
  287 + translation_boost=float(text_strategy_cfg.get("translation_boost", 0.4)),
  288 + translation_boost_when_source_missing=float(
  289 + text_strategy_cfg.get("translation_boost_when_source_missing", 1.0)
  290 + ),
  291 + source_boost_when_missing=float(text_strategy_cfg.get("source_boost_when_missing", 0.6)),
  292 + keywords_boost=float(text_strategy_cfg.get("keywords_boost", 0.1)),
  293 + enable_phrase_query=bool(text_strategy_cfg.get("enable_phrase_query", True)),
  294 + tie_breaker_base_query=float(text_strategy_cfg.get("tie_breaker_base_query", 0.9)),
  295 + tie_breaker_keywords=float(text_strategy_cfg.get("tie_breaker_keywords", 0.9)),
249 ) 296 )
250 297
251 # Parse ranking config 298 # Parse ranking config
@@ -336,10 +383,7 @@ class ConfigLoader: @@ -336,10 +383,7 @@ class ConfigLoader:
336 elif boost < 0: 383 elif boost < 0:
337 errors.append(f"field_boosts['{field_name}']: boost must be non-negative") 384 errors.append(f"field_boosts['{field_name}']: boost must be non-negative")
338 385
339 - # Validate indexes  
340 - if not config.indexes:  
341 - errors.append("At least one index domain must be defined")  
342 - 386 + # Validate indexes (deprecated, optional)
343 index_names = set() 387 index_names = set()
344 for index in config.indexes: 388 for index in config.indexes:
345 # Check for duplicate index names 389 # Check for duplicate index names
@@ -365,6 +409,39 @@ class ConfigLoader: @@ -365,6 +409,39 @@ class ConfigLoader:
365 f"Default language '{config.query_config.default_language}' " 409 f"Default language '{config.query_config.default_language}' "
366 f"not in supported languages: {config.query_config.supported_languages}" 410 f"not in supported languages: {config.query_config.supported_languages}"
367 ) 411 )
  412 +
  413 + # Validate dynamic search fields
  414 + def _validate_str_list(name: str, values: List[str]) -> None:
  415 + if not isinstance(values, list) or not values:
  416 + errors.append(f"query_config.{name} must be a non-empty list[str]")
  417 + return
  418 + for i, val in enumerate(values):
  419 + if not isinstance(val, str) or not val.strip():
  420 + errors.append(f"query_config.{name}[{i}] must be a non-empty string")
  421 +
  422 + _validate_str_list("multilingual_fields", config.query_config.multilingual_fields)
  423 + _validate_str_list("shared_fields", config.query_config.shared_fields)
  424 + _validate_str_list("core_multilingual_fields", config.query_config.core_multilingual_fields)
  425 +
  426 + core_set = set(config.query_config.core_multilingual_fields)
  427 + multi_set = set(config.query_config.multilingual_fields)
  428 + if not core_set.issubset(multi_set):
  429 + errors.append("query_config.core_multilingual_fields must be subset of multilingual_fields")
  430 +
  431 + # Validate text query strategy numbers
  432 + for name in (
  433 + "translation_boost",
  434 + "translation_boost_when_source_missing",
  435 + "source_boost_when_missing",
  436 + "keywords_boost",
  437 + "tie_breaker_base_query",
  438 + "tie_breaker_keywords",
  439 + ):
  440 + value = getattr(config.query_config, name, None)
  441 + if not isinstance(value, (int, float)):
  442 + errors.append(f"query_config.{name} must be a number")
  443 + elif value < 0:
  444 + errors.append(f"query_config.{name} must be non-negative")
368 445
369 # Validate source_fields tri-state semantics 446 # Validate source_fields tri-state semantics
370 source_fields = config.query_config.source_fields 447 source_fields = config.query_config.source_fields
@@ -409,7 +486,23 @@ class ConfigLoader: @@ -409,7 +486,23 @@ class ConfigLoader:
409 "chinese_char_limit": config.query_config.embedding_disable_chinese_char_limit, 486 "chinese_char_limit": config.query_config.embedding_disable_chinese_char_limit,
410 "english_word_limit": config.query_config.embedding_disable_english_word_limit 487 "english_word_limit": config.query_config.embedding_disable_english_word_limit
411 }, 488 },
412 - "source_fields": config.query_config.source_fields 489 + "source_fields": config.query_config.source_fields,
  490 + "search_fields": {
  491 + "multilingual_fields": config.query_config.multilingual_fields,
  492 + "shared_fields": config.query_config.shared_fields,
  493 + "core_multilingual_fields": config.query_config.core_multilingual_fields,
  494 + },
  495 + "text_query_strategy": {
  496 + "base_minimum_should_match": config.query_config.base_minimum_should_match,
  497 + "translation_minimum_should_match": config.query_config.translation_minimum_should_match,
  498 + "translation_boost": config.query_config.translation_boost,
  499 + "translation_boost_when_source_missing": config.query_config.translation_boost_when_source_missing,
  500 + "source_boost_when_missing": config.query_config.source_boost_when_missing,
  501 + "keywords_boost": config.query_config.keywords_boost,
  502 + "enable_phrase_query": config.query_config.enable_phrase_query,
  503 + "tie_breaker_base_query": config.query_config.tie_breaker_base_query,
  504 + "tie_breaker_keywords": config.query_config.tie_breaker_keywords,
  505 + }
413 } 506 }
414 507
415 return { 508 return {
1 -"""  
2 -Configuration utility functions.  
3 -  
4 -Helper functions for working with SearchConfig objects.  
5 -""" 1 +"""Configuration helper functions for dynamic multi-language search fields."""
6 2
7 from typing import Dict, List 3 from typing import Dict, List
8 from .config_loader import SearchConfig 4 from .config_loader import SearchConfig
9 5
10 6
  7 +def _format_field_with_boost(field_name: str, boost: float) -> str:
  8 + if abs(float(boost) - 1.0) < 1e-9:
  9 + return field_name
  10 + return f"{field_name}^{boost}"
  11 +
  12 +
  13 +def _get_boost(config: SearchConfig, base_field: str, language: str = "") -> float:
  14 + lang = (language or "").strip().lower()
  15 + if lang:
  16 + lang_key = f"{base_field}.{lang}"
  17 + if lang_key in config.field_boosts:
  18 + return float(config.field_boosts[lang_key])
  19 + if base_field in config.field_boosts:
  20 + return float(config.field_boosts[base_field])
  21 + return 1.0
  22 +
  23 +
11 def get_match_fields_for_index(config: SearchConfig, index_name: str = "default") -> List[str]: 24 def get_match_fields_for_index(config: SearchConfig, index_name: str = "default") -> List[str]:
12 """ 25 """
13 - Generate match fields list with boost from field_boosts.  
14 -  
15 - Args:  
16 - config: SearchConfig instance  
17 - index_name: Name of the index domain (default: "default")  
18 -  
19 - Returns:  
20 - List of field names with boost, e.g., ["title.zh^3.0", "brief.zh^1.5"] 26 + Deprecated compatibility wrapper.
  27 +
  28 + `indexes` is no longer used by runtime query building. This function now returns
  29 + dynamic match fields for the default language based on query_config.search_fields.
21 """ 30 """
22 - # Find the index config  
23 - index_config = None  
24 - for idx in config.indexes:  
25 - if idx.name == index_name:  
26 - index_config = idx  
27 - break  
28 -  
29 - if not index_config:  
30 - return []  
31 -  
32 - # Generate match fields with boost  
33 - match_fields = []  
34 - for field_name in index_config.fields:  
35 - # Get field boost from field_boosts dictionary  
36 - field_boost = config.field_boosts.get(field_name, 1.0)  
37 -  
38 - # Combine index boost and field boost  
39 - total_boost = index_config.boost * field_boost  
40 -  
41 - if total_boost != 1.0:  
42 - match_fields.append(f"{field_name}^{total_boost}")  
43 - else:  
44 - match_fields.append(field_name)  
45 - 31 + del index_name
  32 + lang = (config.query_config.default_language or "en").strip().lower()
  33 + match_fields: List[str] = []
  34 +
  35 + for base_field in config.query_config.multilingual_fields:
  36 + field_name = f"{base_field}.{lang}"
  37 + match_fields.append(_format_field_with_boost(field_name, _get_boost(config, base_field, lang)))
  38 +
  39 + for shared_field in config.query_config.shared_fields:
  40 + match_fields.append(_format_field_with_boost(shared_field, _get_boost(config, shared_field)))
  41 +
46 return match_fields 42 return match_fields
47 43
48 44
49 def get_domain_fields(config: SearchConfig) -> Dict[str, List[str]]: 45 def get_domain_fields(config: SearchConfig) -> Dict[str, List[str]]:
50 """ 46 """
51 - Generate domain-specific match fields from all index configs.  
52 -  
53 - Args:  
54 - config: SearchConfig instance  
55 - 47 + Get dynamic domain fields for compatibility with old diagnostics endpoints.
  48 +
56 Returns: 49 Returns:
57 - Dictionary mapping domain name to list of match fields 50 + A single `default` domain entry generated from dynamic search_fields.
58 """ 51 """
59 - domain_fields = {}  
60 - for index_config in config.indexes:  
61 - domain_fields[index_config.name] = get_match_fields_for_index(config, index_config.name)  
62 - return domain_fields 52 + return {"default": get_match_fields_for_index(config)}
docs/DEVELOPER_GUIDE.md
@@ -105,7 +105,7 @@ MySQL (店匠 SPU/SKU) @@ -105,7 +105,7 @@ MySQL (店匠 SPU/SKU)
105 api/ # FastAPI 应用:搜索路由、管理路由、索引路由(indexer_app) 105 api/ # FastAPI 应用:搜索路由、管理路由、索引路由(indexer_app)
106 config/ # 配置加载与解析:config.yaml、services、env 106 config/ # 配置加载与解析:config.yaml、services、env
107 indexer/ # MySQL → ES 管道:mapping、transformer、bulk、增量、build-docs 107 indexer/ # MySQL → ES 管道:mapping、transformer、bulk、增量、build-docs
108 -query/ # 查询解析:规范化、改写、翻译、embedding 调用、布尔解析 108 +query/ # 查询解析:规范化、改写、翻译、embedding 调用、语言计划生成
109 search/ # 搜索执行:多语言查询构建、Searcher、重排客户端、分数融合 109 search/ # 搜索执行:多语言查询构建、Searcher、重排客户端、分数融合
110 embeddings/ # 向量化:服务端(server)、文本/图像后端、协议与配置 110 embeddings/ # 向量化:服务端(server)、文本/图像后端、协议与配置
111 reranker/ # 重排:服务端(server)、后端(backends)、配置 111 reranker/ # 重排:服务端(server)、后端(backends)、配置
@@ -144,7 +144,7 @@ docs/ # 文档(含本指南) @@ -144,7 +144,7 @@ docs/ # 文档(含本指南)
144 144
145 ### 4.4 query 145 ### 4.4 query
146 146
147 -- **职责**:查询解析与预处理:规范化、语言检测、改写(词典)、翻译、文本向量化、布尔表达式解析;输出可供 Searcher 使用的结构化查询信息 147 +- **职责**:查询解析与预处理:规范化、语言检测、改写(词典)、翻译、文本向量化;输出可供 Searcher 使用的结构化查询信息(含 search_langs 语言计划)
148 - **原则**:翻译/向量通过 `providers` 获取,不直接依赖具体服务 URL 或实现;支持按配置关闭翻译/向量(如短查询、typing 场景)。 148 - **原则**:翻译/向量通过 `providers` 获取,不直接依赖具体服务 URL 或实现;支持按配置关闭翻译/向量(如短查询、typing 场景)。
149 149
150 ### 4.5 search 150 ### 4.5 search
@@ -241,7 +241,7 @@ docs/ # 文档(含本指南) @@ -241,7 +241,7 @@ docs/ # 文档(含本指南)
241 241
242 ### 6.1 主配置文件 242 ### 6.1 主配置文件
243 243
244 -- **config/config.yaml**:搜索行为(field_boosts、indexes、query_config、ranking、function_score、rerank 融合参数)、SPU 配置、**services**(翻译/向量/重排的 provider 与 backends)、tenant_config 等。 244 +- **config/config.yaml**:搜索行为(field_boosts、query_config.search_fields、query_config.text_query_strategy、ranking、function_score、rerank 融合参数)、SPU 配置、**services**(翻译/向量/重排的 provider 与 backends)、tenant_config 等。
245 - **.env**:敏感信息与部署态变量(DB、ES、Redis、API Key、端口等);不提交敏感值,可提供 `.env.example` 模板。 245 - **.env**:敏感信息与部署态变量(DB、ES、Redis、API Key、端口等);不提交敏感值,可提供 `.env.example` 模板。
246 246
247 ### 6.2 services 块结构(能力统一约定) 247 ### 6.2 services 块结构(能力统一约定)
docs/QUICKSTART.md
@@ -329,7 +329,7 @@ saas-search 以 MySQL 中的店匠标准表为权威数据源: @@ -329,7 +329,7 @@ saas-search 以 MySQL 中的店匠标准表为权威数据源:
329 329
330 - **统一索引结构**:所有租户使用同一套 mapping(按租户数据分索引名 + 文档内 `tenant_id` 隔离) 330 - **统一索引结构**:所有租户使用同一套 mapping(按租户数据分索引名 + 文档内 `tenant_id` 隔离)
331 - **SPU 级索引**:每个文档是一个 SPU,包含嵌套 `skus`、`specifications` 331 - **SPU 级索引**:每个文档是一个 SPU,包含嵌套 `skus`、`specifications`
332 -- **配置文件驱动**:搜索权重、搜索域、重排融合、provider 全在 `config/config.yaml`,不再以“硬编码配置”为主 332 +- **配置文件驱动**:搜索权重、动态多语言字段、重排融合、provider 全在 `config/config.yaml`,不再以“硬编码配置”为主
333 333
334 ### 2.2 索引结构(Mapping) 334 ### 2.2 索引结构(Mapping)
335 335
@@ -338,7 +338,7 @@ saas-search 以 MySQL 中的店匠标准表为权威数据源: @@ -338,7 +338,7 @@ saas-search 以 MySQL 中的店匠标准表为权威数据源:
338 核心字段可分为: 338 核心字段可分为:
339 339
340 - 标识字段:`tenant_id`, `spu_id` 340 - 标识字段:`tenant_id`, `spu_id`
341 -- 多语言文本:`title.zh/en`, `brief.zh/en`, `description.zh/en`, `vendor.zh/en`, `category_path.zh/en`, `category_name_text.zh/en` 341 +- 多语言文本:`title.<lang>`, `brief.<lang>`, `description.<lang>`, `vendor.<lang>`, `category_path.<lang>`, `category_name_text.<lang>`
342 - 类目过滤:`category1_name`, `category2_name`, `category3_name` 等 342 - 类目过滤:`category1_name`, `category2_name`, `category3_name` 等
343 - 规格/变体:`specifications`(nested)、`skus`(nested) 343 - 规格/变体:`specifications`(nested)、`skus`(nested)
344 - 价格库存:`min_price`, `max_price`, `total_inventory` 等 344 - 价格库存:`min_price`, `max_price`, `total_inventory` 等
@@ -346,8 +346,9 @@ saas-search 以 MySQL 中的店匠标准表为权威数据源: @@ -346,8 +346,9 @@ saas-search 以 MySQL 中的店匠标准表为权威数据源:
346 346
347 ### 2.3 查询、权重、排序(`config/config.yaml`) 347 ### 2.3 查询、权重、排序(`config/config.yaml`)
348 348
349 -- `field_boosts`:字段权重(如标题、品牌、类目)  
350 -- `indexes`:搜索域(default/title/vendor/category/tags) 349 +- `field_boosts`:字段权重(统一按字段基名配置,运行时按 `.{lang}` 动态组装)
  350 +- `query_config.search_fields`:动态多语言检索字段(multilingual/shared/core)
  351 +- `query_config.text_query_strategy`:文本召回策略参数(minimum_should_match、翻译boost等)
351 - `query_config`:语言、embedding 开关、source_fields、knn_boost、翻译提示词等 352 - `query_config`:语言、embedding 开关、source_fields、knn_boost、翻译提示词等
352 - `ranking.expression`:融合表达式(例如 `bm25() + 0.25*text_embedding_relevance()`) 353 - `ranking.expression`:融合表达式(例如 `bm25() + 0.25*text_embedding_relevance()`)
353 - `function_score`:ES 层加权函数 354 - `function_score`:ES 层加权函数
@@ -364,7 +365,7 @@ saas-search 以 MySQL 中的店匠标准表为权威数据源: @@ -364,7 +365,7 @@ saas-search 以 MySQL 中的店匠标准表为权威数据源:
364 | 修改项 | 操作 | 365 | 修改项 | 操作 |
365 |--------|------| 366 |--------|------|
366 | 索引结构(mapping) | 修改 `mappings/search_products.json` → `./scripts/create_tenant_index.sh <tenant_id>` → 重新导入 | 367 | 索引结构(mapping) | 修改 `mappings/search_products.json` → `./scripts/create_tenant_index.sh <tenant_id>` → 重新导入 |
367 -| 搜索/权重/排序/重排 | 修改 `config/config.yaml` 对应块 | 368 +| 搜索字段/权重/排序/重排 | 修改 `config/config.yaml` 对应块 |
368 | provider 与服务 URL | 修改 `config/config.yaml` 的 `services` 块,或用环境变量覆盖 | 369 | provider 与服务 URL | 修改 `config/config.yaml` 的 `services` 块,或用环境变量覆盖 |
369 370
370 --- 371 ---
docs/搜索API对接指南.md
@@ -18,10 +18,9 @@ @@ -18,10 +18,9 @@
18 - 3.3 [过滤器详解](#33-过滤器详解) 18 - 3.3 [过滤器详解](#33-过滤器详解)
19 - 3.4 [分面配置](#34-分面配置) 19 - 3.4 [分面配置](#34-分面配置)
20 - 3.5 [SKU筛选维度](#35-sku筛选维度) 20 - 3.5 [SKU筛选维度](#35-sku筛选维度)
21 - - 3.6 [布尔表达式语法](#36-布尔表达式语法)  
22 - - 3.7 [搜索建议接口](#37-搜索建议接口)  
23 - - 3.8 [即时搜索接口](#38-即时搜索接口)  
24 - - 3.9 [获取单个文档](#39-获取单个文档) 21 + - 3.6 [搜索建议接口](#37-搜索建议接口)
  22 + - 3.7 [即时搜索接口](#38-即时搜索接口)
  23 + - 3.8 [获取单个文档](#39-获取单个文档)
25 24
26 4. [响应格式说明](#响应格式说明) 25 4. [响应格式说明](#响应格式说明)
27 - 4.1 [标准响应结构](#41-标准响应结构) 26 - 4.1 [标准响应结构](#41-标准响应结构)
@@ -56,8 +55,7 @@ @@ -56,8 +55,7 @@
56 - 8.3 [分面搜索](#83-分面搜索) 55 - 8.3 [分面搜索](#83-分面搜索)
57 - 8.4 [规格过滤与分面](#84-规格过滤与分面) 56 - 8.4 [规格过滤与分面](#84-规格过滤与分面)
58 - 8.5 [SKU筛选](#85-sku筛选) 57 - 8.5 [SKU筛选](#85-sku筛选)
59 - - 8.6 [布尔表达式搜索](#86-布尔表达式搜索)  
60 - - 8.7 [分页查询](#87-分页查询) 58 + - 8.7 [分页查询](#87-分页查询)
61 59
62 9. [数据模型](#9-数据模型) 60 9. [数据模型](#9-数据模型)
63 - 9.1 [商品字段定义](#91-商品字段定义) 61 - 9.1 [商品字段定义](#91-商品字段定义)
@@ -167,7 +165,7 @@ curl -X POST &quot;http://43.166.252.75:6002/search/&quot; \ @@ -167,7 +165,7 @@ curl -X POST &quot;http://43.166.252.75:6002/search/&quot; \
167 ### 3.1 接口信息 165 ### 3.1 接口信息
168 166
169 - **端点**: `POST /search/` 167 - **端点**: `POST /search/`
170 -- **描述**: 执行文本搜索查询,支持多语言、布尔表达式、过滤器和分面搜索 168 +- **描述**: 执行文本搜索查询,支持多语言、过滤器和分面搜索
171 - **租户标识**:`tenant_id` 通过 HTTP 请求头 **`X-Tenant-ID`** 传递(推荐);也可通过 URL query 参数 **`tenant_id`** 传递。**不要放在请求体中。** 169 - **租户标识**:`tenant_id` 通过 HTTP 请求头 **`X-Tenant-ID`** 传递(推荐);也可通过 URL query 参数 **`tenant_id`** 传递。**不要放在请求体中。**
172 170
173 **请求示例(推荐)**: 171 **请求示例(推荐)**:
@@ -210,7 +208,7 @@ response = requests.post(url, headers=headers, json={&quot;query&quot;: &quot;芭比娃娃&quot;}) @@ -210,7 +208,7 @@ response = requests.post(url, headers=headers, json={&quot;query&quot;: &quot;芭比娃娃&quot;})
210 208
211 | 参数 | 类型 | 必填 | 默认值 | 说明 | 209 | 参数 | 类型 | 必填 | 默认值 | 说明 |
212 |------|------|------|--------|------| 210 |------|------|------|--------|------|
213 -| `query` | string | Y | - | 搜索查询字符串,支持布尔表达式(AND, OR, RANK, ANDNOT) | 211 +| `query` | string | Y | - | 搜索查询字符串(统一文本检索策略) |
214 | `size` | integer | N | 10 | 返回结果数量(1-100) | 212 | `size` | integer | N | 10 | 返回结果数量(1-100) |
215 | `from` | integer | N | 0 | 分页偏移量(用于分页) | 213 | `from` | integer | N | 0 | 分页偏移量(用于分页) |
216 | `language` | string | N | "zh" | 返回语言:`zh`(中文)或 `en`(英文)。后端会根据此参数选择对应的中英文字段返回 | 214 | `language` | string | N | "zh" | 返回语言:`zh`(中文)或 `en`(英文)。后端会根据此参数选择对应的中英文字段返回 |
@@ -544,36 +542,6 @@ response = requests.post(url, headers=headers, json={&quot;query&quot;: &quot;芭比娃娃&quot;}) @@ -544,36 +542,6 @@ response = requests.post(url, headers=headers, json={&quot;query&quot;: &quot;芭比娃娃&quot;})
544 } 542 }
545 ``` 543 ```
546 544
547 -### 3.6 布尔表达式语法  
548 -  
549 -搜索查询支持布尔表达式,提供更灵活的搜索能力。  
550 -  
551 -**支持的操作符**:  
552 -  
553 -| 操作符 | 描述 | 示例 |  
554 -|--------|------|------|  
555 -| `AND` | 所有词必须匹配 | `玩具 AND 乐高` |  
556 -| `OR` | 任意词匹配 | `芭比 OR 娃娃` |  
557 -| `ANDNOT` | 排除特定词 | `玩具 ANDNOT 电动` |  
558 -| `RANK` | 排序加权(不强制匹配) | `玩具 RANK 乐高` |  
559 -| `()` | 分组 | `玩具 AND (乐高 OR 芭比)` |  
560 -  
561 -**操作符优先级**(从高到低):  
562 -1. `()` - 括号  
563 -2. `ANDNOT` - 排除  
564 -3. `AND` - 与  
565 -4. `OR` - 或  
566 -5. `RANK` - 排序  
567 -  
568 -**示例**:  
569 -```  
570 -"芭比娃娃" // 简单查询  
571 -"玩具 AND 乐高" // AND 查询  
572 -"芭比 OR 娃娃" // OR 查询  
573 -"玩具 ANDNOT 电动" // 排除查询  
574 -"玩具 AND (乐高 OR 芭比)" // 复杂查询  
575 -```  
576 -  
577 ### 3.7 搜索建议接口 545 ### 3.7 搜索建议接口
578 546
579 - **端点**: `GET /search/suggestions` 547 - **端点**: `GET /search/suggestions`
@@ -2020,17 +1988,6 @@ curl "http://localhost:6006/health" @@ -2020,17 +1988,6 @@ curl "http://localhost:6006/health"
2020 - 每个SPU下,每种颜色只会返回第一个SKU 1988 - 每个SPU下,每种颜色只会返回第一个SKU
2021 - 如果维度不匹配,返回所有SKU(不进行过滤) 1989 - 如果维度不匹配,返回所有SKU(不进行过滤)
2022 1990
2023 -### 8.6 布尔表达式搜索  
2024 -  
2025 -**需求**: 搜索包含"手机"和"智能"的商品,排除"二手"  
2026 -  
2027 -```json  
2028 -{  
2029 - "query": "手机 AND 智能 ANDNOT 二手",  
2030 - "size": 20  
2031 -}  
2032 -```  
2033 -  
2034 ### 8.7 分页查询 1991 ### 8.7 分页查询
2035 1992
2036 **需求**: 获取第2页结果(每页20条) 1993 **需求**: 获取第2页结果(每页20条)
docs/搜索API速查表.md
@@ -165,18 +165,6 @@ POST /search/ @@ -165,18 +165,6 @@ POST /search/
165 165
166 --- 166 ---
167 167
168 -## 布尔表达式  
169 -  
170 -```bash  
171 -{  
172 - "query": "玩具 AND (乐高 OR 芭比) ANDNOT 电动"  
173 -}  
174 -```  
175 -  
176 -**操作符优先级**: `()` > `ANDNOT` > `AND` > `OR` > `RANK`  
177 -  
178 ----  
179 -  
180 ## 分页 168 ## 分页
181 169
182 ```bash 170 ```bash
query/language_detector.py
1 """ 1 """
2 Language detection utility. 2 Language detection utility.
3 3
4 -Detects the language of a query string. 4 +Detects language of short e-commerce queries with script checks + lightweight
  5 +Latin-language scoring (de/fr/es/it/pt/nl/en).
5 """ 6 """
6 7
7 -from typing import Optional 8 +from typing import Dict, List
8 import re 9 import re
9 10
10 11
11 class LanguageDetector: 12 class LanguageDetector:
12 - """Simple rule-based language detector for common e-commerce languages."""  
13 -  
14 - # Unicode ranges for different scripts  
15 - CJK_RANGES = [  
16 - (0x4E00, 0x9FFF), # CJK Unified Ideographs  
17 - (0x3400, 0x4DBF), # CJK Extension A  
18 - (0x20000, 0x2A6DF), # CJK Extension B  
19 - (0x3040, 0x309F), # Hiragana  
20 - (0x30A0, 0x30FF), # Katakana  
21 - ]  
22 -  
23 - CYRILLIC_RANGE = (0x0400, 0x04FF)  
24 - ARABIC_RANGE = (0x0600, 0x06FF)  
25 - LATIN_RANGE = (0x0041, 0x007A) 13 + """Rule-based language detector for common e-commerce query languages."""
26 14
27 def __init__(self): 15 def __init__(self):
28 - """Initialize language detector."""  
29 - self.chinese_pattern = re.compile(r'[\u4e00-\u9fff]+')  
30 - self.russian_pattern = re.compile(r'[\u0400-\u04ff]+')  
31 - self.arabic_pattern = re.compile(r'[\u0600-\u06ff]+')  
32 - self.japanese_pattern = re.compile(r'[\u3040-\u309f\u30a0-\u30ff]+') 16 + self._re_zh = re.compile(r"[\u4e00-\u9fff]")
  17 + self._re_ja_kana = re.compile(r"[\u3040-\u30ff]")
  18 + self._re_ko = re.compile(r"[\uac00-\ud7af]")
  19 + self._re_ru = re.compile(r"[\u0400-\u04ff]")
  20 + self._re_ar = re.compile(r"[\u0600-\u06ff]")
  21 + self._re_hi = re.compile(r"[\u0900-\u097f]")
  22 + self._re_he = re.compile(r"[\u0590-\u05ff]")
  23 + self._re_th = re.compile(r"[\u0e00-\u0e7f]")
  24 + self._re_latin_word = re.compile(r"[A-Za-zÀ-ÖØ-öø-ÿ]+")
  25 +
  26 + # Stopwords + e-commerce terms for Latin-family disambiguation.
  27 + self._latin_lexicons: Dict[str, set] = {
  28 + "en": {
  29 + "the", "and", "for", "with", "new", "women", "men", "kids",
  30 + "shoe", "shoes", "dress", "shirt", "jacket", "bag", "wireless",
  31 + },
  32 + "de": {
  33 + "der", "die", "das", "und", "mit", "für", "damen", "herren",
  34 + "kinder", "schuhe", "kleid", "hemd", "jacke", "tasche",
  35 + },
  36 + "fr": {
  37 + "le", "la", "les", "et", "avec", "pour", "femme", "homme",
  38 + "enfant", "chaussures", "robe", "chemise", "veste", "sac",
  39 + },
  40 + "es": {
  41 + "el", "la", "los", "las", "y", "con", "para", "mujer", "hombre",
  42 + "niño", "niña", "zapatos", "vestido", "camisa", "chaqueta", "bolso",
  43 + },
  44 + "it": {
  45 + "il", "lo", "la", "gli", "le", "e", "con", "per", "donna", "uomo",
  46 + "bambino", "scarpe", "abito", "camicia", "giacca", "borsa",
  47 + },
  48 + "pt": {
  49 + "o", "a", "os", "as", "e", "com", "para", "mulher", "homem",
  50 + "criança", "sapatos", "vestido", "camisa", "jaqueta", "bolsa",
  51 + },
  52 + "nl": {
  53 + "de", "het", "en", "met", "voor", "dames", "heren", "kinderen",
  54 + "schoenen", "jurk", "overhemd", "jas", "tas",
  55 + },
  56 + }
  57 + self._diacritic_weights: Dict[str, Dict[str, int]] = {
  58 + "de": {"ä": 3, "ö": 3, "ü": 3, "ß": 4},
  59 + "fr": {"é": 2, "è": 2, "ê": 2, "à": 2, "ç": 2, "ù": 2, "ô": 2},
  60 + "es": {"ñ": 3, "á": 2, "é": 2, "í": 2, "ó": 2, "ú": 2},
  61 + "it": {"à": 2, "è": 2, "é": 2, "ì": 2, "ò": 2, "ù": 2},
  62 + "pt": {"ã": 3, "õ": 3, "ç": 2, "á": 2, "â": 2, "ê": 2, "ô": 2},
  63 + "nl": {"ij": 2},
  64 + }
33 65
34 def detect(self, text: str) -> str: 66 def detect(self, text: str) -> str:
35 """ 67 """
36 - Detect language of text.  
37 -  
38 - Args:  
39 - text: Input text 68 + Detect language code for text.
40 69
41 - Returns:  
42 - Language code: 'zh', 'en', 'ru', 'ar', 'ja', or 'unknown' 70 + Returns one of: zh/en/ru/ar/ja/ko/de/fr/es/it/pt/nl/hi/he/th/unknown
43 """ 71 """
44 if not text or not text.strip(): 72 if not text or not text.strip():
45 - return 'unknown'  
46 -  
47 - text = text.strip()  
48 -  
49 - # Count characters in each script  
50 - char_counts = {  
51 - 'chinese': 0,  
52 - 'russian': 0,  
53 - 'arabic': 0,  
54 - 'japanese': 0,  
55 - 'latin': 0  
56 - }  
57 -  
58 - for char in text:  
59 - code_point = ord(char)  
60 -  
61 - # Check CJK (Chinese/Japanese)  
62 - is_cjk = any(start <= code_point <= end for start, end in self.CJK_RANGES)  
63 - if is_cjk:  
64 - char_counts['chinese'] += 1  
65 -  
66 - # Check Hiragana/Katakana (Japanese)  
67 - if 0x3040 <= code_point <= 0x30FF:  
68 - char_counts['japanese'] += 1  
69 -  
70 - # Check Cyrillic (Russian)  
71 - if self.CYRILLIC_RANGE[0] <= code_point <= self.CYRILLIC_RANGE[1]:  
72 - char_counts['russian'] += 1  
73 -  
74 - # Check Arabic  
75 - if self.ARABIC_RANGE[0] <= code_point <= self.ARABIC_RANGE[1]:  
76 - char_counts['arabic'] += 1  
77 -  
78 - # Check Latin  
79 - if (0x0041 <= code_point <= 0x005A) or (0x0061 <= code_point <= 0x007A):  
80 - char_counts['latin'] += 1  
81 -  
82 - # Determine dominant script  
83 - total_chars = sum(char_counts.values())  
84 - if total_chars == 0:  
85 - return 'unknown'  
86 -  
87 - # Calculate percentages  
88 - percentages = {  
89 - script: count / total_chars  
90 - for script, count in char_counts.items()  
91 - }  
92 -  
93 - # Japanese has both Hiragana/Katakana and CJK  
94 - if percentages['japanese'] > 0.1:  
95 - return 'ja'  
96 -  
97 - # Russian (Cyrillic)  
98 - if percentages['russian'] > 0.5:  
99 - return 'ru'  
100 -  
101 - # Arabic  
102 - if percentages['arabic'] > 0.5:  
103 - return 'ar'  
104 -  
105 - # Chinese (CJK without Japanese kana)  
106 - if percentages['chinese'] > 0.3:  
107 - return 'zh'  
108 -  
109 - # English/Latin  
110 - if percentages['latin'] > 0.5:  
111 - return 'en'  
112 -  
113 - return 'unknown' 73 + return "unknown"
  74 + q = text.strip().lower()
  75 +
  76 + # Script-first detection for non-Latin languages.
  77 + if self._re_ja_kana.search(q):
  78 + return "ja"
  79 + if self._re_ko.search(q):
  80 + return "ko"
  81 + if self._re_zh.search(q):
  82 + return "zh"
  83 + if self._re_ru.search(q):
  84 + return "ru"
  85 + if self._re_ar.search(q):
  86 + return "ar"
  87 + if self._re_hi.search(q):
  88 + return "hi"
  89 + if self._re_he.search(q):
  90 + return "he"
  91 + if self._re_th.search(q):
  92 + return "th"
  93 +
  94 + # Latin-family scoring.
  95 + tokens = self._re_latin_word.findall(q)
  96 + if not tokens:
  97 + return "unknown"
  98 +
  99 + scores: Dict[str, float] = {k: 0.0 for k in self._latin_lexicons.keys()}
  100 + scores["en"] = scores.get("en", 0.0)
  101 + token_set = set(tokens)
  102 +
  103 + # Lexicon matches
  104 + for lang, lex in self._latin_lexicons.items():
  105 + overlap = len(token_set & lex)
  106 + if overlap:
  107 + scores[lang] += overlap * 2.0
  108 +
  109 + # Diacritics / orthographic hints
  110 + for lang, hints in self._diacritic_weights.items():
  111 + for marker, weight in hints.items():
  112 + if marker in q:
  113 + scores[lang] += weight
  114 +
  115 + # Light suffix hints for common product words
  116 + for t in tokens:
  117 + if t.endswith("ung") or t.endswith("chen"):
  118 + scores["de"] += 0.6
  119 + if t.endswith("ción") or t.endswith("ado") or t.endswith("ada"):
  120 + scores["es"] += 0.6
  121 + if t.endswith("zione") or t.endswith("etto") or t.endswith("ella"):
  122 + scores["it"] += 0.6
  123 + if t.endswith("ção") or t.endswith("mente"):
  124 + scores["pt"] += 0.6
  125 + if t.endswith("ment") or t.endswith("eau"):
  126 + scores["fr"] += 0.5
  127 +
  128 + # Fallback preference: English for pure Latin short tokens.
  129 + scores["en"] += 0.2
  130 +
  131 + best_lang = max(scores.items(), key=lambda x: x[1])[0]
  132 + best_score = scores[best_lang]
  133 + if best_score <= 0:
  134 + return "en"
  135 + return best_lang
114 136
115 def is_chinese(self, text: str) -> bool: 137 def is_chinese(self, text: str) -> bool:
116 - """Check if text is primarily Chinese."""  
117 - return self.detect(text) == 'zh' 138 + return self.detect(text) == "zh"
118 139
119 def is_english(self, text: str) -> bool: 140 def is_english(self, text: str) -> bool:
120 - """Check if text is primarily English."""  
121 - return self.detect(text) == 'en' 141 + return self.detect(text) == "en"
122 142
123 def is_russian(self, text: str) -> bool: 143 def is_russian(self, text: str) -> bool:
124 - """Check if text is primarily Russian."""  
125 - return self.detect(text) == 'ru' 144 + return self.detect(text) == "ru"
126 145
127 def is_arabic(self, text: str) -> bool: 146 def is_arabic(self, text: str) -> bool:
128 - """Check if text is primarily Arabic."""  
129 - return self.detect(text) == 'ar' 147 + return self.detect(text) == "ar"
130 148
131 def is_japanese(self, text: str) -> bool: 149 def is_japanese(self, text: str) -> bool:
132 - """Check if text is primarily Japanese."""  
133 - return self.detect(text) == 'ja' 150 + return self.detect(text) == "ja"
query/query_parser.py
@@ -37,7 +37,11 @@ class ParsedQuery: @@ -37,7 +37,11 @@ class ParsedQuery:
37 domain: str = "default", 37 domain: str = "default",
38 keywords: str = "", 38 keywords: str = "",
39 token_count: int = 0, 39 token_count: int = 0,
40 - query_tokens: Optional[List[str]] = None 40 + query_tokens: Optional[List[str]] = None,
  41 + query_text_by_lang: Optional[Dict[str, str]] = None,
  42 + search_langs: Optional[List[str]] = None,
  43 + index_languages: Optional[List[str]] = None,
  44 + source_in_index_languages: bool = True,
41 ): 45 ):
42 self.original_query = original_query 46 self.original_query = original_query
43 self.query_normalized = query_normalized 47 self.query_normalized = query_normalized
@@ -50,6 +54,10 @@ class ParsedQuery: @@ -50,6 +54,10 @@ class ParsedQuery:
50 self.keywords = keywords 54 self.keywords = keywords
51 self.token_count = token_count 55 self.token_count = token_count
52 self.query_tokens = query_tokens or [] 56 self.query_tokens = query_tokens or []
  57 + self.query_text_by_lang = query_text_by_lang or {}
  58 + self.search_langs = search_langs or []
  59 + self.index_languages = index_languages or []
  60 + self.source_in_index_languages = bool(source_in_index_languages)
53 61
54 def to_dict(self) -> Dict[str, Any]: 62 def to_dict(self) -> Dict[str, Any]:
55 """Convert to dictionary representation.""" 63 """Convert to dictionary representation."""
@@ -61,6 +69,10 @@ class ParsedQuery: @@ -61,6 +69,10 @@ class ParsedQuery:
61 "translations": self.translations, 69 "translations": self.translations,
62 "domain": self.domain 70 "domain": self.domain
63 } 71 }
  72 + result["query_text_by_lang"] = self.query_text_by_lang
  73 + result["search_langs"] = self.search_langs
  74 + result["index_languages"] = self.index_languages
  75 + result["source_in_index_languages"] = self.source_in_index_languages
64 return result 76 return result
65 77
66 78
@@ -253,12 +265,21 @@ class QueryParser: @@ -253,12 +265,21 @@ class QueryParser:
253 # Stage 4: Translation (with async support and conditional waiting) 265 # Stage 4: Translation (with async support and conditional waiting)
254 translations = {} 266 translations = {}
255 translation_futures = {} 267 translation_futures = {}
  268 + index_langs = ["en", "zh"]
256 try: 269 try:
257 # 根据租户配置的 index_languages 决定翻译目标语言 270 # 根据租户配置的 index_languages 决定翻译目标语言
258 from config.tenant_config_loader import get_tenant_config_loader 271 from config.tenant_config_loader import get_tenant_config_loader
259 tenant_loader = get_tenant_config_loader() 272 tenant_loader = get_tenant_config_loader()
260 tenant_cfg = tenant_loader.get_tenant_config(tenant_id or "default") 273 tenant_cfg = tenant_loader.get_tenant_config(tenant_id or "default")
261 - index_langs = tenant_cfg.get("index_languages") or ["en", "zh"] 274 + raw_index_langs = tenant_cfg.get("index_languages") or ["en", "zh"]
  275 + index_langs = []
  276 + seen_langs = set()
  277 + for lang in raw_index_langs:
  278 + norm_lang = str(lang or "").strip().lower()
  279 + if not norm_lang or norm_lang in seen_langs:
  280 + continue
  281 + seen_langs.add(norm_lang)
  282 + index_langs.append(norm_lang)
262 283
263 target_langs_for_translation = [lang for lang in index_langs if lang != detected_lang] 284 target_langs_for_translation = [lang for lang in index_langs if lang != detected_lang]
264 285
@@ -269,8 +290,12 @@ class QueryParser: @@ -269,8 +290,12 @@ class QueryParser:
269 # Use e-commerce context for better disambiguation 290 # Use e-commerce context for better disambiguation
270 translation_context = self.config.query_config.translation_context 291 translation_context = self.config.query_config.translation_context
271 # For query translation, we use a general prompt (not language-specific) 292 # For query translation, we use a general prompt (not language-specific)
272 - query_prompt = self.config.query_config.translation_prompts.get('query_zh') or \  
273 - self.config.query_config.translation_prompts.get('default_zh') 293 + query_prompt = (
  294 + self.config.query_config.translation_prompts.get(f"query_{detected_lang}")
  295 + or self.config.query_config.translation_prompts.get("query_en")
  296 + or self.config.query_config.translation_prompts.get("default_en")
  297 + or self.config.query_config.translation_prompts.get("default_zh")
  298 + )
274 299
275 # Determine if we need to wait for translation results 300 # Determine if we need to wait for translation results
276 # If detected_lang is not in index_languages, we must wait for translation 301 # If detected_lang is not in index_languages, we must wait for translation
@@ -417,6 +442,33 @@ class QueryParser: @@ -417,6 +442,33 @@ class QueryParser:
417 # Update translations in context after all are complete 442 # Update translations in context after all are complete
418 if translations and context: 443 if translations and context:
419 context.store_intermediate_result('translations', translations) 444 context.store_intermediate_result('translations', translations)
  445 +
  446 + # Build language-scoped query plan: source language + available translations
  447 + query_text_by_lang: Dict[str, str] = {}
  448 + if query_text:
  449 + query_text_by_lang[detected_lang] = query_text
  450 + for lang, translated_text in (translations or {}).items():
  451 + if translated_text and str(translated_text).strip():
  452 + query_text_by_lang[str(lang).strip().lower()] = str(translated_text)
  453 +
  454 + source_in_index_languages = detected_lang in index_langs
  455 + ordered_search_langs: List[str] = []
  456 + seen_order = set()
  457 + if detected_lang in query_text_by_lang:
  458 + ordered_search_langs.append(detected_lang)
  459 + seen_order.add(detected_lang)
  460 + for lang in index_langs:
  461 + if lang in query_text_by_lang and lang not in seen_order:
  462 + ordered_search_langs.append(lang)
  463 + seen_order.add(lang)
  464 + for lang in query_text_by_lang.keys():
  465 + if lang not in seen_order:
  466 + ordered_search_langs.append(lang)
  467 + seen_order.add(lang)
  468 +
  469 + if context:
  470 + context.store_intermediate_result("search_langs", ordered_search_langs)
  471 + context.store_intermediate_result("query_text_by_lang", query_text_by_lang)
420 472
421 # Build result 473 # Build result
422 result = ParsedQuery( 474 result = ParsedQuery(
@@ -429,7 +481,11 @@ class QueryParser: @@ -429,7 +481,11 @@ class QueryParser:
429 domain=domain, 481 domain=domain,
430 keywords=keywords, 482 keywords=keywords,
431 token_count=token_count, 483 token_count=token_count,
432 - query_tokens=query_tokens 484 + query_tokens=query_tokens,
  485 + query_text_by_lang=query_text_by_lang,
  486 + search_langs=ordered_search_langs,
  487 + index_languages=index_langs,
  488 + source_in_index_languages=source_in_index_languages,
433 ) 489 )
434 490
435 if context and hasattr(context, 'logger'): 491 if context and hasattr(context, 'logger'):
query/query_rewriter.py
@@ -19,7 +19,7 @@ class QueryRewriter: @@ -19,7 +19,7 @@ class QueryRewriter:
19 19
20 Args: 20 Args:
21 rewrite_dict: Dictionary mapping exact query terms to rewrite expressions 21 rewrite_dict: Dictionary mapping exact query terms to rewrite expressions
22 - e.g., {"芭比": "brand:芭比 OR name:芭比娃娃"} 22 + e.g., {"芭比": "芭比娃娃"}
23 Only full word matches will be rewritten, no partial matching. 23 Only full word matches will be rewritten, no partial matching.
24 """ 24 """
25 self.rewrite_dict = rewrite_dict or {} 25 self.rewrite_dict = rewrite_dict or {}
@@ -107,13 +107,13 @@ class QueryNormalizer: @@ -107,13 +107,13 @@ class QueryNormalizer:
107 return query 107 return query
108 108
109 @staticmethod 109 @staticmethod
110 - def remove_punctuation(query: str, keep_operators: bool = True) -> str: 110 + def remove_punctuation(query: str, keep_operators: bool = False) -> str:
111 """ 111 """
112 Remove punctuation from query. 112 Remove punctuation from query.
113 113
114 Args: 114 Args:
115 query: Original query 115 query: Original query
116 - keep_operators: Whether to keep boolean operators (AND, OR, etc.) 116 + keep_operators: Whether to keep symbols used in old query syntax.
117 117
118 Returns: 118 Returns:
119 Query without punctuation 119 Query without punctuation
search/__init__.py
1 """Search package initialization.""" 1 """Search package initialization."""
2 2
3 -from .boolean_parser import BooleanParser, QueryNode  
4 from .es_query_builder import ESQueryBuilder 3 from .es_query_builder import ESQueryBuilder
5 from .searcher import Searcher, SearchResult 4 from .searcher import Searcher, SearchResult
6 5
7 __all__ = [ 6 __all__ = [
8 - 'BooleanParser',  
9 - 'QueryNode',  
10 'ESQueryBuilder', 7 'ESQueryBuilder',
11 'Searcher', 8 'Searcher',
12 'SearchResult', 9 'SearchResult',
search/boolean_parser.py deleted
@@ -1,201 +0,0 @@ @@ -1,201 +0,0 @@
1 -"""  
2 -Boolean expression parser for search queries.  
3 -  
4 -Supports: AND, OR, RANK, ANDNOT operators with parentheses.  
5 -Precedence (high to low): (), ANDNOT, AND, OR, RANK  
6 -"""  
7 -  
8 -import re  
9 -from typing import List, Tuple, Optional  
10 -from dataclasses import dataclass  
11 -  
12 -  
13 -@dataclass  
14 -class QueryNode:  
15 - """Represents a node in the parsed query tree."""  
16 - operator: str # 'AND', 'OR', 'RANK', 'ANDNOT', 'TERM'  
17 - terms: List['QueryNode'] = None # Child nodes for operators  
18 - value: str = None # Value for leaf nodes (TERM)  
19 -  
20 - def __repr__(self):  
21 - if self.operator == 'TERM':  
22 - return f"TERM({self.value})"  
23 - else:  
24 - return f"{self.operator}({', '.join(str(t) for t in self.terms)})"  
25 -  
26 -  
27 -class BooleanParser:  
28 - """  
29 - Parser for boolean search expressions.  
30 -  
31 - Operator precedence (high to low):  
32 - 1. () - Parentheses  
33 - 2. ANDNOT - AND NOT (exclusion)  
34 - 3. AND - All terms must match  
35 - 4. OR - Any term must match  
36 - 5. RANK - Scoring boost (like OR but affects ranking)  
37 - """  
38 -  
39 - OPERATORS = {'AND', 'OR', 'RANK', 'ANDNOT'}  
40 - PRECEDENCE = {  
41 - 'ANDNOT': 3,  
42 - 'AND': 2,  
43 - 'OR': 1,  
44 - 'RANK': 0  
45 - }  
46 -  
47 - def __init__(self):  
48 - """Initialize boolean parser."""  
49 - pass  
50 -  
51 - def parse(self, expression: str) -> QueryNode:  
52 - """  
53 - Parse boolean expression into query tree.  
54 -  
55 - Args:  
56 - expression: Boolean expression string  
57 - Example: "laptop AND (gaming OR professional) ANDNOT cheap"  
58 -  
59 - Returns:  
60 - Root QueryNode of parsed tree  
61 - """  
62 - if not expression or not expression.strip():  
63 - return QueryNode(operator='TERM', value='')  
64 -  
65 - # Tokenize  
66 - tokens = self._tokenize(expression)  
67 -  
68 - if not tokens:  
69 - return QueryNode(operator='TERM', value='')  
70 -  
71 - # Parse with precedence  
72 - return self._parse_expression(tokens)  
73 -  
74 - def _tokenize(self, expression: str) -> List[str]:  
75 - """  
76 - Tokenize expression into terms and operators.  
77 -  
78 - Args:  
79 - expression: Expression string  
80 -  
81 - Returns:  
82 - List of tokens  
83 - """  
84 - # Pattern to match: operators, parentheses, or terms (with domain prefix support)  
85 - pattern = r'\b(AND|OR|RANK|ANDNOT)\b|[()]|(?:\w+:)?[^\s()]+'  
86 -  
87 - tokens = []  
88 - for match in re.finditer(pattern, expression):  
89 - token = match.group().strip()  
90 - if token:  
91 - tokens.append(token)  
92 -  
93 - return tokens  
94 -  
95 - def _parse_expression(self, tokens: List[str], start: int = 0) -> Tuple[QueryNode, int]:  
96 - """  
97 - Parse expression with operator precedence.  
98 -  
99 - Args:  
100 - tokens: List of tokens  
101 - start: Starting index  
102 -  
103 - Returns:  
104 - Tuple of (QueryNode, next_index)  
105 - """  
106 - # Start with lowest precedence (RANK)  
107 - return self._parse_rank(tokens, start)  
108 -  
109 - def _parse_rank(self, tokens: List[str], start: int) -> Tuple[QueryNode, int]:  
110 - """Parse RANK operator (lowest precedence)."""  
111 - left, pos = self._parse_or(tokens, start)  
112 -  
113 - while pos < len(tokens) and tokens[pos] == 'RANK':  
114 - pos += 1 # Skip 'RANK'  
115 - right, pos = self._parse_or(tokens, pos)  
116 - left = QueryNode(operator='RANK', terms=[left, right])  
117 -  
118 - return left, pos  
119 -  
120 - def _parse_or(self, tokens: List[str], start: int) -> Tuple[QueryNode, int]:  
121 - """Parse OR operator."""  
122 - left, pos = self._parse_and(tokens, start)  
123 -  
124 - while pos < len(tokens) and tokens[pos] == 'OR':  
125 - pos += 1 # Skip 'OR'  
126 - right, pos = self._parse_and(tokens, pos)  
127 - left = QueryNode(operator='OR', terms=[left, right])  
128 -  
129 - return left, pos  
130 -  
131 - def _parse_and(self, tokens: List[str], start: int) -> Tuple[QueryNode, int]:  
132 - """Parse AND operator."""  
133 - left, pos = self._parse_andnot(tokens, start)  
134 -  
135 - while pos < len(tokens) and tokens[pos] == 'AND':  
136 - pos += 1 # Skip 'AND'  
137 - right, pos = self._parse_andnot(tokens, pos)  
138 - left = QueryNode(operator='AND', terms=[left, right])  
139 -  
140 - return left, pos  
141 -  
142 - def _parse_andnot(self, tokens: List[str], start: int) -> Tuple[QueryNode, int]:  
143 - """Parse ANDNOT operator (highest precedence)."""  
144 - left, pos = self._parse_primary(tokens, start)  
145 -  
146 - while pos < len(tokens) and tokens[pos] == 'ANDNOT':  
147 - pos += 1 # Skip 'ANDNOT'  
148 - right, pos = self._parse_primary(tokens, pos)  
149 - left = QueryNode(operator='ANDNOT', terms=[left, right])  
150 -  
151 - return left, pos  
152 -  
153 - def _parse_primary(self, tokens: List[str], start: int) -> Tuple[QueryNode, int]:  
154 - """Parse primary expression (terms or parentheses)."""  
155 - if start >= len(tokens):  
156 - return QueryNode(operator='TERM', value=''), start  
157 -  
158 - token = tokens[start]  
159 -  
160 - # Handle parentheses  
161 - if token == '(':  
162 - # Find matching closing parenthesis  
163 - depth = 1  
164 - pos = start + 1  
165 - while pos < len(tokens) and depth > 0:  
166 - if tokens[pos] == '(':  
167 - depth += 1  
168 - elif tokens[pos] == ')':  
169 - depth -= 1  
170 - pos += 1  
171 -  
172 - # Parse contents of parentheses  
173 - inner_tokens = tokens[start + 1:pos - 1]  
174 - if inner_tokens:  
175 - node, _ = self._parse_expression(inner_tokens, 0)  
176 - return node, pos  
177 - else:  
178 - return QueryNode(operator='TERM', value=''), pos  
179 -  
180 - # Handle term  
181 - if token not in self.OPERATORS and token not in ['(', ')']:  
182 - return QueryNode(operator='TERM', value=token), start + 1  
183 -  
184 - # Unexpected token  
185 - return QueryNode(operator='TERM', value=''), start + 1  
186 -  
187 - def is_simple_query(self, expression: str) -> bool:  
188 - """  
189 - Check if query is simple (no boolean operators).  
190 -  
191 - Args:  
192 - expression: Query expression  
193 -  
194 - Returns:  
195 - True if simple query (no operators)  
196 - """  
197 - tokens = self._tokenize(expression)  
198 - for token in tokens:  
199 - if token in self.OPERATORS:  
200 - return False  
201 - return True  
search/es_query_builder.py
@@ -10,7 +10,6 @@ Simplified architecture: @@ -10,7 +10,6 @@ Simplified architecture:
10 10
11 from typing import Dict, Any, List, Optional, Union, Tuple 11 from typing import Dict, Any, List, Optional, Union, Tuple
12 import numpy as np 12 import numpy as np
13 -from .boolean_parser import QueryNode  
14 from config import FunctionScoreConfig 13 from config import FunctionScoreConfig
15 14
16 15
@@ -20,18 +19,31 @@ class ESQueryBuilder: @@ -20,18 +19,31 @@ class ESQueryBuilder:
20 def __init__( 19 def __init__(
21 self, 20 self,
22 match_fields: List[str], 21 match_fields: List[str],
  22 + field_boosts: Optional[Dict[str, float]] = None,
  23 + multilingual_fields: Optional[List[str]] = None,
  24 + shared_fields: Optional[List[str]] = None,
  25 + core_multilingual_fields: Optional[List[str]] = None,
23 text_embedding_field: Optional[str] = None, 26 text_embedding_field: Optional[str] = None,
24 image_embedding_field: Optional[str] = None, 27 image_embedding_field: Optional[str] = None,
25 source_fields: Optional[List[str]] = None, 28 source_fields: Optional[List[str]] = None,
26 function_score_config: Optional[FunctionScoreConfig] = None, 29 function_score_config: Optional[FunctionScoreConfig] = None,
27 default_language: str = "en", 30 default_language: str = "en",
28 - knn_boost: float = 0.25 31 + knn_boost: float = 0.25,
  32 + base_minimum_should_match: str = "75%",
  33 + translation_minimum_should_match: str = "75%",
  34 + translation_boost: float = 0.4,
  35 + translation_boost_when_source_missing: float = 1.0,
  36 + source_boost_when_missing: float = 0.6,
  37 + keywords_boost: float = 0.1,
  38 + enable_phrase_query: bool = True,
  39 + tie_breaker_base_query: float = 0.9,
  40 + tie_breaker_keywords: float = 0.9,
29 ): 41 ):
30 """ 42 """
31 Initialize query builder. 43 Initialize query builder.
32 44
33 Multi-language search (translation-based cross-language recall) is always enabled: 45 Multi-language search (translation-based cross-language recall) is always enabled:
34 - queries are matched against both detected-language and translated zh/en clauses. 46 + queries are matched against detected-language and translated target-language clauses.
35 47
36 Args: 48 Args:
37 match_fields: Fields to search for text matching 49 match_fields: Fields to search for text matching
@@ -43,12 +55,27 @@ class ESQueryBuilder: @@ -43,12 +55,27 @@ class ESQueryBuilder:
43 knn_boost: Boost value for KNN (embedding recall) 55 knn_boost: Boost value for KNN (embedding recall)
44 """ 56 """
45 self.match_fields = match_fields 57 self.match_fields = match_fields
  58 + self.field_boosts = field_boosts or {}
  59 + self.multilingual_fields = multilingual_fields or [
  60 + "title", "brief", "description", "vendor", "category_path", "category_name_text"
  61 + ]
  62 + self.shared_fields = shared_fields or ["tags", "option1_values", "option2_values", "option3_values"]
  63 + self.core_multilingual_fields = core_multilingual_fields or ["title", "brief", "vendor", "category_name_text"]
46 self.text_embedding_field = text_embedding_field 64 self.text_embedding_field = text_embedding_field
47 self.image_embedding_field = image_embedding_field 65 self.image_embedding_field = image_embedding_field
48 self.source_fields = source_fields 66 self.source_fields = source_fields
49 self.function_score_config = function_score_config 67 self.function_score_config = function_score_config
50 self.default_language = default_language 68 self.default_language = default_language
51 self.knn_boost = knn_boost 69 self.knn_boost = knn_boost
  70 + self.base_minimum_should_match = base_minimum_should_match
  71 + self.translation_minimum_should_match = translation_minimum_should_match
  72 + self.translation_boost = float(translation_boost)
  73 + self.translation_boost_when_source_missing = float(translation_boost_when_source_missing)
  74 + self.source_boost_when_missing = float(source_boost_when_missing)
  75 + self.keywords_boost = float(keywords_boost)
  76 + self.enable_phrase_query = bool(enable_phrase_query)
  77 + self.tie_breaker_base_query = float(tie_breaker_base_query)
  78 + self.tie_breaker_keywords = float(tie_breaker_keywords)
52 79
53 def _apply_source_filter(self, es_query: Dict[str, Any]) -> None: 80 def _apply_source_filter(self, es_query: Dict[str, Any]) -> None:
54 """ 81 """
@@ -118,7 +145,6 @@ class ESQueryBuilder: @@ -118,7 +145,6 @@ class ESQueryBuilder:
118 self, 145 self,
119 query_text: str, 146 query_text: str,
120 query_vector: Optional[np.ndarray] = None, 147 query_vector: Optional[np.ndarray] = None,
121 - query_node: Optional[QueryNode] = None,  
122 filters: Optional[Dict[str, Any]] = None, 148 filters: Optional[Dict[str, Any]] = None,
123 range_filters: Optional[Dict[str, Any]] = None, 149 range_filters: Optional[Dict[str, Any]] = None,
124 facet_configs: Optional[List[Any]] = None, 150 facet_configs: Optional[List[Any]] = None,
@@ -136,14 +162,13 @@ class ESQueryBuilder: @@ -136,14 +162,13 @@ class ESQueryBuilder:
136 结构:filters and (text_recall or embedding_recall) + post_filter 162 结构:filters and (text_recall or embedding_recall) + post_filter
137 - conjunctive_filters: 应用在 query.bool.filter(影响结果和聚合) 163 - conjunctive_filters: 应用在 query.bool.filter(影响结果和聚合)
138 - disjunctive_filters: 应用在 post_filter(只影响结果,不影响聚合) 164 - disjunctive_filters: 应用在 post_filter(只影响结果,不影响聚合)
139 - - text_recall: 文本相关性召回(中英文字段都用 165 + - text_recall: 文本相关性召回(按 search_langs 动态语言字段
140 - embedding_recall: 向量召回(KNN) 166 - embedding_recall: 向量召回(KNN)
141 - function_score: 包装召回部分,支持提权字段 167 - function_score: 包装召回部分,支持提权字段
142 168
143 Args: 169 Args:
144 query_text: Query text for BM25 matching 170 query_text: Query text for BM25 matching
145 query_vector: Query embedding for KNN search 171 query_vector: Query embedding for KNN search
146 - query_node: Parsed boolean expression tree  
147 filters: Exact match filters 172 filters: Exact match filters
148 range_filters: Range filters for numeric fields (always applied in query) 173 range_filters: Range filters for numeric fields (always applied in query)
149 facet_configs: Facet configurations (used to identify multi-select facets) 174 facet_configs: Facet configurations (used to identify multi-select facets)
@@ -157,6 +182,7 @@ class ESQueryBuilder: @@ -157,6 +182,7 @@ class ESQueryBuilder:
157 Returns: 182 Returns:
158 ES query DSL dictionary 183 ES query DSL dictionary
159 """ 184 """
  185 + # Boolean AST path has been removed; keep a single text strategy.
160 es_query = { 186 es_query = {
161 "size": size, 187 "size": size,
162 "from": from_ 188 "from": from_
@@ -170,12 +196,8 @@ class ESQueryBuilder: @@ -170,12 +196,8 @@ class ESQueryBuilder:
170 196
171 # Text recall (always include if query_text exists) 197 # Text recall (always include if query_text exists)
172 if query_text: 198 if query_text:
173 - if query_node and query_node.operator != 'TERM':  
174 - # Complex boolean query  
175 - text_query = self._build_boolean_query(query_node)  
176 - else:  
177 - # Simple text query - use advanced should-based multi-query strategy  
178 - text_query = self._build_advanced_text_query(query_text, parsed_query) 199 + # Unified text query strategy
  200 + text_query = self._build_advanced_text_query(query_text, parsed_query)
179 recall_clauses.append(text_query) 201 recall_clauses.append(text_query)
180 202
181 # Embedding recall (KNN - separate from query, handled below) 203 # Embedding recall (KNN - separate from query, handled below)
@@ -379,50 +401,49 @@ class ESQueryBuilder: @@ -379,50 +401,49 @@ class ESQueryBuilder:
379 } 401 }
380 } 402 }
381 403
  404 + def _format_field_with_boost(self, field_name: str, boost: float) -> str:
  405 + if abs(float(boost) - 1.0) < 1e-9:
  406 + return field_name
  407 + return f"{field_name}^{boost}"
  408 +
  409 + def _get_field_boost(self, base_field: str, language: Optional[str] = None) -> float:
  410 + # Language-specific override first (e.g. title.de), then base field (e.g. title)
  411 + if language:
  412 + lang_key = f"{base_field}.{language}"
  413 + if lang_key in self.field_boosts:
  414 + return float(self.field_boosts[lang_key])
  415 + if base_field in self.field_boosts:
  416 + return float(self.field_boosts[base_field])
  417 + return 1.0
  418 +
382 def _get_match_fields(self, language: str) -> Tuple[List[str], List[str]]: 419 def _get_match_fields(self, language: str) -> Tuple[List[str], List[str]]:
383 """ 420 """
384 - Get match fields for a specific language. 421 + Build dynamic match fields for one language.
385 422
386 Args: 423 Args:
387 - language: Language code ('zh' or 'en') 424 + language: Language code (e.g. zh/en/de/fr/...)
388 425
389 Returns: 426 Returns:
390 (all_fields, core_fields) - core_fields are for phrase/keyword queries 427 (all_fields, core_fields) - core_fields are for phrase/keyword queries
391 """ 428 """
392 - if language == 'zh':  
393 - all_fields = [  
394 - "title.zh^3.0",  
395 - "brief.zh^1.5",  
396 - "description.zh",  
397 - "vendor.zh^1.5",  
398 - "tags",  
399 - "category_path.zh^1.5",  
400 - "category_name_text.zh^1.5",  
401 - "option1_values^0.5"  
402 - ]  
403 - core_fields = [  
404 - "title.zh^3.0",  
405 - "brief.zh^1.5",  
406 - "vendor.zh^1.5",  
407 - "category_name_text.zh^1.5"  
408 - ]  
409 - else: # en  
410 - all_fields = [  
411 - "title.en^3.0",  
412 - "brief.en^1.5",  
413 - "description.en",  
414 - "vendor.en^1.5",  
415 - "tags",  
416 - "category_path.en^1.5",  
417 - "category_name_text.en^1.5",  
418 - "option1_values^0.5"  
419 - ]  
420 - core_fields = [  
421 - "title.en^3.0",  
422 - "brief.en^1.5",  
423 - "vendor.en^1.5",  
424 - "category_name_text.en^1.5"  
425 - ] 429 + lang = (language or "").strip().lower()
  430 + all_fields: List[str] = []
  431 + core_fields: List[str] = []
  432 +
  433 + for base in self.multilingual_fields:
  434 + field = f"{base}.{lang}"
  435 + boost = self._get_field_boost(base, lang)
  436 + all_fields.append(self._format_field_with_boost(field, boost))
  437 +
  438 + for shared in self.shared_fields:
  439 + boost = self._get_field_boost(shared, None)
  440 + all_fields.append(self._format_field_with_boost(shared, boost))
  441 +
  442 + for base in self.core_multilingual_fields:
  443 + field = f"{base}.{lang}"
  444 + boost = self._get_field_boost(base, lang)
  445 + core_fields.append(self._format_field_with_boost(field, boost))
  446 +
426 return all_fields, core_fields 447 return all_fields, core_fields
427 448
428 def _get_embedding_field(self, language: str) -> str: 449 def _get_embedding_field(self, language: str) -> str:
@@ -434,9 +455,9 @@ class ESQueryBuilder: @@ -434,9 +455,9 @@ class ESQueryBuilder:
434 """ 455 """
435 Build advanced text query using should clauses with multiple query strategies. 456 Build advanced text query using should clauses with multiple query strategies.
436 457
437 - Reference implementation:  
438 - - base_query: main query with AND operator and 75% minimum_should_match  
439 - - translation queries: lower boost (0.4) for other languages 458 + Unified implementation:
  459 + - base_query: source-language clause
  460 + - translation queries: target-language clauses from search_langs/query_text_by_lang
440 - phrase query: for short queries (2+ tokens) 461 - phrase query: for short queries (2+ tokens)
441 - keywords query: extracted nouns from query 462 - keywords query: extracted nouns from query
442 - KNN query: added separately in build_query 463 - KNN query: added separately in build_query
@@ -451,94 +472,89 @@ class ESQueryBuilder: @@ -451,94 +472,89 @@ class ESQueryBuilder:
451 should_clauses = [] 472 should_clauses = []
452 473
453 # Get query analysis from parsed_query 474 # Get query analysis from parsed_query
454 - translations = {}  
455 - language = self.default_language 475 + query_text_by_lang: Dict[str, str] = {}
  476 + search_langs: List[str] = []
  477 + source_lang = self.default_language
  478 + source_in_index_languages = True
456 keywords = "" 479 keywords = ""
457 query_tokens = [] 480 query_tokens = []
458 token_count = 0 481 token_count = 0
459 482
460 if parsed_query: 483 if parsed_query:
461 - translations = parsed_query.translations or {}  
462 - # Use default language if detected_language is None or "unknown"  
463 - detected_lang = parsed_query.detected_language  
464 - if not detected_lang or detected_lang == "unknown":  
465 - language = self.default_language  
466 - else:  
467 - language = detected_lang 484 + query_text_by_lang = getattr(parsed_query, "query_text_by_lang", None) or {}
  485 + search_langs = getattr(parsed_query, "search_langs", None) or []
  486 + detected_lang = getattr(parsed_query, "detected_language", None)
  487 + source_lang = detected_lang if detected_lang and detected_lang != "unknown" else self.default_language
  488 + source_in_index_languages = bool(
  489 + getattr(parsed_query, "source_in_index_languages", True)
  490 + )
468 keywords = getattr(parsed_query, 'keywords', '') or "" 491 keywords = getattr(parsed_query, 'keywords', '') or ""
469 query_tokens = getattr(parsed_query, 'query_tokens', None) or [] 492 query_tokens = getattr(parsed_query, 'query_tokens', None) or []
470 token_count = len(query_tokens) or getattr(parsed_query, 'token_count', 0) or 0 493 token_count = len(query_tokens) or getattr(parsed_query, 'token_count', 0) or 0
471 494
472 - # Get match fields for the detected language  
473 - match_fields, core_fields = self._get_match_fields(language)  
474 -  
475 - # Tie breaker values  
476 - tie_breaker_base_query = 0.9  
477 - tie_breaker_keywords = 0.9  
478 -  
479 - # 1. Base query - main query with AND operator  
480 - should_clauses.append({  
481 - "multi_match": {  
482 - "_name": "base_query",  
483 - "fields": match_fields,  
484 - "minimum_should_match": "75%",  
485 - # "operator": "AND",  
486 - "query": query_text,  
487 - "tie_breaker": tie_breaker_base_query  
488 - }  
489 - })  
490 -  
491 - # 2. Translation queries - lower boost (0.4) for other languages (multi-language search always on)  
492 - if language != 'zh' and translations.get('zh'):  
493 - zh_fields, _ = self._get_match_fields('zh')  
494 - should_clauses.append({  
495 - "multi_match": {  
496 - "query": translations['zh'],  
497 - "fields": zh_fields,  
498 - "minimum_should_match": "75%",  
499 - "tie_breaker": tie_breaker_base_query,  
500 - "boost": 0.4,  
501 - "_name": "base_query_trans_zh"  
502 - }  
503 - })  
504 - if language != 'en' and translations.get('en'):  
505 - en_fields, _ = self._get_match_fields('en')  
506 - should_clauses.append({  
507 - "multi_match": {  
508 - "query": translations['en'],  
509 - "fields": en_fields,  
510 - "minimum_should_match": "75%",  
511 - "tie_breaker": tie_breaker_base_query,  
512 - "boost": 0.4,  
513 - "_name": "base_query_trans_en"  
514 - }  
515 - }) 495 + if not query_text_by_lang:
  496 + query_text_by_lang = {source_lang: query_text}
  497 + if source_lang not in query_text_by_lang and query_text:
  498 + query_text_by_lang[source_lang] = query_text
  499 + if not search_langs:
  500 + search_langs = list(query_text_by_lang.keys())
  501 +
  502 + # Core fields for phrase/keyword based on source language.
  503 + _, core_fields = self._get_match_fields(source_lang)
  504 + if not core_fields and search_langs:
  505 + _, core_fields = self._get_match_fields(search_langs[0])
  506 +
  507 + # Base + translated clauses based on language plan.
  508 + for lang in search_langs:
  509 + lang_query = query_text_by_lang.get(lang)
  510 + if not lang_query:
  511 + continue
  512 + match_fields, _ = self._get_match_fields(lang)
  513 + if not match_fields:
  514 + continue
516 515
517 - if False and is_long_query:  
518 - boost = 0.5 * pow(min(1.0, token_count / 10.0), 0.9)  
519 - minimum_should_match = "70%"  
520 - should_clauses.append({ 516 + is_source = (lang == source_lang)
  517 + clause_boost = 1.0
  518 + clause_name = "base_query" if is_source else f"base_query_trans_{lang}"
  519 + minimum_should_match = (
  520 + self.base_minimum_should_match if is_source else self.translation_minimum_should_match
  521 + )
  522 + if is_source and not source_in_index_languages:
  523 + clause_boost = self.source_boost_when_missing
  524 + elif not is_source:
  525 + clause_boost = (
  526 + self.translation_boost
  527 + if source_in_index_languages
  528 + else self.translation_boost_when_source_missing
  529 + )
  530 +
  531 + clause = {
521 "multi_match": { 532 "multi_match": {
522 - "query": query_text, 533 + "_name": clause_name,
523 "fields": match_fields, 534 "fields": match_fields,
524 "minimum_should_match": minimum_should_match, 535 "minimum_should_match": minimum_should_match,
525 - "boost": boost,  
526 - "tie_breaker": tie_breaker_long_query,  
527 - "_name": "long_query" 536 + "query": lang_query,
  537 + "tie_breaker": self.tie_breaker_base_query,
528 } 538 }
  539 + }
  540 + if abs(clause_boost - 1.0) > 1e-9:
  541 + clause["multi_match"]["boost"] = clause_boost
  542 + should_clauses.append({
  543 + "multi_match": clause["multi_match"]
529 }) 544 })
530 545
531 # 3. Short query - add phrase query (derived from query_tokens) 546 # 3. Short query - add phrase query (derived from query_tokens)
532 # is_short: quoted or ((token_count <= 2 or len <= 4) and no space) 547 # is_short: quoted or ((token_count <= 2 or len <= 4) and no space)
533 - ENABLE_PHRASE_QUERY = True 548 + source_query_text = query_text_by_lang.get(source_lang) or query_text
  549 + ENABLE_PHRASE_QUERY = self.enable_phrase_query
534 is_quoted = query_text.startswith('"') and query_text.endswith('"') 550 is_quoted = query_text.startswith('"') and query_text.endswith('"')
535 is_short = is_quoted or ((token_count <= 2 or len(query_text) <= 4) and ' ' not in query_text) 551 is_short = is_quoted or ((token_count <= 2 or len(query_text) <= 4) and ' ' not in query_text)
536 - if ENABLE_PHRASE_QUERY and token_count >= 2 and is_short: 552 + if ENABLE_PHRASE_QUERY and core_fields and token_count >= 2 and is_short:
537 query_length = len(query_text) 553 query_length = len(query_text)
538 slop = 0 if query_length < 3 else 1 if query_length < 5 else 2 554 slop = 0 if query_length < 3 else 1 if query_length < 5 else 2
539 should_clauses.append({ 555 should_clauses.append({
540 "multi_match": { 556 "multi_match": {
541 - "query": query_text, 557 + "query": source_query_text,
542 "fields": core_fields, 558 "fields": core_fields,
543 "type": "phrase", 559 "type": "phrase",
544 "slop": slop, 560 "slop": slop,
@@ -548,18 +564,31 @@ class ESQueryBuilder: @@ -548,18 +564,31 @@ class ESQueryBuilder:
548 }) 564 })
549 565
550 # 4. Keywords query - extracted nouns from query 566 # 4. Keywords query - extracted nouns from query
551 - elif keywords and len(keywords.split()) <= 2 and 2 * len(keywords.replace(' ', '')) <= len(query_text): 567 + elif core_fields and keywords and len(keywords.split()) <= 2 and 2 * len(keywords.replace(' ', '')) <= len(query_text):
552 should_clauses.append({ 568 should_clauses.append({
553 "multi_match": { 569 "multi_match": {
554 "query": keywords, 570 "query": keywords,
555 "fields": core_fields, 571 "fields": core_fields,
556 # "operator": "AND", 572 # "operator": "AND",
557 - "tie_breaker": tie_breaker_keywords,  
558 - "boost": 0.1, 573 + "tie_breaker": self.tie_breaker_keywords,
  574 + "boost": self.keywords_boost,
559 "_name": "keywords_query" 575 "_name": "keywords_query"
560 } 576 }
561 }) 577 })
562 578
  579 + # Fallback to a simple query when language fields cannot be resolved.
  580 + if not should_clauses:
  581 + fallback_fields = self.match_fields or ["title.en^1.0"]
  582 + return {
  583 + "multi_match": {
  584 + "_name": "base_query_fallback",
  585 + "query": query_text,
  586 + "fields": fallback_fields,
  587 + "minimum_should_match": self.base_minimum_should_match,
  588 + "tie_breaker": self.tie_breaker_base_query,
  589 + }
  590 + }
  591 +
563 # Return bool query with should clauses 592 # Return bool query with should clauses
564 if len(should_clauses) == 1: 593 if len(should_clauses) == 1:
565 return should_clauses[0] 594 return should_clauses[0]
@@ -571,70 +600,6 @@ class ESQueryBuilder: @@ -571,70 +600,6 @@ class ESQueryBuilder:
571 } 600 }
572 } 601 }
573 602
574 - def _build_boolean_query(self, node: QueryNode) -> Dict[str, Any]:  
575 - """  
576 - Build query from boolean expression tree.  
577 -  
578 - Args:  
579 - node: Query tree node  
580 -  
581 - Returns:  
582 - ES query clause  
583 - """  
584 - if node.operator == 'TERM':  
585 - # Leaf node - simple text query  
586 - return self._build_text_query(node.value)  
587 -  
588 - elif node.operator == 'AND':  
589 - # All terms must match  
590 - return {  
591 - "bool": {  
592 - "must": [  
593 - self._build_boolean_query(term)  
594 - for term in node.terms  
595 - ]  
596 - }  
597 - }  
598 -  
599 - elif node.operator == 'OR':  
600 - # Any term must match  
601 - return {  
602 - "bool": {  
603 - "should": [  
604 - self._build_boolean_query(term)  
605 - for term in node.terms  
606 - ],  
607 - "minimum_should_match": 1  
608 - }  
609 - }  
610 -  
611 - elif node.operator == 'ANDNOT':  
612 - # First term must match, second must not  
613 - if len(node.terms) >= 2:  
614 - return {  
615 - "bool": {  
616 - "must": [self._build_boolean_query(node.terms[0])],  
617 - "must_not": [self._build_boolean_query(node.terms[1])]  
618 - }  
619 - }  
620 - else:  
621 - return self._build_boolean_query(node.terms[0])  
622 -  
623 - elif node.operator == 'RANK':  
624 - # Like OR but for ranking (all terms contribute to score)  
625 - return {  
626 - "bool": {  
627 - "should": [  
628 - self._build_boolean_query(term)  
629 - for term in node.terms  
630 - ]  
631 - }  
632 - }  
633 -  
634 - else:  
635 - # Unknown operator  
636 - return {"match_all": {}}  
637 -  
638 def _build_filters( 603 def _build_filters(
639 self, 604 self,
640 filters: Optional[Dict[str, Any]] = None, 605 filters: Optional[Dict[str, Any]] = None,
search/searcher.py
1 """ 1 """
2 Main Searcher module - executes search queries against Elasticsearch. 2 Main Searcher module - executes search queries against Elasticsearch.
3 3
4 -Handles query parsing, boolean expressions, ranking, and result formatting. 4 +Handles query parsing, ranking, and result formatting.
5 """ 5 """
6 6
7 from typing import Dict, Any, List, Optional, Union 7 from typing import Dict, Any, List, Optional, Union
@@ -12,11 +12,9 @@ import logging @@ -12,11 +12,9 @@ import logging
12 from utils.es_client import ESClient 12 from utils.es_client import ESClient
13 from query import QueryParser, ParsedQuery 13 from query import QueryParser, ParsedQuery
14 from embeddings.image_encoder import CLIPImageEncoder 14 from embeddings.image_encoder import CLIPImageEncoder
15 -from .boolean_parser import BooleanParser, QueryNode  
16 from .es_query_builder import ESQueryBuilder 15 from .es_query_builder import ESQueryBuilder
17 from config import SearchConfig 16 from config import SearchConfig
18 from config.tenant_config_loader import get_tenant_config_loader 17 from config.tenant_config_loader import get_tenant_config_loader
19 -from config.utils import get_match_fields_for_index  
20 from context.request_context import RequestContext, RequestContextStage 18 from context.request_context import RequestContext, RequestContextStage
21 from api.models import FacetResult, FacetValue, FacetConfig 19 from api.models import FacetResult, FacetValue, FacetConfig
22 from api.result_formatter import ResultFormatter 20 from api.result_formatter import ResultFormatter
@@ -73,7 +71,7 @@ class Searcher: @@ -73,7 +71,7 @@ class Searcher:
73 71
74 Handles: 72 Handles:
75 - Query parsing and translation 73 - Query parsing and translation
76 - - Boolean expression parsing 74 + - Dynamic multi-language text recall planning
77 - ES query building 75 - ES query building
78 - Result ranking and formatting 76 - Result ranking and formatting
79 """ 77 """
@@ -98,12 +96,6 @@ class Searcher: @@ -98,12 +96,6 @@ class Searcher:
98 self.config = config 96 self.config = config
99 # Index name is now generated dynamically per tenant, no longer stored here 97 # Index name is now generated dynamically per tenant, no longer stored here
100 self.query_parser = query_parser or QueryParser(config) 98 self.query_parser = query_parser or QueryParser(config)
101 -  
102 - # Initialize components  
103 - self.boolean_parser = BooleanParser()  
104 -  
105 - # Get match fields from config  
106 - self.match_fields = get_match_fields_for_index(config, "default")  
107 self.text_embedding_field = config.query_config.text_embedding_field or "title_embedding" 99 self.text_embedding_field = config.query_config.text_embedding_field or "title_embedding"
108 self.image_embedding_field = config.query_config.image_embedding_field 100 self.image_embedding_field = config.query_config.image_embedding_field
109 if self.image_embedding_field and image_encoder is None: 101 if self.image_embedding_field and image_encoder is None:
@@ -114,13 +106,26 @@ class Searcher: @@ -114,13 +106,26 @@ class Searcher:
114 106
115 # Query builder - simplified single-layer architecture 107 # Query builder - simplified single-layer architecture
116 self.query_builder = ESQueryBuilder( 108 self.query_builder = ESQueryBuilder(
117 - match_fields=self.match_fields, 109 + match_fields=[],
  110 + field_boosts=self.config.field_boosts,
  111 + multilingual_fields=self.config.query_config.multilingual_fields,
  112 + shared_fields=self.config.query_config.shared_fields,
  113 + core_multilingual_fields=self.config.query_config.core_multilingual_fields,
118 text_embedding_field=self.text_embedding_field, 114 text_embedding_field=self.text_embedding_field,
119 image_embedding_field=self.image_embedding_field, 115 image_embedding_field=self.image_embedding_field,
120 source_fields=self.source_fields, 116 source_fields=self.source_fields,
121 function_score_config=self.config.function_score, 117 function_score_config=self.config.function_score,
122 default_language=self.config.query_config.default_language, 118 default_language=self.config.query_config.default_language,
123 - knn_boost=self.config.query_config.knn_boost 119 + knn_boost=self.config.query_config.knn_boost,
  120 + base_minimum_should_match=self.config.query_config.base_minimum_should_match,
  121 + translation_minimum_should_match=self.config.query_config.translation_minimum_should_match,
  122 + translation_boost=self.config.query_config.translation_boost,
  123 + translation_boost_when_source_missing=self.config.query_config.translation_boost_when_source_missing,
  124 + source_boost_when_missing=self.config.query_config.source_boost_when_missing,
  125 + keywords_boost=self.config.query_config.keywords_boost,
  126 + enable_phrase_query=self.config.query_config.enable_phrase_query,
  127 + tie_breaker_base_query=self.config.query_config.tie_breaker_base_query,
  128 + tie_breaker_keywords=self.config.query_config.tie_breaker_keywords,
124 ) 129 )
125 130
126 def _apply_source_filter(self, es_query: Dict[str, Any]) -> None: 131 def _apply_source_filter(self, es_query: Dict[str, Any]) -> None:
@@ -250,7 +255,7 @@ class Searcher: @@ -250,7 +255,7 @@ class Searcher:
250 translations=parsed_query.translations, 255 translations=parsed_query.translations,
251 query_vector=parsed_query.query_vector.tolist() if parsed_query.query_vector is not None else None, 256 query_vector=parsed_query.query_vector.tolist() if parsed_query.query_vector is not None else None,
252 domain=parsed_query.domain, 257 domain=parsed_query.domain,
253 - is_simple_query=self.boolean_parser.is_simple_query(parsed_query.rewritten_query) 258 + is_simple_query=True
254 ) 259 )
255 260
256 context.logger.info( 261 context.logger.info(
@@ -271,38 +276,7 @@ class Searcher: @@ -271,38 +276,7 @@ class Searcher:
271 finally: 276 finally:
272 context.end_stage(RequestContextStage.QUERY_PARSING) 277 context.end_stage(RequestContextStage.QUERY_PARSING)
273 278
274 - # Step 2: Boolean parsing  
275 - context.start_stage(RequestContextStage.BOOLEAN_PARSING)  
276 - try:  
277 - query_node = None  
278 - if self.boolean_parser.is_simple_query(parsed_query.rewritten_query):  
279 - # Simple query  
280 - query_text = parsed_query.rewritten_query  
281 - context.logger.debug(  
282 - f"简单查询 | 无布尔表达式",  
283 - extra={'reqid': context.reqid, 'uid': context.uid}  
284 - )  
285 - else:  
286 - # Complex boolean query  
287 - query_node = self.boolean_parser.parse(parsed_query.rewritten_query)  
288 - query_text = parsed_query.rewritten_query  
289 - context.store_intermediate_result('query_node', query_node)  
290 - context.store_intermediate_result('boolean_ast', str(query_node))  
291 - context.logger.info(  
292 - f"布尔表达式解析 | AST: {query_node}",  
293 - extra={'reqid': context.reqid, 'uid': context.uid}  
294 - )  
295 - except Exception as e:  
296 - context.set_error(e)  
297 - context.logger.error(  
298 - f"布尔表达式解析失败 | 错误: {str(e)}",  
299 - extra={'reqid': context.reqid, 'uid': context.uid}  
300 - )  
301 - raise  
302 - finally:  
303 - context.end_stage(RequestContextStage.BOOLEAN_PARSING)  
304 -  
305 - # Step 3: Query building 279 + # Step 2: Query building
306 context.start_stage(RequestContextStage.QUERY_BUILDING) 280 context.start_stage(RequestContextStage.QUERY_BUILDING)
307 try: 281 try:
308 # Generate tenant-specific index name 282 # Generate tenant-specific index name
@@ -314,7 +288,6 @@ class Searcher: @@ -314,7 +288,6 @@ class Searcher:
314 es_query = self.query_builder.build_query( 288 es_query = self.query_builder.build_query(
315 query_text=parsed_query.rewritten_query or parsed_query.query_normalized, 289 query_text=parsed_query.rewritten_query or parsed_query.query_normalized,
316 query_vector=parsed_query.query_vector if enable_embedding else None, 290 query_vector=parsed_query.query_vector if enable_embedding else None,
317 - query_node=query_node,  
318 filters=filters, 291 filters=filters,
319 range_filters=range_filters, 292 range_filters=range_filters,
320 facet_configs=facets, 293 facet_configs=facets,
@@ -529,7 +502,6 @@ class Searcher: @@ -529,7 +502,6 @@ class Searcher:
529 "translations": context.query_analysis.translations, 502 "translations": context.query_analysis.translations,
530 "has_vector": context.query_analysis.query_vector is not None, 503 "has_vector": context.query_analysis.query_vector is not None,
531 "is_simple_query": context.query_analysis.is_simple_query, 504 "is_simple_query": context.query_analysis.is_simple_query,
532 - "boolean_ast": context.get_intermediate_result('boolean_ast'),  
533 "domain": context.query_analysis.domain 505 "domain": context.query_analysis.domain
534 }, 506 },
535 "es_query": context.get_intermediate_result('es_query', {}), 507 "es_query": context.get_intermediate_result('es_query', {}),
@@ -666,12 +638,18 @@ class Searcher: @@ -666,12 +638,18 @@ class Searcher:
666 638
667 def get_domain_summary(self) -> Dict[str, Any]: 639 def get_domain_summary(self) -> Dict[str, Any]:
668 """ 640 """
669 - Get summary of all configured domains. 641 + Get summary of dynamic text retrieval configuration.
670 642
671 Returns: 643 Returns:
672 - Dictionary with domain information 644 + Dictionary with language-aware field information
673 """ 645 """
674 - return self.query_builder.get_domain_summary() 646 + return {
  647 + "mode": "dynamic_language_fields",
  648 + "multilingual_fields": self.config.query_config.multilingual_fields,
  649 + "shared_fields": self.config.query_config.shared_fields,
  650 + "core_multilingual_fields": self.config.query_config.core_multilingual_fields,
  651 + "field_boosts": self.config.field_boosts,
  652 + }
675 653
676 def get_document(self, tenant_id: str, doc_id: str) -> Optional[Dict[str, Any]]: 654 def get_document(self, tenant_id: str, doc_id: str) -> Optional[Dict[str, Any]]:
677 """ 655 """
suggestion/service.py
@@ -33,6 +33,68 @@ class SuggestionService: @@ -33,6 +33,68 @@ class SuggestionService:
33 return primary 33 return primary
34 return index_languages[0] 34 return index_languages[0]
35 35
  36 + def _completion_suggest(
  37 + self,
  38 + index_name: str,
  39 + query: str,
  40 + lang: str,
  41 + size: int,
  42 + ) -> List[Dict[str, Any]]:
  43 + """
  44 + Query ES completion suggester from `completion.<lang>`.
  45 +
  46 + Returns items in the same shape as search hits -> dicts with "text"/"lang"/"score"/"rank_score"/"sources".
  47 + """
  48 + field_name = f"completion.{lang}"
  49 + body = {
  50 + "suggest": {
  51 + "s": {
  52 + "prefix": query,
  53 + "completion": {
  54 + "field": field_name,
  55 + "size": size,
  56 + "skip_duplicates": True,
  57 + },
  58 + }
  59 + },
  60 + "_source": [
  61 + "text",
  62 + "lang",
  63 + "rank_score",
  64 + "sources",
  65 + "lang_source",
  66 + "lang_confidence",
  67 + "lang_conflict",
  68 + ],
  69 + }
  70 + try:
  71 + resp = self.es_client.client.search(index=index_name, body=body)
  72 + except Exception as e:
  73 + # completion is an optimization path; never hard-fail the whole endpoint
  74 + logger.warning("Completion suggest failed for index=%s field=%s: %s", index_name, field_name, e)
  75 + return []
  76 +
  77 + entries = (resp.get("suggest", {}) or {}).get("s", []) or []
  78 + if not entries:
  79 + return []
  80 + options = entries[0].get("options", []) or []
  81 + out: List[Dict[str, Any]] = []
  82 + for opt in options:
  83 + src = opt.get("_source", {}) or {}
  84 + out.append(
  85 + {
  86 + "text": src.get("text") or opt.get("text"),
  87 + "lang": src.get("lang") or lang,
  88 + "score": opt.get("_score", 0.0),
  89 + "rank_score": src.get("rank_score"),
  90 + "sources": src.get("sources", []),
  91 + "lang_source": src.get("lang_source"),
  92 + "lang_confidence": src.get("lang_confidence"),
  93 + "lang_conflict": src.get("lang_conflict", False),
  94 + }
  95 + )
  96 + return out
  97 +
36 def _search_products_for_suggestion( 98 def _search_products_for_suggestion(
37 self, 99 self,
38 tenant_id: str, 100 tenant_id: str,
@@ -95,6 +157,17 @@ class SuggestionService: @@ -95,6 +157,17 @@ class SuggestionService:
95 start = time.time() 157 start = time.time()
96 resolved_lang = self._resolve_language(tenant_id, language) 158 resolved_lang = self._resolve_language(tenant_id, language)
97 index_name = get_suggestion_index_name(tenant_id) 159 index_name = get_suggestion_index_name(tenant_id)
  160 + if not self.es_client.index_exists(index_name):
  161 + # On a fresh ES cluster the suggestion index might not be built yet.
  162 + # Keep endpoint stable for frontend autocomplete: return empty list instead of 500.
  163 + took_ms = int((time.time() - start) * 1000)
  164 + return {
  165 + "query": query,
  166 + "language": language,
  167 + "resolved_language": resolved_lang,
  168 + "suggestions": [],
  169 + "took_ms": took_ms,
  170 + }
98 171
99 sat_field = f"sat.{resolved_lang}" 172 sat_field = f"sat.{resolved_lang}"
100 dsl = { 173 dsl = {
@@ -139,14 +212,42 @@ class SuggestionService: @@ -139,14 +212,42 @@ class SuggestionService:
139 "lang_conflict", 212 "lang_conflict",
140 ], 213 ],
141 } 214 }
  215 + # Recall path A: bool_prefix on search_as_you_type
142 es_resp = self.es_client.search(index_name=index_name, body=dsl, size=size, from_=0) 216 es_resp = self.es_client.search(index_name=index_name, body=dsl, size=size, from_=0)
143 hits = es_resp.get("hits", {}).get("hits", []) or [] 217 hits = es_resp.get("hits", {}).get("hits", []) or []
144 218
  219 + # Recall path B: completion suggester (optional optimization)
  220 + completion_items = self._completion_suggest(
  221 + index_name=index_name,
  222 + query=query,
  223 + lang=resolved_lang,
  224 + size=size,
  225 + )
  226 +
145 suggestions: List[Dict[str, Any]] = [] 227 suggestions: List[Dict[str, Any]] = []
  228 + seen_text_norm: set = set()
  229 +
  230 + def _norm_text(v: Any) -> str:
  231 + return str(v or "").strip().lower()
  232 +
  233 + # Put completion results first (usually better prefix UX), then fill with sat results.
  234 + for item in completion_items:
  235 + text_val = item.get("text")
  236 + norm = _norm_text(text_val)
  237 + if not norm or norm in seen_text_norm:
  238 + continue
  239 + seen_text_norm.add(norm)
  240 + suggestions.append(dict(item))
  241 +
146 for hit in hits: 242 for hit in hits:
147 src = hit.get("_source", {}) or {} 243 src = hit.get("_source", {}) or {}
  244 + text_val = src.get("text")
  245 + norm = _norm_text(text_val)
  246 + if not norm or norm in seen_text_norm:
  247 + continue
  248 + seen_text_norm.add(norm)
148 item = { 249 item = {
149 - "text": src.get("text"), 250 + "text": text_val,
150 "lang": src.get("lang"), 251 "lang": src.get("lang"),
151 "score": hit.get("_score", 0.0), 252 "score": hit.get("_score", 0.0),
152 "rank_score": src.get("rank_score"), 253 "rank_score": src.get("rank_score"),
@@ -173,7 +274,7 @@ class SuggestionService: @@ -173,7 +274,7 @@ class SuggestionService:
173 "query": query, 274 "query": query,
174 "language": language, 275 "language": language,
175 "resolved_language": resolved_lang, 276 "resolved_language": resolved_lang,
176 - "suggestions": suggestions, 277 + "suggestions": suggestions[:size],
177 "took_ms": took_ms, 278 "took_ms": took_ms,
178 } 279 }
179 280