Commit bd96ceadef76dc327afcd2d07a023f4902d2f9f5

Authored by tangwang
1 parent 24e92141

1. 动态多语言字段与统一策略配置

- 配置改为“字段基名 + 动态语言后缀”方案,已不再依赖旧 `indexes`。
[config.yaml](/data/saas-search/config/config.yaml#L17)
- `search_fields` / `text_query_strategy` 已进入强校验与解析流程。
[config_loader.py](/data/saas-search/config/config_loader.py#L254)

2. 查询语言计划与翻译等待策略
- `QueryParser` 现在产出
  `query_text_by_lang`、`search_langs`、`source_in_index_languages`。
[query_parser.py](/data/saas-search/query/query_parser.py#L41)
- 你要求的两种翻译路径都在:
  - 源语言不在店铺 `index_languages`:`translate_multi_async` + 等待
    future
  - 源语言在 `index_languages`:`translate_multi(...,
    async_mode=True)`,尽量走缓存
[query_parser.py](/data/saas-search/query/query_parser.py#L284)

3. ES 查询统一文本策略(无 AST 分支)
- 主召回按 `search_langs` 动态拼 `field.{lang}`,翻译语种做次权重
  `should`。
[es_query_builder.py](/data/saas-search/search/es_query_builder.py#L454)
- 布尔 AST 路径已删除,仅保留统一文本策略。
[es_query_builder.py](/data/saas-search/search/es_query_builder.py#L185)

4. LanguageDetector 优化
- 从“拉丁字母默认英文”升级为:脚本优先 +
  拉丁语系打分(词典/变音/后缀)。
[language_detector.py](/data/saas-search/query/language_detector.py#L68)

5. 布尔能力清理(补充)
- 已删除废弃模块:
[boolean_parser.py](/data/saas-search/search/boolean_parser.py)
- `search/__init__` 已无相关导出。
[search/__init__.py](/data/saas-search/search/__init__.py)

6. `indexes` 过时收口(补充)
- 兼容函数改为基于动态字段生成,不再依赖 `config.indexes`。
[utils.py](/data/saas-search/config/utils.py#L24)
- Admin 配置接口改为返回动态字段配置,不再暴露 `num_indexes`。
[admin.py](/data/saas-search/api/routes/admin.py#L52)

7. suggest
1 1 # Elasticsearch Configuration
2   -ES_HOST=http://localhost:9200
3   -ES_USERNAME=saas
  2 +ES_HOST=http://120.76.41.98:9200
  3 +ES_USERNAME=essa
4 4 ES_PASSWORD=<redacted — do not commit credentials; rotate this secret and load it from untracked .env>
5 5  
6 6 # Redis Configuration (Optional) - AI 生产 10.200.16.14:6479
... ...
api/models.py
... ... @@ -70,12 +70,12 @@ class SearchRequest(BaseModel):
70 70 """搜索请求模型(重构版)"""
71 71  
72 72 # 基础搜索参数
73   - query: str = Field(..., description="搜索查询字符串,支持布尔表达式(AND, OR, RANK, ANDNOT)")
  73 + query: str = Field(..., description="搜索查询字符串(统一文本检索策略)")
74 74 size: int = Field(10, ge=1, le=1000, description="返回结果数量")
75 75 from_: int = Field(0, ge=0, alias="from", description="分页偏移量")
76   - language: Literal["zh", "en"] = Field(
77   - "zh",
78   - description="响应语言:'zh'(中文)或 'en'(英文),用于选择 title/description/vendor 等多语言字段"
  76 + language: str = Field(
  77 + "en",
  78 + description="响应语言代码(如 zh/en/de/fr/ar/ru),用于多语言字段返回优先级"
79 79 )
80 80  
81 81 # 过滤器 - 精确匹配和多值匹配
... ...
api/result_formatter.py
... ... @@ -27,20 +27,32 @@ class ResultFormatter:
27 27 List of SpuResult objects
28 28 """
29 29 results = []
30   - lang = (language or "en").lower()
31   - if lang not in ("zh", "en"):
32   - lang = "en"
  30 + lang = (language or "en").lower().replace("-", "_")
  31 + lang_base = lang.split("_")[0] if lang else "en"
33 32  
34 33 def pick_lang_field(src: Dict[str, Any], base: str) -> Optional[str]:
35 34 """从多语言对象字段中按语言选择一个值:{base: {"zh": "...", "en": "...", ...}}"""
36 35 obj = src.get(base)
37 36 if not isinstance(obj, dict):
38 37 return None
39   - zh_val = obj.get("zh")
40   - en_val = obj.get("en")
41   - if lang == "zh":
42   - return zh_val or en_val
43   - return en_val or zh_val
  38 + candidates = [
  39 + lang,
  40 + lang_base,
  41 + "en",
  42 + "zh",
  43 + ]
  44 + seen = set()
  45 + for cand in candidates:
  46 + if not cand or cand in seen:
  47 + continue
  48 + seen.add(cand)
  49 + value = obj.get(cand)
  50 + if value:
  51 + return value
  52 + for value in obj.values():
  53 + if value:
  54 + return value
  55 + return None
44 56  
45 57 for hit in es_hits:
46 58 source = hit.get('_source', {})
... ... @@ -434,4 +446,3 @@ class ResultFormatter:
434 446 """
435 447 # TODO: Implement related search generation logic
436 448 return []
437   -
... ...
api/routes/admin.py
... ... @@ -52,7 +52,9 @@ async def get_configuration():
52 52 return {
53 53 "es_index_name": config.es_index_name,
54 54 "num_field_boosts": len(config.field_boosts),
55   - "num_indexes": len(config.indexes),
  55 + "multilingual_fields": config.query_config.multilingual_fields,
  56 + "shared_fields": config.query_config.shared_fields,
  57 + "core_multilingual_fields": config.query_config.core_multilingual_fields,
56 58 "supported_languages": config.query_config.supported_languages,
57 59 "ranking_expression": config.ranking.expression,
58 60 "spu_enabled": config.spu_config.enabled
... ...
api/routes/search.py
... ... @@ -37,7 +37,7 @@ async def search(request: SearchRequest, http_request: Request):
37 37  
38 38 Supports:
39 39 - Multi-language query processing
40   - - Boolean operators (AND, OR, RANK, ANDNOT)
  40 + - Unified text retrieval strategy (no boolean AST parsing)
41 41 - Semantic search with embeddings
42 42 - Custom ranking functions
43 43 - Exact match filters and range filters
... ...
config/config.yaml
... ... @@ -12,71 +12,20 @@ es_settings:
12 12 refresh_interval: "30s"
13 13  
14 14 # 字段权重配置(用于搜索时的字段boost)
15   -# 只配置权重,不配置字段结构(字段结构由 mappings/search_products.json 定义)
  15 +# 统一按“字段基名”配置;查询时按 search_langs 动态拼接 .{lang}。
  16 +# 若需要按某个语言单独调权,也可以加显式 key(例如 title.de: 3.2)。
16 17 field_boosts:
17   - # 文本相关性字段
18   - "title.zh": 3.0
19   - "brief.zh": 1.5
20   - "description.zh": 1.0
21   - "vendor.zh": 1.5
22   - "title.en": 3.0
23   - "brief.en": 1.5
24   - "description.en": 1.0
25   - "vendor.en": 1.5
26   -
27   - # 分类相关字段
28   - "category_path.zh": 1.5
29   - "category_name_text.zh": 1.5
30   - "category_path.en": 1.5
31   - "category_name_text.en": 1.5
32   -
33   - # 标签和属性值字段
  18 + title: 3.0
  19 + brief: 1.5
  20 + description: 1.0
  21 + vendor: 1.5
  22 + category_path: 1.5
  23 + category_name_text: 1.5
34 24 tags: 1.0
35 25 option1_values: 0.5
36 26 option2_values: 0.5
37 27 option3_values: 0.5
38 28  
39   -# 搜索域配置(Query Domains)
40   -# 定义不同的搜索策略,指定哪些字段组合在一起搜索
41   -indexes:
42   - - name: "default"
43   - label: "默认搜索"
44   - fields:
45   - - "title.zh"
46   - - "brief.zh"
47   - - "description.zh"
48   - - "vendor.zh"
49   - - "tags"
50   - - "category_path.zh"
51   - - "category_name_text.zh"
52   - - "option1_values"
53   - boost: 1.0
54   -
55   - - name: "title"
56   - label: "标题搜索"
57   - fields:
58   - - "title.zh"
59   - boost: 2.0
60   -
61   - - name: "vendor"
62   - label: "品牌搜索"
63   - fields:
64   - - "vendor.zh"
65   - boost: 1.5
66   -
67   - - name: "category"
68   - label: "类目搜索"
69   - fields:
70   - - "category_path.zh"
71   - - "category_name_text.zh"
72   - boost: 1.5
73   -
74   - - name: "tags"
75   - label: "标签搜索"
76   - fields:
77   - - "tags"
78   - boost: 1.0
79   -
80 29 # Query Configuration(查询配置)
81 30 query_config:
82 31 # 支持的语言
... ... @@ -89,6 +38,41 @@ query_config:
89 38 enable_text_embedding: true
90 39 enable_query_rewrite: true
91 40  
  41 + # 动态多语言检索字段配置
  42 + # multilingual_fields 会被拼成 title.{lang}/brief.{lang}/... 形式;
  43 + # shared_fields 为无语言后缀字段。
  44 + search_fields:
  45 + multilingual_fields:
  46 + - "title"
  47 + - "brief"
  48 + - "description"
  49 + - "vendor"
  50 + - "category_path"
  51 + - "category_name_text"
  52 + shared_fields:
  53 + - "tags"
  54 + - "option1_values"
  55 + - "option2_values"
  56 + - "option3_values"
  57 + core_multilingual_fields:
  58 + - "title"
  59 + - "brief"
  60 + - "vendor"
  61 + - "category_name_text"
  62 +
  63 + # 统一文本召回策略(主查询 + 翻译查询 + phrase/keywords)
  64 + text_query_strategy:
  65 + base_minimum_should_match: "75%"
  66 + translation_minimum_should_match: "75%"
  67 + translation_boost: 0.4
  68 + translation_boost_when_source_missing: 1.0
  69 + source_boost_when_missing: 0.6
  70 + original_query_fallback_boost_when_translation_missing: 0.2
  71 + keywords_boost: 0.1
  72 + enable_phrase_query: true
  73 + tie_breaker_base_query: 0.9
  74 + tie_breaker_keywords: 0.9
  75 +
92 76 # Embedding字段名称
93 77 text_embedding_field: "title_embedding"
94 78 image_embedding_field: null
... ...
config/config_loader.py
... ... @@ -57,6 +57,28 @@ class QueryConfig:
57 57  
58 58 # KNN boost configuration
59 59 knn_boost: float = 0.25 # Boost value for KNN (embedding recall)
  60 +
  61 + # Dynamic text fields for multi-language retrieval
  62 + multilingual_fields: List[str] = field(
  63 + default_factory=lambda: ["title", "brief", "description", "vendor", "category_path", "category_name_text"]
  64 + )
  65 + shared_fields: List[str] = field(
  66 + default_factory=lambda: ["tags", "option1_values", "option2_values", "option3_values"]
  67 + )
  68 + core_multilingual_fields: List[str] = field(
  69 + default_factory=lambda: ["title", "brief", "vendor", "category_name_text"]
  70 + )
  71 +
  72 + # Unified text strategy tuning
  73 + base_minimum_should_match: str = "75%"
  74 + translation_minimum_should_match: str = "75%"
  75 + translation_boost: float = 0.4
  76 + translation_boost_when_source_missing: float = 1.0
  77 + source_boost_when_missing: float = 0.6
  78 + keywords_boost: float = 0.1
  79 + enable_phrase_query: bool = True
  80 + tie_breaker_base_query: float = 0.9
  81 + tie_breaker_keywords: float = 0.9
60 82  
61 83  
62 84 @dataclass
... ... @@ -102,7 +124,7 @@ class SearchConfig:
102 124 # 字段权重配置(用于搜索)
103 125 field_boosts: Dict[str, float]
104 126  
105   - # Index structure (query domains)
  127 + # Legacy index domains (deprecated; kept for compatibility)
106 128 indexes: List[IndexConfig]
107 129  
108 130 # Query processing
... ... @@ -218,7 +240,7 @@ class ConfigLoader:
218 240 if not isinstance(field_boosts, dict):
219 241 raise ConfigurationError("field_boosts must be a dictionary")
220 242  
221   - # Parse indexes
  243 + # Parse indexes (deprecated; compatibility only)
222 244 indexes = []
223 245 for index_data in config_data.get("indexes", []):
224 246 indexes.append(self._parse_index_config(index_data))
... ... @@ -228,6 +250,8 @@ class ConfigLoader:
228 250 services_data = config_data.get("services", {}) if isinstance(config_data.get("services", {}), dict) else {}
229 251 rewrite_dictionary = self._load_rewrite_dictionary()
230 252 embedding_thresholds = query_config_data.get("embedding_disable_thresholds", {})
  253 + search_fields_cfg = query_config_data.get("search_fields", {})
  254 + text_strategy_cfg = query_config_data.get("text_query_strategy", {})
231 255  
232 256 query_config = QueryConfig(
233 257 supported_languages=query_config_data.get("supported_languages") or ["zh", "en"],
... ... @@ -245,7 +269,30 @@ class ConfigLoader:
245 269 embedding_disable_chinese_char_limit=embedding_thresholds.get("chinese_char_limit", 4),
246 270 embedding_disable_english_word_limit=embedding_thresholds.get("english_word_limit", 3),
247 271 source_fields=query_config_data.get("source_fields"),
248   - knn_boost=query_config_data.get("knn_boost", 0.25)
  272 + knn_boost=query_config_data.get("knn_boost", 0.25),
  273 + multilingual_fields=search_fields_cfg.get(
  274 + "multilingual_fields",
  275 + ["title", "brief", "description", "vendor", "category_path", "category_name_text"],
  276 + ),
  277 + shared_fields=search_fields_cfg.get(
  278 + "shared_fields",
  279 + ["tags", "option1_values", "option2_values", "option3_values"],
  280 + ),
  281 + core_multilingual_fields=search_fields_cfg.get(
  282 + "core_multilingual_fields",
  283 + ["title", "brief", "vendor", "category_name_text"],
  284 + ),
  285 + base_minimum_should_match=str(text_strategy_cfg.get("base_minimum_should_match", "75%")),
  286 + translation_minimum_should_match=str(text_strategy_cfg.get("translation_minimum_should_match", "75%")),
  287 + translation_boost=float(text_strategy_cfg.get("translation_boost", 0.4)),
  288 + translation_boost_when_source_missing=float(
  289 + text_strategy_cfg.get("translation_boost_when_source_missing", 1.0)
  290 + ),
  291 + source_boost_when_missing=float(text_strategy_cfg.get("source_boost_when_missing", 0.6)),
  292 + keywords_boost=float(text_strategy_cfg.get("keywords_boost", 0.1)),
  293 + enable_phrase_query=bool(text_strategy_cfg.get("enable_phrase_query", True)),
  294 + tie_breaker_base_query=float(text_strategy_cfg.get("tie_breaker_base_query", 0.9)),
  295 + tie_breaker_keywords=float(text_strategy_cfg.get("tie_breaker_keywords", 0.9)),
249 296 )
250 297  
251 298 # Parse ranking config
... ... @@ -336,10 +383,7 @@ class ConfigLoader:
336 383 elif boost < 0:
337 384 errors.append(f"field_boosts['{field_name}']: boost must be non-negative")
338 385  
339   - # Validate indexes
340   - if not config.indexes:
341   - errors.append("At least one index domain must be defined")
342   -
  386 + # Validate indexes (deprecated, optional)
343 387 index_names = set()
344 388 for index in config.indexes:
345 389 # Check for duplicate index names
... ... @@ -365,6 +409,39 @@ class ConfigLoader:
365 409 f"Default language '{config.query_config.default_language}' "
366 410 f"not in supported languages: {config.query_config.supported_languages}"
367 411 )
  412 +
  413 + # Validate dynamic search fields
  414 + def _validate_str_list(name: str, values: List[str]) -> None:
  415 + if not isinstance(values, list) or not values:
  416 + errors.append(f"query_config.{name} must be a non-empty list[str]")
  417 + return
  418 + for i, val in enumerate(values):
  419 + if not isinstance(val, str) or not val.strip():
  420 + errors.append(f"query_config.{name}[{i}] must be a non-empty string")
  421 +
  422 + _validate_str_list("multilingual_fields", config.query_config.multilingual_fields)
  423 + _validate_str_list("shared_fields", config.query_config.shared_fields)
  424 + _validate_str_list("core_multilingual_fields", config.query_config.core_multilingual_fields)
  425 +
  426 + core_set = set(config.query_config.core_multilingual_fields)
  427 + multi_set = set(config.query_config.multilingual_fields)
  428 + if not core_set.issubset(multi_set):
  429 + errors.append("query_config.core_multilingual_fields must be subset of multilingual_fields")
  430 +
  431 + # Validate text query strategy numbers
  432 + for name in (
  433 + "translation_boost",
  434 + "translation_boost_when_source_missing",
  435 + "source_boost_when_missing",
  436 + "keywords_boost",
  437 + "tie_breaker_base_query",
  438 + "tie_breaker_keywords",
  439 + ):
  440 + value = getattr(config.query_config, name, None)
  441 + if not isinstance(value, (int, float)):
  442 + errors.append(f"query_config.{name} must be a number")
  443 + elif value < 0:
  444 + errors.append(f"query_config.{name} must be non-negative")
368 445  
369 446 # Validate source_fields tri-state semantics
370 447 source_fields = config.query_config.source_fields
... ... @@ -409,7 +486,23 @@ class ConfigLoader:
409 486 "chinese_char_limit": config.query_config.embedding_disable_chinese_char_limit,
410 487 "english_word_limit": config.query_config.embedding_disable_english_word_limit
411 488 },
412   - "source_fields": config.query_config.source_fields
  489 + "source_fields": config.query_config.source_fields,
  490 + "search_fields": {
  491 + "multilingual_fields": config.query_config.multilingual_fields,
  492 + "shared_fields": config.query_config.shared_fields,
  493 + "core_multilingual_fields": config.query_config.core_multilingual_fields,
  494 + },
  495 + "text_query_strategy": {
  496 + "base_minimum_should_match": config.query_config.base_minimum_should_match,
  497 + "translation_minimum_should_match": config.query_config.translation_minimum_should_match,
  498 + "translation_boost": config.query_config.translation_boost,
  499 + "translation_boost_when_source_missing": config.query_config.translation_boost_when_source_missing,
  500 + "source_boost_when_missing": config.query_config.source_boost_when_missing,
  501 + "keywords_boost": config.query_config.keywords_boost,
  502 + "enable_phrase_query": config.query_config.enable_phrase_query,
  503 + "tie_breaker_base_query": config.query_config.tie_breaker_base_query,
  504 + "tie_breaker_keywords": config.query_config.tie_breaker_keywords,
  505 + }
413 506 }
414 507  
415 508 return {
... ...
config/utils.py
1   -"""
2   -Configuration utility functions.
3   -
4   -Helper functions for working with SearchConfig objects.
5   -"""
  1 +"""Configuration helper functions for dynamic multi-language search fields."""
6 2  
7 3 from typing import Dict, List
8 4 from .config_loader import SearchConfig
9 5  
10 6  
  7 +def _format_field_with_boost(field_name: str, boost: float) -> str:
  8 + if abs(float(boost) - 1.0) < 1e-9:
  9 + return field_name
  10 + return f"{field_name}^{boost}"
  11 +
  12 +
  13 +def _get_boost(config: SearchConfig, base_field: str, language: str = "") -> float:
  14 + lang = (language or "").strip().lower()
  15 + if lang:
  16 + lang_key = f"{base_field}.{lang}"
  17 + if lang_key in config.field_boosts:
  18 + return float(config.field_boosts[lang_key])
  19 + if base_field in config.field_boosts:
  20 + return float(config.field_boosts[base_field])
  21 + return 1.0
  22 +
  23 +
11 24 def get_match_fields_for_index(config: SearchConfig, index_name: str = "default") -> List[str]:
12 25 """
13   - Generate match fields list with boost from field_boosts.
14   -
15   - Args:
16   - config: SearchConfig instance
17   - index_name: Name of the index domain (default: "default")
18   -
19   - Returns:
20   - List of field names with boost, e.g., ["title.zh^3.0", "brief.zh^1.5"]
  26 + Deprecated compatibility wrapper.
  27 +
  28 + `indexes` is no longer used by runtime query building. This function now returns
  29 + dynamic match fields for the default language based on query_config.search_fields.
21 30 """
22   - # Find the index config
23   - index_config = None
24   - for idx in config.indexes:
25   - if idx.name == index_name:
26   - index_config = idx
27   - break
28   -
29   - if not index_config:
30   - return []
31   -
32   - # Generate match fields with boost
33   - match_fields = []
34   - for field_name in index_config.fields:
35   - # Get field boost from field_boosts dictionary
36   - field_boost = config.field_boosts.get(field_name, 1.0)
37   -
38   - # Combine index boost and field boost
39   - total_boost = index_config.boost * field_boost
40   -
41   - if total_boost != 1.0:
42   - match_fields.append(f"{field_name}^{total_boost}")
43   - else:
44   - match_fields.append(field_name)
45   -
  31 + del index_name
  32 + lang = (config.query_config.default_language or "en").strip().lower()
  33 + match_fields: List[str] = []
  34 +
  35 + for base_field in config.query_config.multilingual_fields:
  36 + field_name = f"{base_field}.{lang}"
  37 + match_fields.append(_format_field_with_boost(field_name, _get_boost(config, base_field, lang)))
  38 +
  39 + for shared_field in config.query_config.shared_fields:
  40 + match_fields.append(_format_field_with_boost(shared_field, _get_boost(config, shared_field)))
  41 +
46 42 return match_fields
47 43  
48 44  
49 45 def get_domain_fields(config: SearchConfig) -> Dict[str, List[str]]:
50 46 """
51   - Generate domain-specific match fields from all index configs.
52   -
53   - Args:
54   - config: SearchConfig instance
55   -
  47 + Get dynamic domain fields for compatibility with old diagnostics endpoints.
  48 +
56 49 Returns:
57   - Dictionary mapping domain name to list of match fields
  50 + A single `default` domain entry generated from dynamic search_fields.
58 51 """
59   - domain_fields = {}
60   - for index_config in config.indexes:
61   - domain_fields[index_config.name] = get_match_fields_for_index(config, index_config.name)
62   - return domain_fields
  52 + return {"default": get_match_fields_for_index(config)}
... ...
docs/DEVELOPER_GUIDE.md
... ... @@ -105,7 +105,7 @@ MySQL (店匠 SPU/SKU)
105 105 api/ # FastAPI 应用:搜索路由、管理路由、索引路由(indexer_app)
106 106 config/ # 配置加载与解析:config.yaml、services、env
107 107 indexer/ # MySQL → ES 管道:mapping、transformer、bulk、增量、build-docs
108   -query/ # 查询解析:规范化、改写、翻译、embedding 调用、布尔解析
  108 +query/ # 查询解析:规范化、改写、翻译、embedding 调用、语言计划生成
109 109 search/ # 搜索执行:多语言查询构建、Searcher、重排客户端、分数融合
110 110 embeddings/ # 向量化:服务端(server)、文本/图像后端、协议与配置
111 111 reranker/ # 重排:服务端(server)、后端(backends)、配置
... ... @@ -144,7 +144,7 @@ docs/ # 文档(含本指南)
144 144  
145 145 ### 4.4 query
146 146  
147   -- **职责**:查询解析与预处理:规范化、语言检测、改写(词典)、翻译、文本向量化、布尔表达式解析;输出可供 Searcher 使用的结构化查询信息
  147 +- **职责**:查询解析与预处理:规范化、语言检测、改写(词典)、翻译、文本向量化;输出可供 Searcher 使用的结构化查询信息(含 search_langs 语言计划)
148 148 - **原则**:翻译/向量通过 `providers` 获取,不直接依赖具体服务 URL 或实现;支持按配置关闭翻译/向量(如短查询、typing 场景)。
149 149  
150 150 ### 4.5 search
... ... @@ -241,7 +241,7 @@ docs/ # 文档(含本指南)
241 241  
242 242 ### 6.1 主配置文件
243 243  
244   -- **config/config.yaml**:搜索行为(field_boosts、indexes、query_config、ranking、function_score、rerank 融合参数)、SPU 配置、**services**(翻译/向量/重排的 provider 与 backends)、tenant_config 等。
  244 +- **config/config.yaml**:搜索行为(field_boosts、query_config.search_fields、query_config.text_query_strategy、ranking、function_score、rerank 融合参数)、SPU 配置、**services**(翻译/向量/重排的 provider 与 backends)、tenant_config 等。
245 245 - **.env**:敏感信息与部署态变量(DB、ES、Redis、API Key、端口等);不提交敏感值,可提供 `.env.example` 模板。
246 246  
247 247 ### 6.2 services 块结构(能力统一约定)
... ...
docs/QUICKSTART.md
... ... @@ -329,7 +329,7 @@ saas-search 以 MySQL 中的店匠标准表为权威数据源:
329 329  
330 330 - **统一索引结构**:所有租户使用同一套 mapping(按租户数据分索引名 + 文档内 `tenant_id` 隔离)
331 331 - **SPU 级索引**:每个文档是一个 SPU,包含嵌套 `skus`、`specifications`
332   -- **配置文件驱动**:搜索权重、搜索域、重排融合、provider 全在 `config/config.yaml`,不再以“硬编码配置”为主
  332 +- **配置文件驱动**:搜索权重、动态多语言字段、重排融合、provider 全在 `config/config.yaml`,不再以“硬编码配置”为主
333 333  
334 334 ### 2.2 索引结构(Mapping)
335 335  
... ... @@ -338,7 +338,7 @@ saas-search 以 MySQL 中的店匠标准表为权威数据源:
338 338 核心字段可分为:
339 339  
340 340 - 标识字段:`tenant_id`, `spu_id`
341   -- 多语言文本:`title.zh/en`, `brief.zh/en`, `description.zh/en`, `vendor.zh/en`, `category_path.zh/en`, `category_name_text.zh/en`
  341 +- 多语言文本:`title.<lang>`, `brief.<lang>`, `description.<lang>`, `vendor.<lang>`, `category_path.<lang>`, `category_name_text.<lang>`
342 342 - 类目过滤:`category1_name`, `category2_name`, `category3_name` 等
343 343 - 规格/变体:`specifications`(nested)、`skus`(nested)
344 344 - 价格库存:`min_price`, `max_price`, `total_inventory` 等
... ... @@ -346,8 +346,9 @@ saas-search 以 MySQL 中的店匠标准表为权威数据源:
346 346  
347 347 ### 2.3 查询、权重、排序(`config/config.yaml`)
348 348  
349   -- `field_boosts`:字段权重(如标题、品牌、类目)
350   -- `indexes`:搜索域(default/title/vendor/category/tags)
  349 +- `field_boosts`:字段权重(统一按字段基名配置,运行时按 `.{lang}` 动态组装)
  350 +- `query_config.search_fields`:动态多语言检索字段(multilingual/shared/core)
  351 +- `query_config.text_query_strategy`:文本召回策略参数(minimum_should_match、翻译boost等)
351 352 - `query_config`:语言、embedding 开关、source_fields、knn_boost、翻译提示词等
352 353 - `ranking.expression`:融合表达式(例如 `bm25() + 0.25*text_embedding_relevance()`)
353 354 - `function_score`:ES 层加权函数
... ... @@ -364,7 +365,7 @@ saas-search 以 MySQL 中的店匠标准表为权威数据源:
364 365 | 修改项 | 操作 |
365 366 |--------|------|
366 367 | 索引结构(mapping) | 修改 `mappings/search_products.json` → `./scripts/create_tenant_index.sh <tenant_id>` → 重新导入 |
367   -| 搜索/权重/排序/重排 | 修改 `config/config.yaml` 对应块 |
  368 +| 搜索字段/权重/排序/重排 | 修改 `config/config.yaml` 对应块 |
368 369 | provider 与服务 URL | 修改 `config/config.yaml` 的 `services` 块,或用环境变量覆盖 |
369 370  
370 371 ---
... ...
docs/搜索API对接指南.md
... ... @@ -18,10 +18,9 @@
18 18 - 3.3 [过滤器详解](#33-过滤器详解)
19 19 - 3.4 [分面配置](#34-分面配置)
20 20 - 3.5 [SKU筛选维度](#35-sku筛选维度)
21   - - 3.6 [布尔表达式语法](#36-布尔表达式语法)
22   - - 3.7 [搜索建议接口](#37-搜索建议接口)
23   - - 3.8 [即时搜索接口](#38-即时搜索接口)
24   - - 3.9 [获取单个文档](#39-获取单个文档)
  21 + - 3.7 [搜索建议接口](#37-搜索建议接口)
  22 + - 3.8 [即时搜索接口](#38-即时搜索接口)
  23 + - 3.9 [获取单个文档](#39-获取单个文档)
25 24  
26 25 4. [响应格式说明](#响应格式说明)
27 26 - 4.1 [标准响应结构](#41-标准响应结构)
... ... @@ -56,8 +55,7 @@
56 55 - 8.3 [分面搜索](#83-分面搜索)
57 56 - 8.4 [规格过滤与分面](#84-规格过滤与分面)
58 57 - 8.5 [SKU筛选](#85-sku筛选)
59   - - 8.6 [布尔表达式搜索](#86-布尔表达式搜索)
60   - - 8.7 [分页查询](#87-分页查询)
  58 + - 8.7 [分页查询](#87-分页查询)
61 59  
62 60 9. [数据模型](#9-数据模型)
63 61 - 9.1 [商品字段定义](#91-商品字段定义)
... ... @@ -167,7 +165,7 @@ curl -X POST &quot;http://43.166.252.75:6002/search/&quot; \
167 165 ### 3.1 接口信息
168 166  
169 167 - **端点**: `POST /search/`
170   -- **描述**: 执行文本搜索查询,支持多语言、布尔表达式、过滤器和分面搜索
  168 +- **描述**: 执行文本搜索查询,支持多语言、过滤器和分面搜索
171 169 - **租户标识**:`tenant_id` 通过 HTTP 请求头 **`X-Tenant-ID`** 传递(推荐);也可通过 URL query 参数 **`tenant_id`** 传递。**不要放在请求体中。**
172 170  
173 171 **请求示例(推荐)**:
... ... @@ -210,7 +208,7 @@ response = requests.post(url, headers=headers, json={&quot;query&quot;: &quot;芭比娃娃&quot;})
210 208  
211 209 | 参数 | 类型 | 必填 | 默认值 | 说明 |
212 210 |------|------|------|--------|------|
213   -| `query` | string | Y | - | 搜索查询字符串,支持布尔表达式(AND, OR, RANK, ANDNOT) |
  211 +| `query` | string | Y | - | 搜索查询字符串(统一文本检索策略) |
214 212 | `size` | integer | N | 10 | 返回结果数量(1-100) |
215 213 | `from` | integer | N | 0 | 分页偏移量(用于分页) |
216 214 | `language` | string | N | "zh" | 返回语言:`zh`(中文)或 `en`(英文)。后端会根据此参数选择对应的中英文字段返回 |
... ... @@ -544,36 +542,6 @@ response = requests.post(url, headers=headers, json={&quot;query&quot;: &quot;芭比娃娃&quot;})
544 542 }
545 543 ```
546 544  
547   -### 3.6 布尔表达式语法
548   -
549   -搜索查询支持布尔表达式,提供更灵活的搜索能力。
550   -
551   -**支持的操作符**:
552   -
553   -| 操作符 | 描述 | 示例 |
554   -|--------|------|------|
555   -| `AND` | 所有词必须匹配 | `玩具 AND 乐高` |
556   -| `OR` | 任意词匹配 | `芭比 OR 娃娃` |
557   -| `ANDNOT` | 排除特定词 | `玩具 ANDNOT 电动` |
558   -| `RANK` | 排序加权(不强制匹配) | `玩具 RANK 乐高` |
559   -| `()` | 分组 | `玩具 AND (乐高 OR 芭比)` |
560   -
561   -**操作符优先级**(从高到低):
562   -1. `()` - 括号
563   -2. `ANDNOT` - 排除
564   -3. `AND` - 与
565   -4. `OR` - 或
566   -5. `RANK` - 排序
567   -
568   -**示例**:
569   -```
570   -"芭比娃娃" // 简单查询
571   -"玩具 AND 乐高" // AND 查询
572   -"芭比 OR 娃娃" // OR 查询
573   -"玩具 ANDNOT 电动" // 排除查询
574   -"玩具 AND (乐高 OR 芭比)" // 复杂查询
575   -```
576   -
577 545 ### 3.7 搜索建议接口
578 546  
579 547 - **端点**: `GET /search/suggestions`
... ... @@ -2020,17 +1988,6 @@ curl &quot;http://localhost:6006/health&quot;
2020 1988 - 每个SPU下,每种颜色只会返回第一个SKU
2021 1989 - 如果维度不匹配,返回所有SKU(不进行过滤)
2022 1990  
2023   -### 8.6 布尔表达式搜索
2024   -
2025   -**需求**: 搜索包含"手机"和"智能"的商品,排除"二手"
2026   -
2027   -```json
2028   -{
2029   - "query": "手机 AND 智能 ANDNOT 二手",
2030   - "size": 20
2031   -}
2032   -```
2033   -
2034 1991 ### 8.7 分页查询
2035 1992  
2036 1993 **需求**: 获取第2页结果(每页20条)
... ...
docs/搜索API速查表.md
... ... @@ -165,18 +165,6 @@ POST /search/
165 165  
166 166 ---
167 167  
168   -## 布尔表达式
169   -
170   -```bash
171   -{
172   - "query": "玩具 AND (乐高 OR 芭比) ANDNOT 电动"
173   -}
174   -```
175   -
176   -**操作符优先级**: `()` > `ANDNOT` > `AND` > `OR` > `RANK`
177   -
178   ----
179   -
180 168 ## 分页
181 169  
182 170 ```bash
... ...
query/language_detector.py
1 1 """
2 2 Language detection utility.
3 3  
4   -Detects the language of a query string.
  4 +Detects language of short e-commerce queries with script checks + lightweight
  5 +Latin-language scoring (de/fr/es/it/pt/nl/en).
5 6 """
6 7  
7   -from typing import Optional
  8 +from typing import Dict, List
8 9 import re
9 10  
10 11  
11 12 class LanguageDetector:
12   - """Simple rule-based language detector for common e-commerce languages."""
13   -
14   - # Unicode ranges for different scripts
15   - CJK_RANGES = [
16   - (0x4E00, 0x9FFF), # CJK Unified Ideographs
17   - (0x3400, 0x4DBF), # CJK Extension A
18   - (0x20000, 0x2A6DF), # CJK Extension B
19   - (0x3040, 0x309F), # Hiragana
20   - (0x30A0, 0x30FF), # Katakana
21   - ]
22   -
23   - CYRILLIC_RANGE = (0x0400, 0x04FF)
24   - ARABIC_RANGE = (0x0600, 0x06FF)
25   - LATIN_RANGE = (0x0041, 0x007A)
  13 + """Rule-based language detector for common e-commerce query languages."""
26 14  
27 15 def __init__(self):
28   - """Initialize language detector."""
29   - self.chinese_pattern = re.compile(r'[\u4e00-\u9fff]+')
30   - self.russian_pattern = re.compile(r'[\u0400-\u04ff]+')
31   - self.arabic_pattern = re.compile(r'[\u0600-\u06ff]+')
32   - self.japanese_pattern = re.compile(r'[\u3040-\u309f\u30a0-\u30ff]+')
  16 + self._re_zh = re.compile(r"[\u4e00-\u9fff]")
  17 + self._re_ja_kana = re.compile(r"[\u3040-\u30ff]")
  18 + self._re_ko = re.compile(r"[\uac00-\ud7af]")
  19 + self._re_ru = re.compile(r"[\u0400-\u04ff]")
  20 + self._re_ar = re.compile(r"[\u0600-\u06ff]")
  21 + self._re_hi = re.compile(r"[\u0900-\u097f]")
  22 + self._re_he = re.compile(r"[\u0590-\u05ff]")
  23 + self._re_th = re.compile(r"[\u0e00-\u0e7f]")
  24 + self._re_latin_word = re.compile(r"[A-Za-zÀ-ÖØ-öø-ÿ]+")
  25 +
  26 + # Stopwords + e-commerce terms for Latin-family disambiguation.
  27 + self._latin_lexicons: Dict[str, set] = {
  28 + "en": {
  29 + "the", "and", "for", "with", "new", "women", "men", "kids",
  30 + "shoe", "shoes", "dress", "shirt", "jacket", "bag", "wireless",
  31 + },
  32 + "de": {
  33 + "der", "die", "das", "und", "mit", "für", "damen", "herren",
  34 + "kinder", "schuhe", "kleid", "hemd", "jacke", "tasche",
  35 + },
  36 + "fr": {
  37 + "le", "la", "les", "et", "avec", "pour", "femme", "homme",
  38 + "enfant", "chaussures", "robe", "chemise", "veste", "sac",
  39 + },
  40 + "es": {
  41 + "el", "la", "los", "las", "y", "con", "para", "mujer", "hombre",
  42 + "niño", "niña", "zapatos", "vestido", "camisa", "chaqueta", "bolso",
  43 + },
  44 + "it": {
  45 + "il", "lo", "la", "gli", "le", "e", "con", "per", "donna", "uomo",
  46 + "bambino", "scarpe", "abito", "camicia", "giacca", "borsa",
  47 + },
  48 + "pt": {
  49 + "o", "a", "os", "as", "e", "com", "para", "mulher", "homem",
  50 + "criança", "sapatos", "vestido", "camisa", "jaqueta", "bolsa",
  51 + },
  52 + "nl": {
  53 + "de", "het", "en", "met", "voor", "dames", "heren", "kinderen",
  54 + "schoenen", "jurk", "overhemd", "jas", "tas",
  55 + },
  56 + }
  57 + self._diacritic_weights: Dict[str, Dict[str, int]] = {
  58 + "de": {"ä": 3, "ö": 3, "ü": 3, "ß": 4},
  59 + "fr": {"é": 2, "è": 2, "ê": 2, "à": 2, "ç": 2, "ù": 2, "ô": 2},
  60 + "es": {"ñ": 3, "á": 2, "é": 2, "í": 2, "ó": 2, "ú": 2},
  61 + "it": {"à": 2, "è": 2, "é": 2, "ì": 2, "ò": 2, "ù": 2},
  62 + "pt": {"ã": 3, "õ": 3, "ç": 2, "á": 2, "â": 2, "ê": 2, "ô": 2},
  63 + "nl": {"ij": 2},
  64 + }
33 65  
34 66 def detect(self, text: str) -> str:
35 67 """
36   - Detect language of text.
37   -
38   - Args:
39   - text: Input text
  68 + Detect language code for text.
40 69  
41   - Returns:
42   - Language code: 'zh', 'en', 'ru', 'ar', 'ja', or 'unknown'
  70 + Returns one of: zh/en/ru/ar/ja/ko/de/fr/es/it/pt/nl/hi/he/th/unknown
43 71 """
44 72 if not text or not text.strip():
45   - return 'unknown'
46   -
47   - text = text.strip()
48   -
49   - # Count characters in each script
50   - char_counts = {
51   - 'chinese': 0,
52   - 'russian': 0,
53   - 'arabic': 0,
54   - 'japanese': 0,
55   - 'latin': 0
56   - }
57   -
58   - for char in text:
59   - code_point = ord(char)
60   -
61   - # Check CJK (Chinese/Japanese)
62   - is_cjk = any(start <= code_point <= end for start, end in self.CJK_RANGES)
63   - if is_cjk:
64   - char_counts['chinese'] += 1
65   -
66   - # Check Hiragana/Katakana (Japanese)
67   - if 0x3040 <= code_point <= 0x30FF:
68   - char_counts['japanese'] += 1
69   -
70   - # Check Cyrillic (Russian)
71   - if self.CYRILLIC_RANGE[0] <= code_point <= self.CYRILLIC_RANGE[1]:
72   - char_counts['russian'] += 1
73   -
74   - # Check Arabic
75   - if self.ARABIC_RANGE[0] <= code_point <= self.ARABIC_RANGE[1]:
76   - char_counts['arabic'] += 1
77   -
78   - # Check Latin
79   - if (0x0041 <= code_point <= 0x005A) or (0x0061 <= code_point <= 0x007A):
80   - char_counts['latin'] += 1
81   -
82   - # Determine dominant script
83   - total_chars = sum(char_counts.values())
84   - if total_chars == 0:
85   - return 'unknown'
86   -
87   - # Calculate percentages
88   - percentages = {
89   - script: count / total_chars
90   - for script, count in char_counts.items()
91   - }
92   -
93   - # Japanese has both Hiragana/Katakana and CJK
94   - if percentages['japanese'] > 0.1:
95   - return 'ja'
96   -
97   - # Russian (Cyrillic)
98   - if percentages['russian'] > 0.5:
99   - return 'ru'
100   -
101   - # Arabic
102   - if percentages['arabic'] > 0.5:
103   - return 'ar'
104   -
105   - # Chinese (CJK without Japanese kana)
106   - if percentages['chinese'] > 0.3:
107   - return 'zh'
108   -
109   - # English/Latin
110   - if percentages['latin'] > 0.5:
111   - return 'en'
112   -
113   - return 'unknown'
  73 + return "unknown"
  74 + q = text.strip().lower()
  75 +
  76 + # Script-first detection for non-Latin languages.
  77 + if self._re_ja_kana.search(q):
  78 + return "ja"
  79 + if self._re_ko.search(q):
  80 + return "ko"
  81 + if self._re_zh.search(q):
  82 + return "zh"
  83 + if self._re_ru.search(q):
  84 + return "ru"
  85 + if self._re_ar.search(q):
  86 + return "ar"
  87 + if self._re_hi.search(q):
  88 + return "hi"
  89 + if self._re_he.search(q):
  90 + return "he"
  91 + if self._re_th.search(q):
  92 + return "th"
  93 +
  94 + # Latin-family scoring.
  95 + tokens = self._re_latin_word.findall(q)
  96 + if not tokens:
  97 + return "unknown"
  98 +
  99 + scores: Dict[str, float] = {k: 0.0 for k in self._latin_lexicons.keys()}
  100 + scores["en"] = scores.get("en", 0.0)
  101 + token_set = set(tokens)
  102 +
  103 + # Lexicon matches
  104 + for lang, lex in self._latin_lexicons.items():
  105 + overlap = len(token_set & lex)
  106 + if overlap:
  107 + scores[lang] += overlap * 2.0
  108 +
  109 + # Diacritics / orthographic hints
  110 + for lang, hints in self._diacritic_weights.items():
  111 + for marker, weight in hints.items():
  112 + if marker in q:
  113 + scores[lang] += weight
  114 +
  115 + # Light suffix hints for common product words
  116 + for t in tokens:
  117 + if t.endswith("ung") or t.endswith("chen"):
  118 + scores["de"] += 0.6
  119 + if t.endswith("ción") or t.endswith("ado") or t.endswith("ada"):
  120 + scores["es"] += 0.6
  121 + if t.endswith("zione") or t.endswith("etto") or t.endswith("ella"):
  122 + scores["it"] += 0.6
  123 + if t.endswith("ção") or t.endswith("mente"):
  124 + scores["pt"] += 0.6
  125 + if t.endswith("ment") or t.endswith("eau"):
  126 + scores["fr"] += 0.5
  127 +
  128 + # Fallback preference: English for pure Latin short tokens.
  129 + scores["en"] += 0.2
  130 +
  131 + best_lang = max(scores.items(), key=lambda x: x[1])[0]
  132 + best_score = scores[best_lang]
  133 + if best_score <= 0:
  134 + return "en"
  135 + return best_lang
114 136  
115 137 def is_chinese(self, text: str) -> bool:
116   - """Check if text is primarily Chinese."""
117   - return self.detect(text) == 'zh'
  138 + return self.detect(text) == "zh"
118 139  
119 140 def is_english(self, text: str) -> bool:
120   - """Check if text is primarily English."""
121   - return self.detect(text) == 'en'
  141 + return self.detect(text) == "en"
122 142  
123 143 def is_russian(self, text: str) -> bool:
124   - """Check if text is primarily Russian."""
125   - return self.detect(text) == 'ru'
  144 + return self.detect(text) == "ru"
126 145  
127 146 def is_arabic(self, text: str) -> bool:
128   - """Check if text is primarily Arabic."""
129   - return self.detect(text) == 'ar'
  147 + return self.detect(text) == "ar"
130 148  
131 149 def is_japanese(self, text: str) -> bool:
132   - """Check if text is primarily Japanese."""
133   - return self.detect(text) == 'ja'
  150 + return self.detect(text) == "ja"
... ...
query/query_parser.py
... ... @@ -37,7 +37,11 @@ class ParsedQuery:
37 37 domain: str = "default",
38 38 keywords: str = "",
39 39 token_count: int = 0,
40   - query_tokens: Optional[List[str]] = None
  40 + query_tokens: Optional[List[str]] = None,
  41 + query_text_by_lang: Optional[Dict[str, str]] = None,
  42 + search_langs: Optional[List[str]] = None,
  43 + index_languages: Optional[List[str]] = None,
  44 + source_in_index_languages: bool = True,
41 45 ):
42 46 self.original_query = original_query
43 47 self.query_normalized = query_normalized
... ... @@ -50,6 +54,10 @@ class ParsedQuery:
50 54 self.keywords = keywords
51 55 self.token_count = token_count
52 56 self.query_tokens = query_tokens or []
  57 + self.query_text_by_lang = query_text_by_lang or {}
  58 + self.search_langs = search_langs or []
  59 + self.index_languages = index_languages or []
  60 + self.source_in_index_languages = bool(source_in_index_languages)
53 61  
54 62 def to_dict(self) -> Dict[str, Any]:
55 63 """Convert to dictionary representation."""
... ... @@ -61,6 +69,10 @@ class ParsedQuery:
61 69 "translations": self.translations,
62 70 "domain": self.domain
63 71 }
  72 + result["query_text_by_lang"] = self.query_text_by_lang
  73 + result["search_langs"] = self.search_langs
  74 + result["index_languages"] = self.index_languages
  75 + result["source_in_index_languages"] = self.source_in_index_languages
64 76 return result
65 77  
66 78  
... ... @@ -253,12 +265,21 @@ class QueryParser:
253 265 # Stage 4: Translation (with async support and conditional waiting)
254 266 translations = {}
255 267 translation_futures = {}
  268 + index_langs = ["en", "zh"]
256 269 try:
257 270 # 根据租户配置的 index_languages 决定翻译目标语言
258 271 from config.tenant_config_loader import get_tenant_config_loader
259 272 tenant_loader = get_tenant_config_loader()
260 273 tenant_cfg = tenant_loader.get_tenant_config(tenant_id or "default")
261   - index_langs = tenant_cfg.get("index_languages") or ["en", "zh"]
  274 + raw_index_langs = tenant_cfg.get("index_languages") or ["en", "zh"]
  275 + index_langs = []
  276 + seen_langs = set()
  277 + for lang in raw_index_langs:
  278 + norm_lang = str(lang or "").strip().lower()
  279 + if not norm_lang or norm_lang in seen_langs:
  280 + continue
  281 + seen_langs.add(norm_lang)
  282 + index_langs.append(norm_lang)
262 283  
263 284 target_langs_for_translation = [lang for lang in index_langs if lang != detected_lang]
264 285  
... ... @@ -269,8 +290,12 @@ class QueryParser:
269 290 # Use e-commerce context for better disambiguation
270 291 translation_context = self.config.query_config.translation_context
271 292 # For query translation, we use a general prompt (not language-specific)
272   - query_prompt = self.config.query_config.translation_prompts.get('query_zh') or \
273   - self.config.query_config.translation_prompts.get('default_zh')
  293 + query_prompt = (
  294 + self.config.query_config.translation_prompts.get(f"query_{detected_lang}")
  295 + or self.config.query_config.translation_prompts.get("query_en")
  296 + or self.config.query_config.translation_prompts.get("default_en")
  297 + or self.config.query_config.translation_prompts.get("default_zh")
  298 + )
274 299  
275 300 # Determine if we need to wait for translation results
276 301 # If detected_lang is not in index_languages, we must wait for translation
... ... @@ -417,6 +442,33 @@ class QueryParser:
417 442 # Update translations in context after all are complete
418 443 if translations and context:
419 444 context.store_intermediate_result('translations', translations)
  445 +
  446 + # Build language-scoped query plan: source language + available translations
  447 + query_text_by_lang: Dict[str, str] = {}
  448 + if query_text:
  449 + query_text_by_lang[detected_lang] = query_text
  450 + for lang, translated_text in (translations or {}).items():
  451 + if translated_text and str(translated_text).strip():
  452 + query_text_by_lang[str(lang).strip().lower()] = str(translated_text)
  453 +
  454 + source_in_index_languages = detected_lang in index_langs
  455 + ordered_search_langs: List[str] = []
  456 + seen_order = set()
  457 + if detected_lang in query_text_by_lang:
  458 + ordered_search_langs.append(detected_lang)
  459 + seen_order.add(detected_lang)
  460 + for lang in index_langs:
  461 + if lang in query_text_by_lang and lang not in seen_order:
  462 + ordered_search_langs.append(lang)
  463 + seen_order.add(lang)
  464 + for lang in query_text_by_lang.keys():
  465 + if lang not in seen_order:
  466 + ordered_search_langs.append(lang)
  467 + seen_order.add(lang)
  468 +
  469 + if context:
  470 + context.store_intermediate_result("search_langs", ordered_search_langs)
  471 + context.store_intermediate_result("query_text_by_lang", query_text_by_lang)
420 472  
421 473 # Build result
422 474 result = ParsedQuery(
... ... @@ -429,7 +481,11 @@ class QueryParser:
429 481 domain=domain,
430 482 keywords=keywords,
431 483 token_count=token_count,
432   - query_tokens=query_tokens
  484 + query_tokens=query_tokens,
  485 + query_text_by_lang=query_text_by_lang,
  486 + search_langs=ordered_search_langs,
  487 + index_languages=index_langs,
  488 + source_in_index_languages=source_in_index_languages,
433 489 )
434 490  
435 491 if context and hasattr(context, 'logger'):
... ...
query/query_rewriter.py
... ... @@ -19,7 +19,7 @@ class QueryRewriter:
19 19  
20 20 Args:
21 21 rewrite_dict: Dictionary mapping exact query terms to rewrite expressions
22   - e.g., {"芭比": "brand:芭比 OR name:芭比娃娃"}
  22 + e.g., {"芭比": "芭比娃娃"}
23 23 Only full word matches will be rewritten, no partial matching.
24 24 """
25 25 self.rewrite_dict = rewrite_dict or {}
... ... @@ -107,13 +107,13 @@ class QueryNormalizer:
107 107 return query
108 108  
109 109 @staticmethod
110   - def remove_punctuation(query: str, keep_operators: bool = True) -> str:
  110 + def remove_punctuation(query: str, keep_operators: bool = False) -> str:
111 111 """
112 112 Remove punctuation from query.
113 113  
114 114 Args:
115 115 query: Original query
116   - keep_operators: Whether to keep boolean operators (AND, OR, etc.)
  116 + keep_operators: Whether to keep symbols used in old query syntax.
117 117  
118 118 Returns:
119 119 Query without punctuation
... ...
search/__init__.py
1 1 """Search package initialization."""
2 2  
3   -from .boolean_parser import BooleanParser, QueryNode
4 3 from .es_query_builder import ESQueryBuilder
5 4 from .searcher import Searcher, SearchResult
6 5  
7 6 __all__ = [
8   - 'BooleanParser',
9   - 'QueryNode',
10 7 'ESQueryBuilder',
11 8 'Searcher',
12 9 'SearchResult',
... ...
search/boolean_parser.py deleted
... ... @@ -1,201 +0,0 @@
1   -"""
2   -Boolean expression parser for search queries.
3   -
4   -Supports: AND, OR, RANK, ANDNOT operators with parentheses.
5   -Precedence (high to low): (), ANDNOT, AND, OR, RANK
6   -"""
7   -
8   -import re
9   -from typing import List, Tuple, Optional
10   -from dataclasses import dataclass
11   -
12   -
13   -@dataclass
14   -class QueryNode:
15   - """Represents a node in the parsed query tree."""
16   - operator: str # 'AND', 'OR', 'RANK', 'ANDNOT', 'TERM'
17   - terms: List['QueryNode'] = None # Child nodes for operators
18   - value: str = None # Value for leaf nodes (TERM)
19   -
20   - def __repr__(self):
21   - if self.operator == 'TERM':
22   - return f"TERM({self.value})"
23   - else:
24   - return f"{self.operator}({', '.join(str(t) for t in self.terms)})"
25   -
26   -
27   -class BooleanParser:
28   - """
29   - Parser for boolean search expressions.
30   -
31   - Operator precedence (high to low):
32   - 1. () - Parentheses
33   - 2. ANDNOT - AND NOT (exclusion)
34   - 3. AND - All terms must match
35   - 4. OR - Any term must match
36   - 5. RANK - Scoring boost (like OR but affects ranking)
37   - """
38   -
39   - OPERATORS = {'AND', 'OR', 'RANK', 'ANDNOT'}
40   - PRECEDENCE = {
41   - 'ANDNOT': 3,
42   - 'AND': 2,
43   - 'OR': 1,
44   - 'RANK': 0
45   - }
46   -
47   - def __init__(self):
48   - """Initialize boolean parser."""
49   - pass
50   -
51   - def parse(self, expression: str) -> QueryNode:
52   - """
53   - Parse boolean expression into query tree.
54   -
55   - Args:
56   - expression: Boolean expression string
57   - Example: "laptop AND (gaming OR professional) ANDNOT cheap"
58   -
59   - Returns:
60   - Root QueryNode of parsed tree
61   - """
62   - if not expression or not expression.strip():
63   - return QueryNode(operator='TERM', value='')
64   -
65   - # Tokenize
66   - tokens = self._tokenize(expression)
67   -
68   - if not tokens:
69   - return QueryNode(operator='TERM', value='')
70   -
71   - # Parse with precedence
72   - return self._parse_expression(tokens)
73   -
74   - def _tokenize(self, expression: str) -> List[str]:
75   - """
76   - Tokenize expression into terms and operators.
77   -
78   - Args:
79   - expression: Expression string
80   -
81   - Returns:
82   - List of tokens
83   - """
84   - # Pattern to match: operators, parentheses, or terms (with domain prefix support)
85   - pattern = r'\b(AND|OR|RANK|ANDNOT)\b|[()]|(?:\w+:)?[^\s()]+'
86   -
87   - tokens = []
88   - for match in re.finditer(pattern, expression):
89   - token = match.group().strip()
90   - if token:
91   - tokens.append(token)
92   -
93   - return tokens
94   -
95   - def _parse_expression(self, tokens: List[str], start: int = 0) -> Tuple[QueryNode, int]:
96   - """
97   - Parse expression with operator precedence.
98   -
99   - Args:
100   - tokens: List of tokens
101   - start: Starting index
102   -
103   - Returns:
104   - Tuple of (QueryNode, next_index)
105   - """
106   - # Start with lowest precedence (RANK)
107   - return self._parse_rank(tokens, start)
108   -
109   - def _parse_rank(self, tokens: List[str], start: int) -> Tuple[QueryNode, int]:
110   - """Parse RANK operator (lowest precedence)."""
111   - left, pos = self._parse_or(tokens, start)
112   -
113   - while pos < len(tokens) and tokens[pos] == 'RANK':
114   - pos += 1 # Skip 'RANK'
115   - right, pos = self._parse_or(tokens, pos)
116   - left = QueryNode(operator='RANK', terms=[left, right])
117   -
118   - return left, pos
119   -
120   - def _parse_or(self, tokens: List[str], start: int) -> Tuple[QueryNode, int]:
121   - """Parse OR operator."""
122   - left, pos = self._parse_and(tokens, start)
123   -
124   - while pos < len(tokens) and tokens[pos] == 'OR':
125   - pos += 1 # Skip 'OR'
126   - right, pos = self._parse_and(tokens, pos)
127   - left = QueryNode(operator='OR', terms=[left, right])
128   -
129   - return left, pos
130   -
131   - def _parse_and(self, tokens: List[str], start: int) -> Tuple[QueryNode, int]:
132   - """Parse AND operator."""
133   - left, pos = self._parse_andnot(tokens, start)
134   -
135   - while pos < len(tokens) and tokens[pos] == 'AND':
136   - pos += 1 # Skip 'AND'
137   - right, pos = self._parse_andnot(tokens, pos)
138   - left = QueryNode(operator='AND', terms=[left, right])
139   -
140   - return left, pos
141   -
142   - def _parse_andnot(self, tokens: List[str], start: int) -> Tuple[QueryNode, int]:
143   - """Parse ANDNOT operator (highest precedence)."""
144   - left, pos = self._parse_primary(tokens, start)
145   -
146   - while pos < len(tokens) and tokens[pos] == 'ANDNOT':
147   - pos += 1 # Skip 'ANDNOT'
148   - right, pos = self._parse_primary(tokens, pos)
149   - left = QueryNode(operator='ANDNOT', terms=[left, right])
150   -
151   - return left, pos
152   -
153   - def _parse_primary(self, tokens: List[str], start: int) -> Tuple[QueryNode, int]:
154   - """Parse primary expression (terms or parentheses)."""
155   - if start >= len(tokens):
156   - return QueryNode(operator='TERM', value=''), start
157   -
158   - token = tokens[start]
159   -
160   - # Handle parentheses
161   - if token == '(':
162   - # Find matching closing parenthesis
163   - depth = 1
164   - pos = start + 1
165   - while pos < len(tokens) and depth > 0:
166   - if tokens[pos] == '(':
167   - depth += 1
168   - elif tokens[pos] == ')':
169   - depth -= 1
170   - pos += 1
171   -
172   - # Parse contents of parentheses
173   - inner_tokens = tokens[start + 1:pos - 1]
174   - if inner_tokens:
175   - node, _ = self._parse_expression(inner_tokens, 0)
176   - return node, pos
177   - else:
178   - return QueryNode(operator='TERM', value=''), pos
179   -
180   - # Handle term
181   - if token not in self.OPERATORS and token not in ['(', ')']:
182   - return QueryNode(operator='TERM', value=token), start + 1
183   -
184   - # Unexpected token
185   - return QueryNode(operator='TERM', value=''), start + 1
186   -
187   - def is_simple_query(self, expression: str) -> bool:
188   - """
189   - Check if query is simple (no boolean operators).
190   -
191   - Args:
192   - expression: Query expression
193   -
194   - Returns:
195   - True if simple query (no operators)
196   - """
197   - tokens = self._tokenize(expression)
198   - for token in tokens:
199   - if token in self.OPERATORS:
200   - return False
201   - return True
search/es_query_builder.py
... ... @@ -10,7 +10,6 @@ Simplified architecture:
10 10  
11 11 from typing import Dict, Any, List, Optional, Union, Tuple
12 12 import numpy as np
13   -from .boolean_parser import QueryNode
14 13 from config import FunctionScoreConfig
15 14  
16 15  
... ... @@ -20,18 +19,31 @@ class ESQueryBuilder:
20 19 def __init__(
21 20 self,
22 21 match_fields: List[str],
  22 + field_boosts: Optional[Dict[str, float]] = None,
  23 + multilingual_fields: Optional[List[str]] = None,
  24 + shared_fields: Optional[List[str]] = None,
  25 + core_multilingual_fields: Optional[List[str]] = None,
23 26 text_embedding_field: Optional[str] = None,
24 27 image_embedding_field: Optional[str] = None,
25 28 source_fields: Optional[List[str]] = None,
26 29 function_score_config: Optional[FunctionScoreConfig] = None,
27 30 default_language: str = "en",
28   - knn_boost: float = 0.25
  31 + knn_boost: float = 0.25,
  32 + base_minimum_should_match: str = "75%",
  33 + translation_minimum_should_match: str = "75%",
  34 + translation_boost: float = 0.4,
  35 + translation_boost_when_source_missing: float = 1.0,
  36 + source_boost_when_missing: float = 0.6,
  37 + keywords_boost: float = 0.1,
  38 + enable_phrase_query: bool = True,
  39 + tie_breaker_base_query: float = 0.9,
  40 + tie_breaker_keywords: float = 0.9,
29 41 ):
30 42 """
31 43 Initialize query builder.
32 44  
33 45 Multi-language search (translation-based cross-language recall) is always enabled:
34   - queries are matched against both detected-language and translated zh/en clauses.
  46 + queries are matched against detected-language and translated target-language clauses.
35 47  
36 48 Args:
37 49 match_fields: Fields to search for text matching
... ... @@ -43,12 +55,27 @@ class ESQueryBuilder:
43 55 knn_boost: Boost value for KNN (embedding recall)
44 56 """
45 57 self.match_fields = match_fields
  58 + self.field_boosts = field_boosts or {}
  59 + self.multilingual_fields = multilingual_fields or [
  60 + "title", "brief", "description", "vendor", "category_path", "category_name_text"
  61 + ]
  62 + self.shared_fields = shared_fields or ["tags", "option1_values", "option2_values", "option3_values"]
  63 + self.core_multilingual_fields = core_multilingual_fields or ["title", "brief", "vendor", "category_name_text"]
46 64 self.text_embedding_field = text_embedding_field
47 65 self.image_embedding_field = image_embedding_field
48 66 self.source_fields = source_fields
49 67 self.function_score_config = function_score_config
50 68 self.default_language = default_language
51 69 self.knn_boost = knn_boost
  70 + self.base_minimum_should_match = base_minimum_should_match
  71 + self.translation_minimum_should_match = translation_minimum_should_match
  72 + self.translation_boost = float(translation_boost)
  73 + self.translation_boost_when_source_missing = float(translation_boost_when_source_missing)
  74 + self.source_boost_when_missing = float(source_boost_when_missing)
  75 + self.keywords_boost = float(keywords_boost)
  76 + self.enable_phrase_query = bool(enable_phrase_query)
  77 + self.tie_breaker_base_query = float(tie_breaker_base_query)
  78 + self.tie_breaker_keywords = float(tie_breaker_keywords)
52 79  
53 80 def _apply_source_filter(self, es_query: Dict[str, Any]) -> None:
54 81 """
... ... @@ -118,7 +145,6 @@ class ESQueryBuilder:
118 145 self,
119 146 query_text: str,
120 147 query_vector: Optional[np.ndarray] = None,
121   - query_node: Optional[QueryNode] = None,
122 148 filters: Optional[Dict[str, Any]] = None,
123 149 range_filters: Optional[Dict[str, Any]] = None,
124 150 facet_configs: Optional[List[Any]] = None,
... ... @@ -136,14 +162,13 @@ class ESQueryBuilder:
136 162 结构:filters and (text_recall or embedding_recall) + post_filter
137 163 - conjunctive_filters: 应用在 query.bool.filter(影响结果和聚合)
138 164 - disjunctive_filters: 应用在 post_filter(只影响结果,不影响聚合)
139   - - text_recall: 文本相关性召回(中英文字段都用
  165 + - text_recall: 文本相关性召回(按 search_langs 动态语言字段
140 166 - embedding_recall: 向量召回(KNN)
141 167 - function_score: 包装召回部分,支持提权字段
142 168  
143 169 Args:
144 170 query_text: Query text for BM25 matching
145 171 query_vector: Query embedding for KNN search
146   - query_node: Parsed boolean expression tree
147 172 filters: Exact match filters
148 173 range_filters: Range filters for numeric fields (always applied in query)
149 174 facet_configs: Facet configurations (used to identify multi-select facets)
... ... @@ -157,6 +182,7 @@ class ESQueryBuilder:
157 182 Returns:
158 183 ES query DSL dictionary
159 184 """
  185 + # Boolean AST path has been removed; keep a single text strategy.
160 186 es_query = {
161 187 "size": size,
162 188 "from": from_
... ... @@ -170,12 +196,8 @@ class ESQueryBuilder:
170 196  
171 197 # Text recall (always include if query_text exists)
172 198 if query_text:
173   - if query_node and query_node.operator != 'TERM':
174   - # Complex boolean query
175   - text_query = self._build_boolean_query(query_node)
176   - else:
177   - # Simple text query - use advanced should-based multi-query strategy
178   - text_query = self._build_advanced_text_query(query_text, parsed_query)
  199 + # Unified text query strategy
  200 + text_query = self._build_advanced_text_query(query_text, parsed_query)
179 201 recall_clauses.append(text_query)
180 202  
181 203 # Embedding recall (KNN - separate from query, handled below)
... ... @@ -379,50 +401,49 @@ class ESQueryBuilder:
379 401 }
380 402 }
381 403  
  404 + def _format_field_with_boost(self, field_name: str, boost: float) -> str:
  405 + if abs(float(boost) - 1.0) < 1e-9:
  406 + return field_name
  407 + return f"{field_name}^{boost}"
  408 +
  409 + def _get_field_boost(self, base_field: str, language: Optional[str] = None) -> float:
  410 + # Language-specific override first (e.g. title.de), then base field (e.g. title)
  411 + if language:
  412 + lang_key = f"{base_field}.{language}"
  413 + if lang_key in self.field_boosts:
  414 + return float(self.field_boosts[lang_key])
  415 + if base_field in self.field_boosts:
  416 + return float(self.field_boosts[base_field])
  417 + return 1.0
  418 +
382 419 def _get_match_fields(self, language: str) -> Tuple[List[str], List[str]]:
383 420 """
384   - Get match fields for a specific language.
  421 + Build dynamic match fields for one language.
385 422  
386 423 Args:
387   - language: Language code ('zh' or 'en')
  424 + language: Language code (e.g. zh/en/de/fr/...)
388 425  
389 426 Returns:
390 427 (all_fields, core_fields) - core_fields are for phrase/keyword queries
391 428 """
392   - if language == 'zh':
393   - all_fields = [
394   - "title.zh^3.0",
395   - "brief.zh^1.5",
396   - "description.zh",
397   - "vendor.zh^1.5",
398   - "tags",
399   - "category_path.zh^1.5",
400   - "category_name_text.zh^1.5",
401   - "option1_values^0.5"
402   - ]
403   - core_fields = [
404   - "title.zh^3.0",
405   - "brief.zh^1.5",
406   - "vendor.zh^1.5",
407   - "category_name_text.zh^1.5"
408   - ]
409   - else: # en
410   - all_fields = [
411   - "title.en^3.0",
412   - "brief.en^1.5",
413   - "description.en",
414   - "vendor.en^1.5",
415   - "tags",
416   - "category_path.en^1.5",
417   - "category_name_text.en^1.5",
418   - "option1_values^0.5"
419   - ]
420   - core_fields = [
421   - "title.en^3.0",
422   - "brief.en^1.5",
423   - "vendor.en^1.5",
424   - "category_name_text.en^1.5"
425   - ]
  429 + lang = (language or "").strip().lower()
  430 + all_fields: List[str] = []
  431 + core_fields: List[str] = []
  432 +
  433 + for base in self.multilingual_fields:
  434 + field = f"{base}.{lang}"
  435 + boost = self._get_field_boost(base, lang)
  436 + all_fields.append(self._format_field_with_boost(field, boost))
  437 +
  438 + for shared in self.shared_fields:
  439 + boost = self._get_field_boost(shared, None)
  440 + all_fields.append(self._format_field_with_boost(shared, boost))
  441 +
  442 + for base in self.core_multilingual_fields:
  443 + field = f"{base}.{lang}"
  444 + boost = self._get_field_boost(base, lang)
  445 + core_fields.append(self._format_field_with_boost(field, boost))
  446 +
426 447 return all_fields, core_fields
427 448  
428 449 def _get_embedding_field(self, language: str) -> str:
... ... @@ -434,9 +455,9 @@ class ESQueryBuilder:
434 455 """
435 456 Build advanced text query using should clauses with multiple query strategies.
436 457  
437   - Reference implementation:
438   - - base_query: main query with AND operator and 75% minimum_should_match
439   - - translation queries: lower boost (0.4) for other languages
  458 + Unified implementation:
  459 + - base_query: source-language clause
  460 + - translation queries: target-language clauses from search_langs/query_text_by_lang
440 461 - phrase query: for short queries (2+ tokens)
441 462 - keywords query: extracted nouns from query
442 463 - KNN query: added separately in build_query
... ... @@ -451,94 +472,89 @@ class ESQueryBuilder:
451 472 should_clauses = []
452 473  
453 474 # Get query analysis from parsed_query
454   - translations = {}
455   - language = self.default_language
  475 + query_text_by_lang: Dict[str, str] = {}
  476 + search_langs: List[str] = []
  477 + source_lang = self.default_language
  478 + source_in_index_languages = True
456 479 keywords = ""
457 480 query_tokens = []
458 481 token_count = 0
459 482  
460 483 if parsed_query:
461   - translations = parsed_query.translations or {}
462   - # Use default language if detected_language is None or "unknown"
463   - detected_lang = parsed_query.detected_language
464   - if not detected_lang or detected_lang == "unknown":
465   - language = self.default_language
466   - else:
467   - language = detected_lang
  484 + query_text_by_lang = getattr(parsed_query, "query_text_by_lang", None) or {}
  485 + search_langs = getattr(parsed_query, "search_langs", None) or []
  486 + detected_lang = getattr(parsed_query, "detected_language", None)
  487 + source_lang = detected_lang if detected_lang and detected_lang != "unknown" else self.default_language
  488 + source_in_index_languages = bool(
  489 + getattr(parsed_query, "source_in_index_languages", True)
  490 + )
468 491 keywords = getattr(parsed_query, 'keywords', '') or ""
469 492 query_tokens = getattr(parsed_query, 'query_tokens', None) or []
470 493 token_count = len(query_tokens) or getattr(parsed_query, 'token_count', 0) or 0
471 494  
472   - # Get match fields for the detected language
473   - match_fields, core_fields = self._get_match_fields(language)
474   -
475   - # Tie breaker values
476   - tie_breaker_base_query = 0.9
477   - tie_breaker_keywords = 0.9
478   -
479   - # 1. Base query - main query with AND operator
480   - should_clauses.append({
481   - "multi_match": {
482   - "_name": "base_query",
483   - "fields": match_fields,
484   - "minimum_should_match": "75%",
485   - # "operator": "AND",
486   - "query": query_text,
487   - "tie_breaker": tie_breaker_base_query
488   - }
489   - })
490   -
491   - # 2. Translation queries - lower boost (0.4) for other languages (multi-language search always on)
492   - if language != 'zh' and translations.get('zh'):
493   - zh_fields, _ = self._get_match_fields('zh')
494   - should_clauses.append({
495   - "multi_match": {
496   - "query": translations['zh'],
497   - "fields": zh_fields,
498   - "minimum_should_match": "75%",
499   - "tie_breaker": tie_breaker_base_query,
500   - "boost": 0.4,
501   - "_name": "base_query_trans_zh"
502   - }
503   - })
504   - if language != 'en' and translations.get('en'):
505   - en_fields, _ = self._get_match_fields('en')
506   - should_clauses.append({
507   - "multi_match": {
508   - "query": translations['en'],
509   - "fields": en_fields,
510   - "minimum_should_match": "75%",
511   - "tie_breaker": tie_breaker_base_query,
512   - "boost": 0.4,
513   - "_name": "base_query_trans_en"
514   - }
515   - })
  495 + if not query_text_by_lang:
  496 + query_text_by_lang = {source_lang: query_text}
  497 + if source_lang not in query_text_by_lang and query_text:
  498 + query_text_by_lang[source_lang] = query_text
  499 + if not search_langs:
  500 + search_langs = list(query_text_by_lang.keys())
  501 +
  502 + # Core fields for phrase/keyword based on source language.
  503 + _, core_fields = self._get_match_fields(source_lang)
  504 + if not core_fields and search_langs:
  505 + _, core_fields = self._get_match_fields(search_langs[0])
  506 +
  507 + # Base + translated clauses based on language plan.
  508 + for lang in search_langs:
  509 + lang_query = query_text_by_lang.get(lang)
  510 + if not lang_query:
  511 + continue
  512 + match_fields, _ = self._get_match_fields(lang)
  513 + if not match_fields:
  514 + continue
516 515  
517   - if False and is_long_query:
518   - boost = 0.5 * pow(min(1.0, token_count / 10.0), 0.9)
519   - minimum_should_match = "70%"
520   - should_clauses.append({
  516 + is_source = (lang == source_lang)
  517 + clause_boost = 1.0
  518 + clause_name = "base_query" if is_source else f"base_query_trans_{lang}"
  519 + minimum_should_match = (
  520 + self.base_minimum_should_match if is_source else self.translation_minimum_should_match
  521 + )
  522 + if is_source and not source_in_index_languages:
  523 + clause_boost = self.source_boost_when_missing
  524 + elif not is_source:
  525 + clause_boost = (
  526 + self.translation_boost
  527 + if source_in_index_languages
  528 + else self.translation_boost_when_source_missing
  529 + )
  530 +
  531 + clause = {
521 532 "multi_match": {
522   - "query": query_text,
  533 + "_name": clause_name,
523 534 "fields": match_fields,
524 535 "minimum_should_match": minimum_should_match,
525   - "boost": boost,
526   - "tie_breaker": tie_breaker_long_query,
527   - "_name": "long_query"
  536 + "query": lang_query,
  537 + "tie_breaker": self.tie_breaker_base_query,
528 538 }
  539 + }
  540 + if abs(clause_boost - 1.0) > 1e-9:
  541 + clause["multi_match"]["boost"] = clause_boost
  542 + should_clauses.append({
  543 + "multi_match": clause["multi_match"]
529 544 })
530 545  
531 546 # 3. Short query - add phrase query (derived from query_tokens)
532 547 # is_short: quoted or ((token_count <= 2 or len <= 4) and no space)
533   - ENABLE_PHRASE_QUERY = True
  548 + source_query_text = query_text_by_lang.get(source_lang) or query_text
  549 + ENABLE_PHRASE_QUERY = self.enable_phrase_query
534 550 is_quoted = query_text.startswith('"') and query_text.endswith('"')
535 551 is_short = is_quoted or ((token_count <= 2 or len(query_text) <= 4) and ' ' not in query_text)
536   - if ENABLE_PHRASE_QUERY and token_count >= 2 and is_short:
  552 + if ENABLE_PHRASE_QUERY and core_fields and token_count >= 2 and is_short:
537 553 query_length = len(query_text)
538 554 slop = 0 if query_length < 3 else 1 if query_length < 5 else 2
539 555 should_clauses.append({
540 556 "multi_match": {
541   - "query": query_text,
  557 + "query": source_query_text,
542 558 "fields": core_fields,
543 559 "type": "phrase",
544 560 "slop": slop,
... ... @@ -548,18 +564,31 @@ class ESQueryBuilder:
548 564 })
549 565  
550 566 # 4. Keywords query - extracted nouns from query
551   - elif keywords and len(keywords.split()) <= 2 and 2 * len(keywords.replace(' ', '')) <= len(query_text):
  567 + elif core_fields and keywords and len(keywords.split()) <= 2 and 2 * len(keywords.replace(' ', '')) <= len(query_text):
552 568 should_clauses.append({
553 569 "multi_match": {
554 570 "query": keywords,
555 571 "fields": core_fields,
556 572 # "operator": "AND",
557   - "tie_breaker": tie_breaker_keywords,
558   - "boost": 0.1,
  573 + "tie_breaker": self.tie_breaker_keywords,
  574 + "boost": self.keywords_boost,
559 575 "_name": "keywords_query"
560 576 }
561 577 })
562 578  
  579 + # Fallback to a simple query when language fields cannot be resolved.
  580 + if not should_clauses:
  581 + fallback_fields = self.match_fields or ["title.en^1.0"]
  582 + return {
  583 + "multi_match": {
  584 + "_name": "base_query_fallback",
  585 + "query": query_text,
  586 + "fields": fallback_fields,
  587 + "minimum_should_match": self.base_minimum_should_match,
  588 + "tie_breaker": self.tie_breaker_base_query,
  589 + }
  590 + }
  591 +
563 592 # Return bool query with should clauses
564 593 if len(should_clauses) == 1:
565 594 return should_clauses[0]
... ... @@ -571,70 +600,6 @@ class ESQueryBuilder:
571 600 }
572 601 }
573 602  
def _build_boolean_query(self, node: QueryNode) -> Dict[str, Any]:
    """
    Recursively translate a boolean expression tree into an ES query clause.

    Args:
        node: Root of the (sub)tree to translate.

    Returns:
        ES query clause equivalent to the subtree rooted at `node`.
    """
    op = node.operator

    # Leaf: a plain text term delegates to the standard text query builder.
    if op == 'TERM':
        return self._build_text_query(node.value)

    # Degenerate ANDNOT with a single operand behaves like that operand.
    if op == 'ANDNOT' and len(node.terms) < 2:
        return self._build_boolean_query(node.terms[0])

    if op == 'ANDNOT':
        # First operand must match, second operand must not.
        return {
            "bool": {
                "must": [self._build_boolean_query(node.terms[0])],
                "must_not": [self._build_boolean_query(node.terms[1])],
            }
        }

    if op in ('AND', 'OR', 'RANK'):
        children = [self._build_boolean_query(term) for term in node.terms]
        if op == 'AND':
            # Every operand must match.
            return {"bool": {"must": children}}
        if op == 'OR':
            # At least one operand must match.
            return {"bool": {"should": children, "minimum_should_match": 1}}
        # RANK: OR-like, but operands only contribute to scoring.
        return {"bool": {"should": children}}

    # Unknown operator: degrade to match-everything rather than fail.
    return {"match_all": {}}
638 603 def _build_filters(
639 604 self,
640 605 filters: Optional[Dict[str, Any]] = None,
... ...
search/searcher.py
1 1 """
2 2 Main Searcher module - executes search queries against Elasticsearch.
3 3  
4   -Handles query parsing, boolean expressions, ranking, and result formatting.
  4 +Handles query parsing, ranking, and result formatting.
5 5 """
6 6  
7 7 from typing import Dict, Any, List, Optional, Union
... ... @@ -12,11 +12,9 @@ import logging
12 12 from utils.es_client import ESClient
13 13 from query import QueryParser, ParsedQuery
14 14 from embeddings.image_encoder import CLIPImageEncoder
15   -from .boolean_parser import BooleanParser, QueryNode
16 15 from .es_query_builder import ESQueryBuilder
17 16 from config import SearchConfig
18 17 from config.tenant_config_loader import get_tenant_config_loader
19   -from config.utils import get_match_fields_for_index
20 18 from context.request_context import RequestContext, RequestContextStage
21 19 from api.models import FacetResult, FacetValue, FacetConfig
22 20 from api.result_formatter import ResultFormatter
... ... @@ -73,7 +71,7 @@ class Searcher:
73 71  
74 72 Handles:
75 73 - Query parsing and translation
76   - - Boolean expression parsing
  74 + - Dynamic multi-language text recall planning
77 75 - ES query building
78 76 - Result ranking and formatting
79 77 """
... ... @@ -98,12 +96,6 @@ class Searcher:
98 96 self.config = config
99 97 # Index name is now generated dynamically per tenant, no longer stored here
100 98 self.query_parser = query_parser or QueryParser(config)
101   -
102   - # Initialize components
103   - self.boolean_parser = BooleanParser()
104   -
105   - # Get match fields from config
106   - self.match_fields = get_match_fields_for_index(config, "default")
107 99 self.text_embedding_field = config.query_config.text_embedding_field or "title_embedding"
108 100 self.image_embedding_field = config.query_config.image_embedding_field
109 101 if self.image_embedding_field and image_encoder is None:
... ... @@ -114,13 +106,26 @@ class Searcher:
114 106  
115 107 # Query builder - simplified single-layer architecture
116 108 self.query_builder = ESQueryBuilder(
117   - match_fields=self.match_fields,
  109 + match_fields=[],
  110 + field_boosts=self.config.field_boosts,
  111 + multilingual_fields=self.config.query_config.multilingual_fields,
  112 + shared_fields=self.config.query_config.shared_fields,
  113 + core_multilingual_fields=self.config.query_config.core_multilingual_fields,
118 114 text_embedding_field=self.text_embedding_field,
119 115 image_embedding_field=self.image_embedding_field,
120 116 source_fields=self.source_fields,
121 117 function_score_config=self.config.function_score,
122 118 default_language=self.config.query_config.default_language,
123   - knn_boost=self.config.query_config.knn_boost
  119 + knn_boost=self.config.query_config.knn_boost,
  120 + base_minimum_should_match=self.config.query_config.base_minimum_should_match,
  121 + translation_minimum_should_match=self.config.query_config.translation_minimum_should_match,
  122 + translation_boost=self.config.query_config.translation_boost,
  123 + translation_boost_when_source_missing=self.config.query_config.translation_boost_when_source_missing,
  124 + source_boost_when_missing=self.config.query_config.source_boost_when_missing,
  125 + keywords_boost=self.config.query_config.keywords_boost,
  126 + enable_phrase_query=self.config.query_config.enable_phrase_query,
  127 + tie_breaker_base_query=self.config.query_config.tie_breaker_base_query,
  128 + tie_breaker_keywords=self.config.query_config.tie_breaker_keywords,
124 129 )
125 130  
126 131 def _apply_source_filter(self, es_query: Dict[str, Any]) -> None:
... ... @@ -250,7 +255,7 @@ class Searcher:
250 255 translations=parsed_query.translations,
251 256 query_vector=parsed_query.query_vector.tolist() if parsed_query.query_vector is not None else None,
252 257 domain=parsed_query.domain,
253   - is_simple_query=self.boolean_parser.is_simple_query(parsed_query.rewritten_query)
  258 + is_simple_query=True
254 259 )
255 260  
256 261 context.logger.info(
... ... @@ -271,38 +276,7 @@ class Searcher:
271 276 finally:
272 277 context.end_stage(RequestContextStage.QUERY_PARSING)
273 278  
274   - # Step 2: Boolean parsing
275   - context.start_stage(RequestContextStage.BOOLEAN_PARSING)
276   - try:
277   - query_node = None
278   - if self.boolean_parser.is_simple_query(parsed_query.rewritten_query):
279   - # Simple query
280   - query_text = parsed_query.rewritten_query
281   - context.logger.debug(
282   - f"简单查询 | 无布尔表达式",
283   - extra={'reqid': context.reqid, 'uid': context.uid}
284   - )
285   - else:
286   - # Complex boolean query
287   - query_node = self.boolean_parser.parse(parsed_query.rewritten_query)
288   - query_text = parsed_query.rewritten_query
289   - context.store_intermediate_result('query_node', query_node)
290   - context.store_intermediate_result('boolean_ast', str(query_node))
291   - context.logger.info(
292   - f"布尔表达式解析 | AST: {query_node}",
293   - extra={'reqid': context.reqid, 'uid': context.uid}
294   - )
295   - except Exception as e:
296   - context.set_error(e)
297   - context.logger.error(
298   - f"布尔表达式解析失败 | 错误: {str(e)}",
299   - extra={'reqid': context.reqid, 'uid': context.uid}
300   - )
301   - raise
302   - finally:
303   - context.end_stage(RequestContextStage.BOOLEAN_PARSING)
304   -
305   - # Step 3: Query building
  279 + # Step 2: Query building
306 280 context.start_stage(RequestContextStage.QUERY_BUILDING)
307 281 try:
308 282 # Generate tenant-specific index name
... ... @@ -314,7 +288,6 @@ class Searcher:
314 288 es_query = self.query_builder.build_query(
315 289 query_text=parsed_query.rewritten_query or parsed_query.query_normalized,
316 290 query_vector=parsed_query.query_vector if enable_embedding else None,
317   - query_node=query_node,
318 291 filters=filters,
319 292 range_filters=range_filters,
320 293 facet_configs=facets,
... ... @@ -529,7 +502,6 @@ class Searcher:
529 502 "translations": context.query_analysis.translations,
530 503 "has_vector": context.query_analysis.query_vector is not None,
531 504 "is_simple_query": context.query_analysis.is_simple_query,
532   - "boolean_ast": context.get_intermediate_result('boolean_ast'),
533 505 "domain": context.query_analysis.domain
534 506 },
535 507 "es_query": context.get_intermediate_result('es_query', {}),
... ... @@ -666,12 +638,18 @@ class Searcher:
666 638  
def get_domain_summary(self) -> Dict[str, Any]:
    """
    Get summary of dynamic text retrieval configuration.

    Returns:
        Dictionary with language-aware field information
    """
    query_cfg = self.config.query_config
    summary: Dict[str, Any] = {"mode": "dynamic_language_fields"}
    summary["multilingual_fields"] = query_cfg.multilingual_fields
    summary["shared_fields"] = query_cfg.shared_fields
    summary["core_multilingual_fields"] = query_cfg.core_multilingual_fields
    summary["field_boosts"] = self.config.field_boosts
    return summary
675 653  
676 654 def get_document(self, tenant_id: str, doc_id: str) -> Optional[Dict[str, Any]]:
677 655 """
... ...
suggestion/service.py
... ... @@ -33,6 +33,68 @@ class SuggestionService:
33 33 return primary
34 34 return index_languages[0]
35 35  
def _completion_suggest(
    self,
    index_name: str,
    query: str,
    lang: str,
    size: int,
) -> List[Dict[str, Any]]:
    """
    Query the ES completion suggester on the `completion.<lang>` field.

    Results are normalized to the same dict shape as regular search hits:
    "text"/"lang"/"score"/"rank_score"/"sources" plus language metadata keys.
    """
    field_name = f"completion.{lang}"
    request_body = {
        "suggest": {
            "s": {
                "prefix": query,
                "completion": {
                    "field": field_name,
                    "size": size,
                    "skip_duplicates": True,
                },
            }
        },
        "_source": [
            "text",
            "lang",
            "rank_score",
            "sources",
            "lang_source",
            "lang_confidence",
            "lang_conflict",
        ],
    }
    try:
        resp = self.es_client.client.search(index=index_name, body=request_body)
    except Exception as e:
        # completion is an optimization path; never hard-fail the whole endpoint
        logger.warning("Completion suggest failed for index=%s field=%s: %s", index_name, field_name, e)
        return []

    suggest_entries = (resp.get("suggest", {}) or {}).get("s", []) or []
    if not suggest_entries:
        return []

    results: List[Dict[str, Any]] = []
    for option in suggest_entries[0].get("options", []) or []:
        source = option.get("_source", {}) or {}
        results.append(
            {
                "text": source.get("text") or option.get("text"),
                "lang": source.get("lang") or lang,
                "score": option.get("_score", 0.0),
                "rank_score": source.get("rank_score"),
                "sources": source.get("sources", []),
                "lang_source": source.get("lang_source"),
                "lang_confidence": source.get("lang_confidence"),
                "lang_conflict": source.get("lang_conflict", False),
            }
        )
    return results
  97 +
36 98 def _search_products_for_suggestion(
37 99 self,
38 100 tenant_id: str,
... ... @@ -95,6 +157,17 @@ class SuggestionService:
95 157 start = time.time()
96 158 resolved_lang = self._resolve_language(tenant_id, language)
97 159 index_name = get_suggestion_index_name(tenant_id)
  160 + if not self.es_client.index_exists(index_name):
  161 + # On a fresh ES cluster the suggestion index might not be built yet.
  162 + # Keep endpoint stable for frontend autocomplete: return empty list instead of 500.
  163 + took_ms = int((time.time() - start) * 1000)
  164 + return {
  165 + "query": query,
  166 + "language": language,
  167 + "resolved_language": resolved_lang,
  168 + "suggestions": [],
  169 + "took_ms": took_ms,
  170 + }
98 171  
99 172 sat_field = f"sat.{resolved_lang}"
100 173 dsl = {
... ... @@ -139,14 +212,42 @@ class SuggestionService:
139 212 "lang_conflict",
140 213 ],
141 214 }
  215 + # Recall path A: bool_prefix on search_as_you_type
142 216 es_resp = self.es_client.search(index_name=index_name, body=dsl, size=size, from_=0)
143 217 hits = es_resp.get("hits", {}).get("hits", []) or []
144 218  
  219 + # Recall path B: completion suggester (optional optimization)
  220 + completion_items = self._completion_suggest(
  221 + index_name=index_name,
  222 + query=query,
  223 + lang=resolved_lang,
  224 + size=size,
  225 + )
  226 +
145 227 suggestions: List[Dict[str, Any]] = []
  228 + seen_text_norm: set = set()
  229 +
  230 + def _norm_text(v: Any) -> str:
  231 + return str(v or "").strip().lower()
  232 +
  233 + # Put completion results first (usually better prefix UX), then fill with sat results.
  234 + for item in completion_items:
  235 + text_val = item.get("text")
  236 + norm = _norm_text(text_val)
  237 + if not norm or norm in seen_text_norm:
  238 + continue
  239 + seen_text_norm.add(norm)
  240 + suggestions.append(dict(item))
  241 +
146 242 for hit in hits:
147 243 src = hit.get("_source", {}) or {}
  244 + text_val = src.get("text")
  245 + norm = _norm_text(text_val)
  246 + if not norm or norm in seen_text_norm:
  247 + continue
  248 + seen_text_norm.add(norm)
148 249 item = {
149   - "text": src.get("text"),
  250 + "text": text_val,
150 251 "lang": src.get("lang"),
151 252 "score": hit.get("_score", 0.0),
152 253 "rank_score": src.get("rank_score"),
... ... @@ -173,7 +274,7 @@ class SuggestionService:
173 274 "query": query,
174 275 "language": language,
175 276 "resolved_language": resolved_lang,
176   - "suggestions": suggestions,
  277 + "suggestions": suggestions[:size],
177 278 "took_ms": took_ms,
178 279 }
179 280  
... ...