Commit bd96ceadef76dc327afcd2d07a023f4902d2f9f5
1 parent 24e92141
1. 动态多语言字段与统一策略配置
- 配置改为“字段基名 + 动态语言后缀”方案,已不再依赖旧 `indexes`。
[config.yaml](/data/saas-search/config/config.yaml#L17)
- `search_fields` / `text_query_strategy` 已进入强校验与解析流程。
[config_loader.py](/data/saas-search/config/config_loader.py#L254)
2. 查询语言计划与翻译等待策略
- `QueryParser` 现在产出
`query_text_by_lang`、`search_langs`、`source_in_index_languages`。
[query_parser.py](/data/saas-search/query/query_parser.py#L41)
- 你要求的两种翻译路径都在:
- 源语言不在店铺 `index_languages`:`translate_multi_async` + 等待
future
- 源语言在 `index_languages`:`translate_multi(...,
async_mode=True)`,尽量走缓存
[query_parser.py](/data/saas-search/query/query_parser.py#L284)
3. ES 查询统一文本策略(无 AST 分支)
- 主召回按 `search_langs` 动态拼 `field.{lang}`,翻译语种做次权重
`should`。
[es_query_builder.py](/data/saas-search/search/es_query_builder.py#L454)
- 布尔 AST 路径已删除,仅保留统一文本策略。
[es_query_builder.py](/data/saas-search/search/es_query_builder.py#L185)
4. LanguageDetector 优化
- 从“拉丁字母默认英文”升级为:脚本优先 +
拉丁语系打分(词典/变音/后缀)。
[language_detector.py](/data/saas-search/query/language_detector.py#L68)
5. 布尔能力清理(补充)
- 已删除废弃模块:
[boolean_parser.py](/data/saas-search/search/boolean_parser.py)
- `search/__init__` 已无相关导出。
[search/__init__.py](/data/saas-search/search/__init__.py)
6. `indexes` 过时收口(补充)
- 兼容函数改为基于动态字段生成,不再依赖 `config.indexes`。
[utils.py](/data/saas-search/config/utils.py#L24)
- Admin 配置接口改为返回动态字段配置,不再暴露 `num_indexes`。
[admin.py](/data/saas-search/api/routes/admin.py#L52)
7. suggest
Showing 20 changed files with 691 additions and 752 deletions.
Show diff stats
api/models.py
| ... | ... | @@ -70,12 +70,12 @@ class SearchRequest(BaseModel): |
| 70 | 70 | """搜索请求模型(重构版)""" |
| 71 | 71 | |
| 72 | 72 | # 基础搜索参数 |
| 73 | - query: str = Field(..., description="搜索查询字符串,支持布尔表达式(AND, OR, RANK, ANDNOT)") | |
| 73 | + query: str = Field(..., description="搜索查询字符串(统一文本检索策略)") | |
| 74 | 74 | size: int = Field(10, ge=1, le=1000, description="返回结果数量") |
| 75 | 75 | from_: int = Field(0, ge=0, alias="from", description="分页偏移量") |
| 76 | - language: Literal["zh", "en"] = Field( | |
| 77 | - "zh", | |
| 78 | - description="响应语言:'zh'(中文)或 'en'(英文),用于选择 title/description/vendor 等多语言字段" | |
| 76 | + language: str = Field( | |
| 77 | + "en", | |
| 78 | + description="响应语言代码(如 zh/en/de/fr/ar/ru),用于多语言字段返回优先级" | |
| 79 | 79 | ) |
| 80 | 80 | |
| 81 | 81 | # 过滤器 - 精确匹配和多值匹配 | ... | ... |
api/result_formatter.py
| ... | ... | @@ -27,20 +27,32 @@ class ResultFormatter: |
| 27 | 27 | List of SpuResult objects |
| 28 | 28 | """ |
| 29 | 29 | results = [] |
| 30 | - lang = (language or "en").lower() | |
| 31 | - if lang not in ("zh", "en"): | |
| 32 | - lang = "en" | |
| 30 | + lang = (language or "en").lower().replace("-", "_") | |
| 31 | + lang_base = lang.split("_")[0] if lang else "en" | |
| 33 | 32 | |
| 34 | 33 | def pick_lang_field(src: Dict[str, Any], base: str) -> Optional[str]: |
| 35 | 34 | """从多语言对象字段中按语言选择一个值:{base: {"zh": "...", "en": "...", ...}}""" |
| 36 | 35 | obj = src.get(base) |
| 37 | 36 | if not isinstance(obj, dict): |
| 38 | 37 | return None |
| 39 | - zh_val = obj.get("zh") | |
| 40 | - en_val = obj.get("en") | |
| 41 | - if lang == "zh": | |
| 42 | - return zh_val or en_val | |
| 43 | - return en_val or zh_val | |
| 38 | + candidates = [ | |
| 39 | + lang, | |
| 40 | + lang_base, | |
| 41 | + "en", | |
| 42 | + "zh", | |
| 43 | + ] | |
| 44 | + seen = set() | |
| 45 | + for cand in candidates: | |
| 46 | + if not cand or cand in seen: | |
| 47 | + continue | |
| 48 | + seen.add(cand) | |
| 49 | + value = obj.get(cand) | |
| 50 | + if value: | |
| 51 | + return value | |
| 52 | + for value in obj.values(): | |
| 53 | + if value: | |
| 54 | + return value | |
| 55 | + return None | |
| 44 | 56 | |
| 45 | 57 | for hit in es_hits: |
| 46 | 58 | source = hit.get('_source', {}) |
| ... | ... | @@ -434,4 +446,3 @@ class ResultFormatter: |
| 434 | 446 | """ |
| 435 | 447 | # TODO: Implement related search generation logic |
| 436 | 448 | return [] |
| 437 | - | ... | ... |
api/routes/admin.py
| ... | ... | @@ -52,7 +52,9 @@ async def get_configuration(): |
| 52 | 52 | return { |
| 53 | 53 | "es_index_name": config.es_index_name, |
| 54 | 54 | "num_field_boosts": len(config.field_boosts), |
| 55 | - "num_indexes": len(config.indexes), | |
| 55 | + "multilingual_fields": config.query_config.multilingual_fields, | |
| 56 | + "shared_fields": config.query_config.shared_fields, | |
| 57 | + "core_multilingual_fields": config.query_config.core_multilingual_fields, | |
| 56 | 58 | "supported_languages": config.query_config.supported_languages, |
| 57 | 59 | "ranking_expression": config.ranking.expression, |
| 58 | 60 | "spu_enabled": config.spu_config.enabled | ... | ... |
api/routes/search.py
| ... | ... | @@ -37,7 +37,7 @@ async def search(request: SearchRequest, http_request: Request): |
| 37 | 37 | |
| 38 | 38 | Supports: |
| 39 | 39 | - Multi-language query processing |
| 40 | - - Boolean operators (AND, OR, RANK, ANDNOT) | |
| 40 | + - Unified text retrieval strategy (no boolean AST parsing) | |
| 41 | 41 | - Semantic search with embeddings |
| 42 | 42 | - Custom ranking functions |
| 43 | 43 | - Exact match filters and range filters | ... | ... |
config/config.yaml
| ... | ... | @@ -12,71 +12,20 @@ es_settings: |
| 12 | 12 | refresh_interval: "30s" |
| 13 | 13 | |
| 14 | 14 | # 字段权重配置(用于搜索时的字段boost) |
| 15 | -# 只配置权重,不配置字段结构(字段结构由 mappings/search_products.json 定义) | |
| 15 | +# 统一按“字段基名”配置;查询时按 search_langs 动态拼接 .{lang}。 | |
| 16 | +# 若需要按某个语言单独调权,也可以加显式 key(例如 title.de: 3.2)。 | |
| 16 | 17 | field_boosts: |
| 17 | - # 文本相关性字段 | |
| 18 | - "title.zh": 3.0 | |
| 19 | - "brief.zh": 1.5 | |
| 20 | - "description.zh": 1.0 | |
| 21 | - "vendor.zh": 1.5 | |
| 22 | - "title.en": 3.0 | |
| 23 | - "brief.en": 1.5 | |
| 24 | - "description.en": 1.0 | |
| 25 | - "vendor.en": 1.5 | |
| 26 | - | |
| 27 | - # 分类相关字段 | |
| 28 | - "category_path.zh": 1.5 | |
| 29 | - "category_name_text.zh": 1.5 | |
| 30 | - "category_path.en": 1.5 | |
| 31 | - "category_name_text.en": 1.5 | |
| 32 | - | |
| 33 | - # 标签和属性值字段 | |
| 18 | + title: 3.0 | |
| 19 | + brief: 1.5 | |
| 20 | + description: 1.0 | |
| 21 | + vendor: 1.5 | |
| 22 | + category_path: 1.5 | |
| 23 | + category_name_text: 1.5 | |
| 34 | 24 | tags: 1.0 |
| 35 | 25 | option1_values: 0.5 |
| 36 | 26 | option2_values: 0.5 |
| 37 | 27 | option3_values: 0.5 |
| 38 | 28 | |
| 39 | -# 搜索域配置(Query Domains) | |
| 40 | -# 定义不同的搜索策略,指定哪些字段组合在一起搜索 | |
| 41 | -indexes: | |
| 42 | - - name: "default" | |
| 43 | - label: "默认搜索" | |
| 44 | - fields: | |
| 45 | - - "title.zh" | |
| 46 | - - "brief.zh" | |
| 47 | - - "description.zh" | |
| 48 | - - "vendor.zh" | |
| 49 | - - "tags" | |
| 50 | - - "category_path.zh" | |
| 51 | - - "category_name_text.zh" | |
| 52 | - - "option1_values" | |
| 53 | - boost: 1.0 | |
| 54 | - | |
| 55 | - - name: "title" | |
| 56 | - label: "标题搜索" | |
| 57 | - fields: | |
| 58 | - - "title.zh" | |
| 59 | - boost: 2.0 | |
| 60 | - | |
| 61 | - - name: "vendor" | |
| 62 | - label: "品牌搜索" | |
| 63 | - fields: | |
| 64 | - - "vendor.zh" | |
| 65 | - boost: 1.5 | |
| 66 | - | |
| 67 | - - name: "category" | |
| 68 | - label: "类目搜索" | |
| 69 | - fields: | |
| 70 | - - "category_path.zh" | |
| 71 | - - "category_name_text.zh" | |
| 72 | - boost: 1.5 | |
| 73 | - | |
| 74 | - - name: "tags" | |
| 75 | - label: "标签搜索" | |
| 76 | - fields: | |
| 77 | - - "tags" | |
| 78 | - boost: 1.0 | |
| 79 | - | |
| 80 | 29 | # Query Configuration(查询配置) |
| 81 | 30 | query_config: |
| 82 | 31 | # 支持的语言 |
| ... | ... | @@ -89,6 +38,41 @@ query_config: |
| 89 | 38 | enable_text_embedding: true |
| 90 | 39 | enable_query_rewrite: true |
| 91 | 40 | |
| 41 | + # 动态多语言检索字段配置 | |
| 42 | + # multilingual_fields 会被拼成 title.{lang}/brief.{lang}/... 形式; | |
| 43 | + # shared_fields 为无语言后缀字段。 | |
| 44 | + search_fields: | |
| 45 | + multilingual_fields: | |
| 46 | + - "title" | |
| 47 | + - "brief" | |
| 48 | + - "description" | |
| 49 | + - "vendor" | |
| 50 | + - "category_path" | |
| 51 | + - "category_name_text" | |
| 52 | + shared_fields: | |
| 53 | + - "tags" | |
| 54 | + - "option1_values" | |
| 55 | + - "option2_values" | |
| 56 | + - "option3_values" | |
| 57 | + core_multilingual_fields: | |
| 58 | + - "title" | |
| 59 | + - "brief" | |
| 60 | + - "vendor" | |
| 61 | + - "category_name_text" | |
| 62 | + | |
| 63 | + # 统一文本召回策略(主查询 + 翻译查询 + phrase/keywords) | |
| 64 | + text_query_strategy: | |
| 65 | + base_minimum_should_match: "75%" | |
| 66 | + translation_minimum_should_match: "75%" | |
| 67 | + translation_boost: 0.4 | |
| 68 | + translation_boost_when_source_missing: 1.0 | |
| 69 | + source_boost_when_missing: 0.6 | |
| 70 | + original_query_fallback_boost_when_translation_missing: 0.2 | |
| 71 | + keywords_boost: 0.1 | |
| 72 | + enable_phrase_query: true | |
| 73 | + tie_breaker_base_query: 0.9 | |
| 74 | + tie_breaker_keywords: 0.9 | |
| 75 | + | |
| 92 | 76 | # Embedding字段名称 |
| 93 | 77 | text_embedding_field: "title_embedding" |
| 94 | 78 | image_embedding_field: null | ... | ... |
config/config_loader.py
| ... | ... | @@ -57,6 +57,28 @@ class QueryConfig: |
| 57 | 57 | |
| 58 | 58 | # KNN boost configuration |
| 59 | 59 | knn_boost: float = 0.25 # Boost value for KNN (embedding recall) |
| 60 | + | |
| 61 | + # Dynamic text fields for multi-language retrieval | |
| 62 | + multilingual_fields: List[str] = field( | |
| 63 | + default_factory=lambda: ["title", "brief", "description", "vendor", "category_path", "category_name_text"] | |
| 64 | + ) | |
| 65 | + shared_fields: List[str] = field( | |
| 66 | + default_factory=lambda: ["tags", "option1_values", "option2_values", "option3_values"] | |
| 67 | + ) | |
| 68 | + core_multilingual_fields: List[str] = field( | |
| 69 | + default_factory=lambda: ["title", "brief", "vendor", "category_name_text"] | |
| 70 | + ) | |
| 71 | + | |
| 72 | + # Unified text strategy tuning | |
| 73 | + base_minimum_should_match: str = "75%" | |
| 74 | + translation_minimum_should_match: str = "75%" | |
| 75 | + translation_boost: float = 0.4 | |
| 76 | + translation_boost_when_source_missing: float = 1.0 | |
| 77 | + source_boost_when_missing: float = 0.6 | |
| 78 | + keywords_boost: float = 0.1 | |
| 79 | + enable_phrase_query: bool = True | |
| 80 | + tie_breaker_base_query: float = 0.9 | |
| 81 | + tie_breaker_keywords: float = 0.9 | |
| 60 | 82 | |
| 61 | 83 | |
| 62 | 84 | @dataclass |
| ... | ... | @@ -102,7 +124,7 @@ class SearchConfig: |
| 102 | 124 | # 字段权重配置(用于搜索) |
| 103 | 125 | field_boosts: Dict[str, float] |
| 104 | 126 | |
| 105 | - # Index structure (query domains) | |
| 127 | + # Legacy index domains (deprecated; kept for compatibility) | |
| 106 | 128 | indexes: List[IndexConfig] |
| 107 | 129 | |
| 108 | 130 | # Query processing |
| ... | ... | @@ -218,7 +240,7 @@ class ConfigLoader: |
| 218 | 240 | if not isinstance(field_boosts, dict): |
| 219 | 241 | raise ConfigurationError("field_boosts must be a dictionary") |
| 220 | 242 | |
| 221 | - # Parse indexes | |
| 243 | + # Parse indexes (deprecated; compatibility only) | |
| 222 | 244 | indexes = [] |
| 223 | 245 | for index_data in config_data.get("indexes", []): |
| 224 | 246 | indexes.append(self._parse_index_config(index_data)) |
| ... | ... | @@ -228,6 +250,8 @@ class ConfigLoader: |
| 228 | 250 | services_data = config_data.get("services", {}) if isinstance(config_data.get("services", {}), dict) else {} |
| 229 | 251 | rewrite_dictionary = self._load_rewrite_dictionary() |
| 230 | 252 | embedding_thresholds = query_config_data.get("embedding_disable_thresholds", {}) |
| 253 | + search_fields_cfg = query_config_data.get("search_fields", {}) | |
| 254 | + text_strategy_cfg = query_config_data.get("text_query_strategy", {}) | |
| 231 | 255 | |
| 232 | 256 | query_config = QueryConfig( |
| 233 | 257 | supported_languages=query_config_data.get("supported_languages") or ["zh", "en"], |
| ... | ... | @@ -245,7 +269,30 @@ class ConfigLoader: |
| 245 | 269 | embedding_disable_chinese_char_limit=embedding_thresholds.get("chinese_char_limit", 4), |
| 246 | 270 | embedding_disable_english_word_limit=embedding_thresholds.get("english_word_limit", 3), |
| 247 | 271 | source_fields=query_config_data.get("source_fields"), |
| 248 | - knn_boost=query_config_data.get("knn_boost", 0.25) | |
| 272 | + knn_boost=query_config_data.get("knn_boost", 0.25), | |
| 273 | + multilingual_fields=search_fields_cfg.get( | |
| 274 | + "multilingual_fields", | |
| 275 | + ["title", "brief", "description", "vendor", "category_path", "category_name_text"], | |
| 276 | + ), | |
| 277 | + shared_fields=search_fields_cfg.get( | |
| 278 | + "shared_fields", | |
| 279 | + ["tags", "option1_values", "option2_values", "option3_values"], | |
| 280 | + ), | |
| 281 | + core_multilingual_fields=search_fields_cfg.get( | |
| 282 | + "core_multilingual_fields", | |
| 283 | + ["title", "brief", "vendor", "category_name_text"], | |
| 284 | + ), | |
| 285 | + base_minimum_should_match=str(text_strategy_cfg.get("base_minimum_should_match", "75%")), | |
| 286 | + translation_minimum_should_match=str(text_strategy_cfg.get("translation_minimum_should_match", "75%")), | |
| 287 | + translation_boost=float(text_strategy_cfg.get("translation_boost", 0.4)), | |
| 288 | + translation_boost_when_source_missing=float( | |
| 289 | + text_strategy_cfg.get("translation_boost_when_source_missing", 1.0) | |
| 290 | + ), | |
| 291 | + source_boost_when_missing=float(text_strategy_cfg.get("source_boost_when_missing", 0.6)), | |
| 292 | + keywords_boost=float(text_strategy_cfg.get("keywords_boost", 0.1)), | |
| 293 | + enable_phrase_query=bool(text_strategy_cfg.get("enable_phrase_query", True)), | |
| 294 | + tie_breaker_base_query=float(text_strategy_cfg.get("tie_breaker_base_query", 0.9)), | |
| 295 | + tie_breaker_keywords=float(text_strategy_cfg.get("tie_breaker_keywords", 0.9)), | |
| 249 | 296 | ) |
| 250 | 297 | |
| 251 | 298 | # Parse ranking config |
| ... | ... | @@ -336,10 +383,7 @@ class ConfigLoader: |
| 336 | 383 | elif boost < 0: |
| 337 | 384 | errors.append(f"field_boosts['{field_name}']: boost must be non-negative") |
| 338 | 385 | |
| 339 | - # Validate indexes | |
| 340 | - if not config.indexes: | |
| 341 | - errors.append("At least one index domain must be defined") | |
| 342 | - | |
| 386 | + # Validate indexes (deprecated, optional) | |
| 343 | 387 | index_names = set() |
| 344 | 388 | for index in config.indexes: |
| 345 | 389 | # Check for duplicate index names |
| ... | ... | @@ -365,6 +409,39 @@ class ConfigLoader: |
| 365 | 409 | f"Default language '{config.query_config.default_language}' " |
| 366 | 410 | f"not in supported languages: {config.query_config.supported_languages}" |
| 367 | 411 | ) |
| 412 | + | |
| 413 | + # Validate dynamic search fields | |
| 414 | + def _validate_str_list(name: str, values: List[str]) -> None: | |
| 415 | + if not isinstance(values, list) or not values: | |
| 416 | + errors.append(f"query_config.{name} must be a non-empty list[str]") | |
| 417 | + return | |
| 418 | + for i, val in enumerate(values): | |
| 419 | + if not isinstance(val, str) or not val.strip(): | |
| 420 | + errors.append(f"query_config.{name}[{i}] must be a non-empty string") | |
| 421 | + | |
| 422 | + _validate_str_list("multilingual_fields", config.query_config.multilingual_fields) | |
| 423 | + _validate_str_list("shared_fields", config.query_config.shared_fields) | |
| 424 | + _validate_str_list("core_multilingual_fields", config.query_config.core_multilingual_fields) | |
| 425 | + | |
| 426 | + core_set = set(config.query_config.core_multilingual_fields) | |
| 427 | + multi_set = set(config.query_config.multilingual_fields) | |
| 428 | + if not core_set.issubset(multi_set): | |
| 429 | + errors.append("query_config.core_multilingual_fields must be subset of multilingual_fields") | |
| 430 | + | |
| 431 | + # Validate text query strategy numbers | |
| 432 | + for name in ( | |
| 433 | + "translation_boost", | |
| 434 | + "translation_boost_when_source_missing", | |
| 435 | + "source_boost_when_missing", | |
| 436 | + "keywords_boost", | |
| 437 | + "tie_breaker_base_query", | |
| 438 | + "tie_breaker_keywords", | |
| 439 | + ): | |
| 440 | + value = getattr(config.query_config, name, None) | |
| 441 | + if not isinstance(value, (int, float)): | |
| 442 | + errors.append(f"query_config.{name} must be a number") | |
| 443 | + elif value < 0: | |
| 444 | + errors.append(f"query_config.{name} must be non-negative") | |
| 368 | 445 | |
| 369 | 446 | # Validate source_fields tri-state semantics |
| 370 | 447 | source_fields = config.query_config.source_fields |
| ... | ... | @@ -409,7 +486,23 @@ class ConfigLoader: |
| 409 | 486 | "chinese_char_limit": config.query_config.embedding_disable_chinese_char_limit, |
| 410 | 487 | "english_word_limit": config.query_config.embedding_disable_english_word_limit |
| 411 | 488 | }, |
| 412 | - "source_fields": config.query_config.source_fields | |
| 489 | + "source_fields": config.query_config.source_fields, | |
| 490 | + "search_fields": { | |
| 491 | + "multilingual_fields": config.query_config.multilingual_fields, | |
| 492 | + "shared_fields": config.query_config.shared_fields, | |
| 493 | + "core_multilingual_fields": config.query_config.core_multilingual_fields, | |
| 494 | + }, | |
| 495 | + "text_query_strategy": { | |
| 496 | + "base_minimum_should_match": config.query_config.base_minimum_should_match, | |
| 497 | + "translation_minimum_should_match": config.query_config.translation_minimum_should_match, | |
| 498 | + "translation_boost": config.query_config.translation_boost, | |
| 499 | + "translation_boost_when_source_missing": config.query_config.translation_boost_when_source_missing, | |
| 500 | + "source_boost_when_missing": config.query_config.source_boost_when_missing, | |
| 501 | + "keywords_boost": config.query_config.keywords_boost, | |
| 502 | + "enable_phrase_query": config.query_config.enable_phrase_query, | |
| 503 | + "tie_breaker_base_query": config.query_config.tie_breaker_base_query, | |
| 504 | + "tie_breaker_keywords": config.query_config.tie_breaker_keywords, | |
| 505 | + } | |
| 413 | 506 | } |
| 414 | 507 | |
| 415 | 508 | return { | ... | ... |
config/utils.py
| 1 | -""" | |
| 2 | -Configuration utility functions. | |
| 3 | - | |
| 4 | -Helper functions for working with SearchConfig objects. | |
| 5 | -""" | |
| 1 | +"""Configuration helper functions for dynamic multi-language search fields.""" | |
| 6 | 2 | |
| 7 | 3 | from typing import Dict, List |
| 8 | 4 | from .config_loader import SearchConfig |
| 9 | 5 | |
| 10 | 6 | |
| 7 | +def _format_field_with_boost(field_name: str, boost: float) -> str: | |
| 8 | + if abs(float(boost) - 1.0) < 1e-9: | |
| 9 | + return field_name | |
| 10 | + return f"{field_name}^{boost}" | |
| 11 | + | |
| 12 | + | |
| 13 | +def _get_boost(config: SearchConfig, base_field: str, language: str = "") -> float: | |
| 14 | + lang = (language or "").strip().lower() | |
| 15 | + if lang: | |
| 16 | + lang_key = f"{base_field}.{lang}" | |
| 17 | + if lang_key in config.field_boosts: | |
| 18 | + return float(config.field_boosts[lang_key]) | |
| 19 | + if base_field in config.field_boosts: | |
| 20 | + return float(config.field_boosts[base_field]) | |
| 21 | + return 1.0 | |
| 22 | + | |
| 23 | + | |
| 11 | 24 | def get_match_fields_for_index(config: SearchConfig, index_name: str = "default") -> List[str]: |
| 12 | 25 | """ |
| 13 | - Generate match fields list with boost from field_boosts. | |
| 14 | - | |
| 15 | - Args: | |
| 16 | - config: SearchConfig instance | |
| 17 | - index_name: Name of the index domain (default: "default") | |
| 18 | - | |
| 19 | - Returns: | |
| 20 | - List of field names with boost, e.g., ["title.zh^3.0", "brief.zh^1.5"] | |
| 26 | + Deprecated compatibility wrapper. | |
| 27 | + | |
| 28 | + `indexes` is no longer used by runtime query building. This function now returns | |
| 29 | + dynamic match fields for the default language based on query_config.search_fields. | |
| 21 | 30 | """ |
| 22 | - # Find the index config | |
| 23 | - index_config = None | |
| 24 | - for idx in config.indexes: | |
| 25 | - if idx.name == index_name: | |
| 26 | - index_config = idx | |
| 27 | - break | |
| 28 | - | |
| 29 | - if not index_config: | |
| 30 | - return [] | |
| 31 | - | |
| 32 | - # Generate match fields with boost | |
| 33 | - match_fields = [] | |
| 34 | - for field_name in index_config.fields: | |
| 35 | - # Get field boost from field_boosts dictionary | |
| 36 | - field_boost = config.field_boosts.get(field_name, 1.0) | |
| 37 | - | |
| 38 | - # Combine index boost and field boost | |
| 39 | - total_boost = index_config.boost * field_boost | |
| 40 | - | |
| 41 | - if total_boost != 1.0: | |
| 42 | - match_fields.append(f"{field_name}^{total_boost}") | |
| 43 | - else: | |
| 44 | - match_fields.append(field_name) | |
| 45 | - | |
| 31 | + del index_name | |
| 32 | + lang = (config.query_config.default_language or "en").strip().lower() | |
| 33 | + match_fields: List[str] = [] | |
| 34 | + | |
| 35 | + for base_field in config.query_config.multilingual_fields: | |
| 36 | + field_name = f"{base_field}.{lang}" | |
| 37 | + match_fields.append(_format_field_with_boost(field_name, _get_boost(config, base_field, lang))) | |
| 38 | + | |
| 39 | + for shared_field in config.query_config.shared_fields: | |
| 40 | + match_fields.append(_format_field_with_boost(shared_field, _get_boost(config, shared_field))) | |
| 41 | + | |
| 46 | 42 | return match_fields |
| 47 | 43 | |
| 48 | 44 | |
| 49 | 45 | def get_domain_fields(config: SearchConfig) -> Dict[str, List[str]]: |
| 50 | 46 | """ |
| 51 | - Generate domain-specific match fields from all index configs. | |
| 52 | - | |
| 53 | - Args: | |
| 54 | - config: SearchConfig instance | |
| 55 | - | |
| 47 | + Get dynamic domain fields for compatibility with old diagnostics endpoints. | |
| 48 | + | |
| 56 | 49 | Returns: |
| 57 | - Dictionary mapping domain name to list of match fields | |
| 50 | + A single `default` domain entry generated from dynamic search_fields. | |
| 58 | 51 | """ |
| 59 | - domain_fields = {} | |
| 60 | - for index_config in config.indexes: | |
| 61 | - domain_fields[index_config.name] = get_match_fields_for_index(config, index_config.name) | |
| 62 | - return domain_fields | |
| 52 | + return {"default": get_match_fields_for_index(config)} | ... | ... |
docs/DEVELOPER_GUIDE.md
| ... | ... | @@ -105,7 +105,7 @@ MySQL (店匠 SPU/SKU) |
| 105 | 105 | api/ # FastAPI 应用:搜索路由、管理路由、索引路由(indexer_app) |
| 106 | 106 | config/ # 配置加载与解析:config.yaml、services、env |
| 107 | 107 | indexer/ # MySQL → ES 管道:mapping、transformer、bulk、增量、build-docs |
| 108 | -query/ # 查询解析:规范化、改写、翻译、embedding 调用、布尔解析 | |
| 108 | +query/ # 查询解析:规范化、改写、翻译、embedding 调用、语言计划生成 | |
| 109 | 109 | search/ # 搜索执行:多语言查询构建、Searcher、重排客户端、分数融合 |
| 110 | 110 | embeddings/ # 向量化:服务端(server)、文本/图像后端、协议与配置 |
| 111 | 111 | reranker/ # 重排:服务端(server)、后端(backends)、配置 |
| ... | ... | @@ -144,7 +144,7 @@ docs/ # 文档(含本指南) |
| 144 | 144 | |
| 145 | 145 | ### 4.4 query |
| 146 | 146 | |
| 147 | -- **职责**:查询解析与预处理:规范化、语言检测、改写(词典)、翻译、文本向量化、布尔表达式解析;输出可供 Searcher 使用的结构化查询信息。 | |
| 147 | +- **职责**:查询解析与预处理:规范化、语言检测、改写(词典)、翻译、文本向量化;输出可供 Searcher 使用的结构化查询信息(含 search_langs 语言计划)。 | |
| 148 | 148 | - **原则**:翻译/向量通过 `providers` 获取,不直接依赖具体服务 URL 或实现;支持按配置关闭翻译/向量(如短查询、typing 场景)。 |
| 149 | 149 | |
| 150 | 150 | ### 4.5 search |
| ... | ... | @@ -241,7 +241,7 @@ docs/ # 文档(含本指南) |
| 241 | 241 | |
| 242 | 242 | ### 6.1 主配置文件 |
| 243 | 243 | |
| 244 | -- **config/config.yaml**:搜索行为(field_boosts、indexes、query_config、ranking、function_score、rerank 融合参数)、SPU 配置、**services**(翻译/向量/重排的 provider 与 backends)、tenant_config 等。 | |
| 244 | +- **config/config.yaml**:搜索行为(field_boosts、query_config.search_fields、query_config.text_query_strategy、ranking、function_score、rerank 融合参数)、SPU 配置、**services**(翻译/向量/重排的 provider 与 backends)、tenant_config 等。 | |
| 245 | 245 | - **.env**:敏感信息与部署态变量(DB、ES、Redis、API Key、端口等);不提交敏感值,可提供 `.env.example` 模板。 |
| 246 | 246 | |
| 247 | 247 | ### 6.2 services 块结构(能力统一约定) | ... | ... |
docs/QUICKSTART.md
| ... | ... | @@ -329,7 +329,7 @@ saas-search 以 MySQL 中的店匠标准表为权威数据源: |
| 329 | 329 | |
| 330 | 330 | - **统一索引结构**:所有租户使用同一套 mapping(按租户数据分索引名 + 文档内 `tenant_id` 隔离) |
| 331 | 331 | - **SPU 级索引**:每个文档是一个 SPU,包含嵌套 `skus`、`specifications` |
| 332 | -- **配置文件驱动**:搜索权重、搜索域、重排融合、provider 全在 `config/config.yaml`,不再以“硬编码配置”为主 | |
| 332 | +- **配置文件驱动**:搜索权重、动态多语言字段、重排融合、provider 全在 `config/config.yaml`,不再以“硬编码配置”为主 | |
| 333 | 333 | |
| 334 | 334 | ### 2.2 索引结构(Mapping) |
| 335 | 335 | |
| ... | ... | @@ -338,7 +338,7 @@ saas-search 以 MySQL 中的店匠标准表为权威数据源: |
| 338 | 338 | 核心字段可分为: |
| 339 | 339 | |
| 340 | 340 | - 标识字段:`tenant_id`, `spu_id` |
| 341 | -- 多语言文本:`title.zh/en`, `brief.zh/en`, `description.zh/en`, `vendor.zh/en`, `category_path.zh/en`, `category_name_text.zh/en` | |
| 341 | +- 多语言文本:`title.<lang>`, `brief.<lang>`, `description.<lang>`, `vendor.<lang>`, `category_path.<lang>`, `category_name_text.<lang>` | |
| 342 | 342 | - 类目过滤:`category1_name`, `category2_name`, `category3_name` 等 |
| 343 | 343 | - 规格/变体:`specifications`(nested)、`skus`(nested) |
| 344 | 344 | - 价格库存:`min_price`, `max_price`, `total_inventory` 等 |
| ... | ... | @@ -346,8 +346,9 @@ saas-search 以 MySQL 中的店匠标准表为权威数据源: |
| 346 | 346 | |
| 347 | 347 | ### 2.3 查询、权重、排序(`config/config.yaml`) |
| 348 | 348 | |
| 349 | -- `field_boosts`:字段权重(如标题、品牌、类目) | |
| 350 | -- `indexes`:搜索域(default/title/vendor/category/tags) | |
| 349 | +- `field_boosts`:字段权重(统一按字段基名配置,运行时按 `.{lang}` 动态组装) | |
| 350 | +- `query_config.search_fields`:动态多语言检索字段(multilingual/shared/core) | |
| 351 | +- `query_config.text_query_strategy`:文本召回策略参数(minimum_should_match、翻译boost等) | |
| 351 | 352 | - `query_config`:语言、embedding 开关、source_fields、knn_boost、翻译提示词等 |
| 352 | 353 | - `ranking.expression`:融合表达式(例如 `bm25() + 0.25*text_embedding_relevance()`) |
| 353 | 354 | - `function_score`:ES 层加权函数 |
| ... | ... | @@ -364,7 +365,7 @@ saas-search 以 MySQL 中的店匠标准表为权威数据源: |
| 364 | 365 | | 修改项 | 操作 | |
| 365 | 366 | |--------|------| |
| 366 | 367 | | 索引结构(mapping) | 修改 `mappings/search_products.json` → `./scripts/create_tenant_index.sh <tenant_id>` → 重新导入 | |
| 367 | -| 搜索域/权重/排序/重排 | 修改 `config/config.yaml` 对应块 | | |
| 368 | +| 搜索字段/权重/排序/重排 | 修改 `config/config.yaml` 对应块 | | |
| 368 | 369 | | provider 与服务 URL | 修改 `config/config.yaml` 的 `services` 块,或用环境变量覆盖 | |
| 369 | 370 | |
| 370 | 371 | --- | ... | ... |
docs/搜索API对接指南.md
| ... | ... | @@ -18,10 +18,9 @@ |
| 18 | 18 | - 3.3 [过滤器详解](#33-过滤器详解) |
| 19 | 19 | - 3.4 [分面配置](#34-分面配置) |
| 20 | 20 | - 3.5 [SKU筛选维度](#35-sku筛选维度) |
| 21 | - - 3.6 [布尔表达式语法](#36-布尔表达式语法) | |
| 22 | - - 3.7 [搜索建议接口](#37-搜索建议接口) | |
| 23 | - - 3.8 [即时搜索接口](#38-即时搜索接口) | |
| 24 | - - 3.9 [获取单个文档](#39-获取单个文档) | |
| 21 | + - 3.6 [搜索建议接口](#37-搜索建议接口) | |
| 22 | + - 3.7 [即时搜索接口](#38-即时搜索接口) | |
| 23 | + - 3.8 [获取单个文档](#39-获取单个文档) | |
| 25 | 24 | |
| 26 | 25 | 4. [响应格式说明](#响应格式说明) |
| 27 | 26 | - 4.1 [标准响应结构](#41-标准响应结构) |
| ... | ... | @@ -56,8 +55,7 @@ |
| 56 | 55 | - 8.3 [分面搜索](#83-分面搜索) |
| 57 | 56 | - 8.4 [规格过滤与分面](#84-规格过滤与分面) |
| 58 | 57 | - 8.5 [SKU筛选](#85-sku筛选) |
| 59 | - - 8.6 [布尔表达式搜索](#86-布尔表达式搜索) | |
| 60 | - - 8.7 [分页查询](#87-分页查询) | |
| 58 | + - 8.6 [分页查询](#87-分页查询) | |
| 61 | 59 | |
| 62 | 60 | 9. [数据模型](#9-数据模型) |
| 63 | 61 | - 9.1 [商品字段定义](#91-商品字段定义) |
| ... | ... | @@ -167,7 +165,7 @@ curl -X POST "http://43.166.252.75:6002/search/" \ |
| 167 | 165 | ### 3.1 接口信息 |
| 168 | 166 | |
| 169 | 167 | - **端点**: `POST /search/` |
| 170 | -- **描述**: 执行文本搜索查询,支持多语言、布尔表达式、过滤器和分面搜索 | |
| 168 | +- **描述**: 执行文本搜索查询,支持多语言、过滤器和分面搜索 | |
| 171 | 169 | - **租户标识**:`tenant_id` 通过 HTTP 请求头 **`X-Tenant-ID`** 传递(推荐);也可通过 URL query 参数 **`tenant_id`** 传递。**不要放在请求体中。** |
| 172 | 170 | |
| 173 | 171 | **请求示例(推荐)**: |
| ... | ... | @@ -210,7 +208,7 @@ response = requests.post(url, headers=headers, json={"query": "芭比娃娃"}) |
| 210 | 208 | |
| 211 | 209 | | 参数 | 类型 | 必填 | 默认值 | 说明 | |
| 212 | 210 | |------|------|------|--------|------| |
| 213 | -| `query` | string | Y | - | 搜索查询字符串,支持布尔表达式(AND, OR, RANK, ANDNOT) | | |
| 211 | +| `query` | string | Y | - | 搜索查询字符串(统一文本检索策略) | | |
| 214 | 212 | | `size` | integer | N | 10 | 返回结果数量(1-100) | |
| 215 | 213 | | `from` | integer | N | 0 | 分页偏移量(用于分页) | |
| 216 | 214 | | `language` | string | N | "zh" | 返回语言:`zh`(中文)或 `en`(英文)。后端会根据此参数选择对应的中英文字段返回 | |
| ... | ... | @@ -544,36 +542,6 @@ response = requests.post(url, headers=headers, json={"query": "芭比娃娃"}) |
| 544 | 542 | } |
| 545 | 543 | ``` |
| 546 | 544 | |
| 547 | -### 3.6 布尔表达式语法 | |
| 548 | - | |
| 549 | -搜索查询支持布尔表达式,提供更灵活的搜索能力。 | |
| 550 | - | |
| 551 | -**支持的操作符**: | |
| 552 | - | |
| 553 | -| 操作符 | 描述 | 示例 | | |
| 554 | -|--------|------|------| | |
| 555 | -| `AND` | 所有词必须匹配 | `玩具 AND 乐高` | | |
| 556 | -| `OR` | 任意词匹配 | `芭比 OR 娃娃` | | |
| 557 | -| `ANDNOT` | 排除特定词 | `玩具 ANDNOT 电动` | | |
| 558 | -| `RANK` | 排序加权(不强制匹配) | `玩具 RANK 乐高` | | |
| 559 | -| `()` | 分组 | `玩具 AND (乐高 OR 芭比)` | | |
| 560 | - | |
| 561 | -**操作符优先级**(从高到低): | |
| 562 | -1. `()` - 括号 | |
| 563 | -2. `ANDNOT` - 排除 | |
| 564 | -3. `AND` - 与 | |
| 565 | -4. `OR` - 或 | |
| 566 | -5. `RANK` - 排序 | |
| 567 | - | |
| 568 | -**示例**: | |
| 569 | -``` | |
| 570 | -"芭比娃娃" // 简单查询 | |
| 571 | -"玩具 AND 乐高" // AND 查询 | |
| 572 | -"芭比 OR 娃娃" // OR 查询 | |
| 573 | -"玩具 ANDNOT 电动" // 排除查询 | |
| 574 | -"玩具 AND (乐高 OR 芭比)" // 复杂查询 | |
| 575 | -``` | |
| 576 | - | |
| 577 | 545 | ### 3.7 搜索建议接口 |
| 578 | 546 | |
| 579 | 547 | - **端点**: `GET /search/suggestions` |
| ... | ... | @@ -2020,17 +1988,6 @@ curl "http://localhost:6006/health" |
| 2020 | 1988 | - 每个SPU下,每种颜色只会返回第一个SKU |
| 2021 | 1989 | - 如果维度不匹配,返回所有SKU(不进行过滤) |
| 2022 | 1990 | |
| 2023 | -### 8.6 布尔表达式搜索 | |
| 2024 | - | |
| 2025 | -**需求**: 搜索包含"手机"和"智能"的商品,排除"二手" | |
| 2026 | - | |
| 2027 | -```json | |
| 2028 | -{ | |
| 2029 | - "query": "手机 AND 智能 ANDNOT 二手", | |
| 2030 | - "size": 20 | |
| 2031 | -} | |
| 2032 | -``` | |
| 2033 | - | |
| 2034 | 1991 | ### 8.7 分页查询 |
| 2035 | 1992 | |
| 2036 | 1993 | **需求**: 获取第2页结果(每页20条) | ... | ... |
docs/搜索API速查表.md
| ... | ... | @@ -165,18 +165,6 @@ POST /search/ |
| 165 | 165 | |
| 166 | 166 | --- |
| 167 | 167 | |
| 168 | -## 布尔表达式 | |
| 169 | - | |
| 170 | -```bash | |
| 171 | -{ | |
| 172 | - "query": "玩具 AND (乐高 OR 芭比) ANDNOT 电动" | |
| 173 | -} | |
| 174 | -``` | |
| 175 | - | |
| 176 | -**操作符优先级**: `()` > `ANDNOT` > `AND` > `OR` > `RANK` | |
| 177 | - | |
| 178 | ---- | |
| 179 | - | |
| 180 | 168 | ## 分页 |
| 181 | 169 | |
| 182 | 170 | ```bash | ... | ... |
query/language_detector.py
| 1 | 1 | """ |
| 2 | 2 | Language detection utility. |
| 3 | 3 | |
| 4 | -Detects the language of a query string. | |
| 4 | +Detects language of short e-commerce queries with script checks + lightweight | |
| 5 | +Latin-language scoring (de/fr/es/it/pt/nl/en). | |
| 5 | 6 | """ |
| 6 | 7 | |
| 7 | -from typing import Optional | |
| 8 | +from typing import Dict, List | |
| 8 | 9 | import re |
| 9 | 10 | |
| 10 | 11 | |
| 11 | 12 | class LanguageDetector: |
| 12 | - """Simple rule-based language detector for common e-commerce languages.""" | |
| 13 | - | |
| 14 | - # Unicode ranges for different scripts | |
| 15 | - CJK_RANGES = [ | |
| 16 | - (0x4E00, 0x9FFF), # CJK Unified Ideographs | |
| 17 | - (0x3400, 0x4DBF), # CJK Extension A | |
| 18 | - (0x20000, 0x2A6DF), # CJK Extension B | |
| 19 | - (0x3040, 0x309F), # Hiragana | |
| 20 | - (0x30A0, 0x30FF), # Katakana | |
| 21 | - ] | |
| 22 | - | |
| 23 | - CYRILLIC_RANGE = (0x0400, 0x04FF) | |
| 24 | - ARABIC_RANGE = (0x0600, 0x06FF) | |
| 25 | - LATIN_RANGE = (0x0041, 0x007A) | |
| 13 | + """Rule-based language detector for common e-commerce query languages.""" | |
| 26 | 14 | |
| 27 | 15 | def __init__(self): |
| 28 | - """Initialize language detector.""" | |
| 29 | - self.chinese_pattern = re.compile(r'[\u4e00-\u9fff]+') | |
| 30 | - self.russian_pattern = re.compile(r'[\u0400-\u04ff]+') | |
| 31 | - self.arabic_pattern = re.compile(r'[\u0600-\u06ff]+') | |
| 32 | - self.japanese_pattern = re.compile(r'[\u3040-\u309f\u30a0-\u30ff]+') | |
| 16 | + self._re_zh = re.compile(r"[\u4e00-\u9fff]") | |
| 17 | + self._re_ja_kana = re.compile(r"[\u3040-\u30ff]") | |
| 18 | + self._re_ko = re.compile(r"[\uac00-\ud7af]") | |
| 19 | + self._re_ru = re.compile(r"[\u0400-\u04ff]") | |
| 20 | + self._re_ar = re.compile(r"[\u0600-\u06ff]") | |
| 21 | + self._re_hi = re.compile(r"[\u0900-\u097f]") | |
| 22 | + self._re_he = re.compile(r"[\u0590-\u05ff]") | |
| 23 | + self._re_th = re.compile(r"[\u0e00-\u0e7f]") | |
| 24 | + self._re_latin_word = re.compile(r"[A-Za-zÀ-ÖØ-öø-ÿ]+") | |
| 25 | + | |
| 26 | + # Stopwords + e-commerce terms for Latin-family disambiguation. | |
| 27 | + self._latin_lexicons: Dict[str, set] = { | |
| 28 | + "en": { | |
| 29 | + "the", "and", "for", "with", "new", "women", "men", "kids", | |
| 30 | + "shoe", "shoes", "dress", "shirt", "jacket", "bag", "wireless", | |
| 31 | + }, | |
| 32 | + "de": { | |
| 33 | + "der", "die", "das", "und", "mit", "für", "damen", "herren", | |
| 34 | + "kinder", "schuhe", "kleid", "hemd", "jacke", "tasche", | |
| 35 | + }, | |
| 36 | + "fr": { | |
| 37 | + "le", "la", "les", "et", "avec", "pour", "femme", "homme", | |
| 38 | + "enfant", "chaussures", "robe", "chemise", "veste", "sac", | |
| 39 | + }, | |
| 40 | + "es": { | |
| 41 | + "el", "la", "los", "las", "y", "con", "para", "mujer", "hombre", | |
| 42 | + "niño", "niña", "zapatos", "vestido", "camisa", "chaqueta", "bolso", | |
| 43 | + }, | |
| 44 | + "it": { | |
| 45 | + "il", "lo", "la", "gli", "le", "e", "con", "per", "donna", "uomo", | |
| 46 | + "bambino", "scarpe", "abito", "camicia", "giacca", "borsa", | |
| 47 | + }, | |
| 48 | + "pt": { | |
| 49 | + "o", "a", "os", "as", "e", "com", "para", "mulher", "homem", | |
| 50 | + "criança", "sapatos", "vestido", "camisa", "jaqueta", "bolsa", | |
| 51 | + }, | |
| 52 | + "nl": { | |
| 53 | + "de", "het", "en", "met", "voor", "dames", "heren", "kinderen", | |
| 54 | + "schoenen", "jurk", "overhemd", "jas", "tas", | |
| 55 | + }, | |
| 56 | + } | |
| 57 | + self._diacritic_weights: Dict[str, Dict[str, int]] = { | |
| 58 | + "de": {"ä": 3, "ö": 3, "ü": 3, "ß": 4}, | |
| 59 | + "fr": {"é": 2, "è": 2, "ê": 2, "à": 2, "ç": 2, "ù": 2, "ô": 2}, | |
| 60 | + "es": {"ñ": 3, "á": 2, "é": 2, "í": 2, "ó": 2, "ú": 2}, | |
| 61 | + "it": {"à": 2, "è": 2, "é": 2, "ì": 2, "ò": 2, "ù": 2}, | |
| 62 | + "pt": {"ã": 3, "õ": 3, "ç": 2, "á": 2, "â": 2, "ê": 2, "ô": 2}, | |
| 63 | + "nl": {"ij": 2}, | |
| 64 | + } | |
| 33 | 65 | |
| 34 | 66 | def detect(self, text: str) -> str: |
| 35 | 67 | """ |
| 36 | - Detect language of text. | |
| 37 | - | |
| 38 | - Args: | |
| 39 | - text: Input text | |
| 68 | + Detect language code for text. | |
| 40 | 69 | |
| 41 | - Returns: | |
| 42 | - Language code: 'zh', 'en', 'ru', 'ar', 'ja', or 'unknown' | |
| 70 | + Returns one of: zh/en/ru/ar/ja/ko/de/fr/es/it/pt/nl/hi/he/th/unknown | |
| 43 | 71 | """ |
| 44 | 72 | if not text or not text.strip(): |
| 45 | - return 'unknown' | |
| 46 | - | |
| 47 | - text = text.strip() | |
| 48 | - | |
| 49 | - # Count characters in each script | |
| 50 | - char_counts = { | |
| 51 | - 'chinese': 0, | |
| 52 | - 'russian': 0, | |
| 53 | - 'arabic': 0, | |
| 54 | - 'japanese': 0, | |
| 55 | - 'latin': 0 | |
| 56 | - } | |
| 57 | - | |
| 58 | - for char in text: | |
| 59 | - code_point = ord(char) | |
| 60 | - | |
| 61 | - # Check CJK (Chinese/Japanese) | |
| 62 | - is_cjk = any(start <= code_point <= end for start, end in self.CJK_RANGES) | |
| 63 | - if is_cjk: | |
| 64 | - char_counts['chinese'] += 1 | |
| 65 | - | |
| 66 | - # Check Hiragana/Katakana (Japanese) | |
| 67 | - if 0x3040 <= code_point <= 0x30FF: | |
| 68 | - char_counts['japanese'] += 1 | |
| 69 | - | |
| 70 | - # Check Cyrillic (Russian) | |
| 71 | - if self.CYRILLIC_RANGE[0] <= code_point <= self.CYRILLIC_RANGE[1]: | |
| 72 | - char_counts['russian'] += 1 | |
| 73 | - | |
| 74 | - # Check Arabic | |
| 75 | - if self.ARABIC_RANGE[0] <= code_point <= self.ARABIC_RANGE[1]: | |
| 76 | - char_counts['arabic'] += 1 | |
| 77 | - | |
| 78 | - # Check Latin | |
| 79 | - if (0x0041 <= code_point <= 0x005A) or (0x0061 <= code_point <= 0x007A): | |
| 80 | - char_counts['latin'] += 1 | |
| 81 | - | |
| 82 | - # Determine dominant script | |
| 83 | - total_chars = sum(char_counts.values()) | |
| 84 | - if total_chars == 0: | |
| 85 | - return 'unknown' | |
| 86 | - | |
| 87 | - # Calculate percentages | |
| 88 | - percentages = { | |
| 89 | - script: count / total_chars | |
| 90 | - for script, count in char_counts.items() | |
| 91 | - } | |
| 92 | - | |
| 93 | - # Japanese has both Hiragana/Katakana and CJK | |
| 94 | - if percentages['japanese'] > 0.1: | |
| 95 | - return 'ja' | |
| 96 | - | |
| 97 | - # Russian (Cyrillic) | |
| 98 | - if percentages['russian'] > 0.5: | |
| 99 | - return 'ru' | |
| 100 | - | |
| 101 | - # Arabic | |
| 102 | - if percentages['arabic'] > 0.5: | |
| 103 | - return 'ar' | |
| 104 | - | |
| 105 | - # Chinese (CJK without Japanese kana) | |
| 106 | - if percentages['chinese'] > 0.3: | |
| 107 | - return 'zh' | |
| 108 | - | |
| 109 | - # English/Latin | |
| 110 | - if percentages['latin'] > 0.5: | |
| 111 | - return 'en' | |
| 112 | - | |
| 113 | - return 'unknown' | |
| 73 | + return "unknown" | |
| 74 | + q = text.strip().lower() | |
| 75 | + | |
| 76 | + # Script-first detection for non-Latin languages. | |
| 77 | + if self._re_ja_kana.search(q): | |
| 78 | + return "ja" | |
| 79 | + if self._re_ko.search(q): | |
| 80 | + return "ko" | |
| 81 | + if self._re_zh.search(q): | |
| 82 | + return "zh" | |
| 83 | + if self._re_ru.search(q): | |
| 84 | + return "ru" | |
| 85 | + if self._re_ar.search(q): | |
| 86 | + return "ar" | |
| 87 | + if self._re_hi.search(q): | |
| 88 | + return "hi" | |
| 89 | + if self._re_he.search(q): | |
| 90 | + return "he" | |
| 91 | + if self._re_th.search(q): | |
| 92 | + return "th" | |
| 93 | + | |
| 94 | + # Latin-family scoring. | |
| 95 | + tokens = self._re_latin_word.findall(q) | |
| 96 | + if not tokens: | |
| 97 | + return "unknown" | |
| 98 | + | |
| 99 | + scores: Dict[str, float] = {k: 0.0 for k in self._latin_lexicons.keys()} | |
| 100 | + scores["en"] = scores.get("en", 0.0) | |
| 101 | + token_set = set(tokens) | |
| 102 | + | |
| 103 | + # Lexicon matches | |
| 104 | + for lang, lex in self._latin_lexicons.items(): | |
| 105 | + overlap = len(token_set & lex) | |
| 106 | + if overlap: | |
| 107 | + scores[lang] += overlap * 2.0 | |
| 108 | + | |
| 109 | + # Diacritics / orthographic hints | |
| 110 | + for lang, hints in self._diacritic_weights.items(): | |
| 111 | + for marker, weight in hints.items(): | |
| 112 | + if marker in q: | |
| 113 | + scores[lang] += weight | |
| 114 | + | |
| 115 | + # Light suffix hints; weights kept low since e.g. "-ment" also occurs in English | |
| 116 | + for t in tokens: | |
| 117 | + if t.endswith("ung") or t.endswith("chen"): | |
| 118 | + scores["de"] += 0.6 | |
| 119 | + if t.endswith("ción") or t.endswith("ado") or t.endswith("ada"): | |
| 120 | + scores["es"] += 0.6 | |
| 121 | + if t.endswith("zione") or t.endswith("etto") or t.endswith("ella"): | |
| 122 | + scores["it"] += 0.6 | |
| 123 | + if t.endswith("ção") or t.endswith("mente"): | |
| 124 | + scores["pt"] += 0.6 | |
| 125 | + if t.endswith("ment") or t.endswith("eau"): | |
| 126 | + scores["fr"] += 0.5 | |
| 127 | + | |
| 128 | + # Slight English prior: breaks ties for plain Latin tokens and keeps the | |
| 129 | + # best score positive (so the <= 0 guard below is only a defensive fallback). | |
| 129 | + scores["en"] += 0.2 | |
| 130 | + | |
| 131 | + best_lang = max(scores.items(), key=lambda x: x[1])[0] | |
| 132 | + best_score = scores[best_lang] | |
| 133 | + if best_score <= 0: | |
| 134 | + return "en" | |
| 135 | + return best_lang | |
| 114 | 136 | |
| 115 | 137 | def is_chinese(self, text: str) -> bool: |
| 116 | - """Check if text is primarily Chinese.""" | |
| 117 | - return self.detect(text) == 'zh' | |
| 138 | + return self.detect(text) == "zh" | |
| 118 | 139 | |
| 119 | 140 | def is_english(self, text: str) -> bool: |
| 120 | - """Check if text is primarily English.""" | |
| 121 | - return self.detect(text) == 'en' | |
| 141 | + return self.detect(text) == "en" | |
| 122 | 142 | |
| 123 | 143 | def is_russian(self, text: str) -> bool: |
| 124 | - """Check if text is primarily Russian.""" | |
| 125 | - return self.detect(text) == 'ru' | |
| 144 | + return self.detect(text) == "ru" | |
| 126 | 145 | |
| 127 | 146 | def is_arabic(self, text: str) -> bool: |
| 128 | - """Check if text is primarily Arabic.""" | |
| 129 | - return self.detect(text) == 'ar' | |
| 147 | + return self.detect(text) == "ar" | |
| 130 | 148 | |
| 131 | 149 | def is_japanese(self, text: str) -> bool: |
| 132 | - """Check if text is primarily Japanese.""" | |
| 133 | - return self.detect(text) == 'ja' | |
| 150 | + return self.detect(text) == "ja" | ... | ... |
query/query_parser.py
| ... | ... | @@ -37,7 +37,11 @@ class ParsedQuery: |
| 37 | 37 | domain: str = "default", |
| 38 | 38 | keywords: str = "", |
| 39 | 39 | token_count: int = 0, |
| 40 | - query_tokens: Optional[List[str]] = None | |
| 40 | + query_tokens: Optional[List[str]] = None, | |
| 41 | + query_text_by_lang: Optional[Dict[str, str]] = None, | |
| 42 | + search_langs: Optional[List[str]] = None, | |
| 43 | + index_languages: Optional[List[str]] = None, | |
| 44 | + source_in_index_languages: bool = True, | |
| 41 | 45 | ): |
| 42 | 46 | self.original_query = original_query |
| 43 | 47 | self.query_normalized = query_normalized |
| ... | ... | @@ -50,6 +54,10 @@ class ParsedQuery: |
| 50 | 54 | self.keywords = keywords |
| 51 | 55 | self.token_count = token_count |
| 52 | 56 | self.query_tokens = query_tokens or [] |
| 57 | + self.query_text_by_lang = query_text_by_lang or {} | |
| 58 | + self.search_langs = search_langs or [] | |
| 59 | + self.index_languages = index_languages or [] | |
| 60 | + self.source_in_index_languages = bool(source_in_index_languages) | |
| 53 | 61 | |
| 54 | 62 | def to_dict(self) -> Dict[str, Any]: |
| 55 | 63 | """Convert to dictionary representation.""" |
| ... | ... | @@ -61,6 +69,10 @@ class ParsedQuery: |
| 61 | 69 | "translations": self.translations, |
| 62 | 70 | "domain": self.domain |
| 63 | 71 | } |
| 72 | + result["query_text_by_lang"] = self.query_text_by_lang | |
| 73 | + result["search_langs"] = self.search_langs | |
| 74 | + result["index_languages"] = self.index_languages | |
| 75 | + result["source_in_index_languages"] = self.source_in_index_languages | |
| 64 | 76 | return result |
| 65 | 77 | |
| 66 | 78 | |
| ... | ... | @@ -253,12 +265,21 @@ class QueryParser: |
| 253 | 265 | # Stage 4: Translation (with async support and conditional waiting) |
| 254 | 266 | translations = {} |
| 255 | 267 | translation_futures = {} |
| 268 | + index_langs = ["en", "zh"] | |
| 256 | 269 | try: |
| 257 | 270 | # 根据租户配置的 index_languages 决定翻译目标语言 |
| 258 | 271 | from config.tenant_config_loader import get_tenant_config_loader |
| 259 | 272 | tenant_loader = get_tenant_config_loader() |
| 260 | 273 | tenant_cfg = tenant_loader.get_tenant_config(tenant_id or "default") |
| 261 | - index_langs = tenant_cfg.get("index_languages") or ["en", "zh"] | |
| 274 | + raw_index_langs = tenant_cfg.get("index_languages") or ["en", "zh"] | |
| 275 | + index_langs = [] | |
| 276 | + seen_langs = set() | |
| 277 | + for lang in raw_index_langs: | |
| 278 | + norm_lang = str(lang or "").strip().lower() | |
| 279 | + if not norm_lang or norm_lang in seen_langs: | |
| 280 | + continue | |
| 281 | + seen_langs.add(norm_lang) | |
| 282 | + index_langs.append(norm_lang) | |
| 262 | 283 | |
| 263 | 284 | target_langs_for_translation = [lang for lang in index_langs if lang != detected_lang] |
| 264 | 285 | |
| ... | ... | @@ -269,8 +290,12 @@ class QueryParser: |
| 269 | 290 | # Use e-commerce context for better disambiguation |
| 270 | 291 | translation_context = self.config.query_config.translation_context |
| 271 | 292 | # For query translation, we use a general prompt (not language-specific) |
| 272 | - query_prompt = self.config.query_config.translation_prompts.get('query_zh') or \ | |
| 273 | - self.config.query_config.translation_prompts.get('default_zh') | |
| 293 | + query_prompt = ( | |
| 294 | + self.config.query_config.translation_prompts.get(f"query_{detected_lang}") | |
| 295 | + or self.config.query_config.translation_prompts.get("query_en") | |
| 296 | + or self.config.query_config.translation_prompts.get("default_en") | |
| 297 | + or self.config.query_config.translation_prompts.get("default_zh") | |
| 298 | + ) | |
| 274 | 299 | |
| 275 | 300 | # Determine if we need to wait for translation results |
| 276 | 301 | # If detected_lang is not in index_languages, we must wait for translation |
| ... | ... | @@ -417,6 +442,33 @@ class QueryParser: |
| 417 | 442 | # Update translations in context after all are complete |
| 418 | 443 | if translations and context: |
| 419 | 444 | context.store_intermediate_result('translations', translations) |
| 445 | + | |
| 446 | + # Build language-scoped query plan: source language + available translations | |
| 447 | + query_text_by_lang: Dict[str, str] = {} | |
| 448 | + if query_text: | |
| 449 | + query_text_by_lang[detected_lang] = query_text | |
| 450 | + for lang, translated_text in (translations or {}).items(): | |
| 451 | + if translated_text and str(translated_text).strip(): | |
| 452 | + query_text_by_lang[str(lang).strip().lower()] = str(translated_text) | |
| 453 | + | |
| 454 | + source_in_index_languages = detected_lang in index_langs | |
| 455 | + ordered_search_langs: List[str] = [] | |
| 456 | + seen_order = set() | |
| 457 | + if detected_lang in query_text_by_lang: | |
| 458 | + ordered_search_langs.append(detected_lang) | |
| 459 | + seen_order.add(detected_lang) | |
| 460 | + for lang in index_langs: | |
| 461 | + if lang in query_text_by_lang and lang not in seen_order: | |
| 462 | + ordered_search_langs.append(lang) | |
| 463 | + seen_order.add(lang) | |
| 464 | + for lang in query_text_by_lang.keys(): | |
| 465 | + if lang not in seen_order: | |
| 466 | + ordered_search_langs.append(lang) | |
| 467 | + seen_order.add(lang) | |
| 468 | + | |
| 469 | + if context: | |
| 470 | + context.store_intermediate_result("search_langs", ordered_search_langs) | |
| 471 | + context.store_intermediate_result("query_text_by_lang", query_text_by_lang) | |
| 420 | 472 | |
| 421 | 473 | # Build result |
| 422 | 474 | result = ParsedQuery( |
| ... | ... | @@ -429,7 +481,11 @@ class QueryParser: |
| 429 | 481 | domain=domain, |
| 430 | 482 | keywords=keywords, |
| 431 | 483 | token_count=token_count, |
| 432 | - query_tokens=query_tokens | |
| 484 | + query_tokens=query_tokens, | |
| 485 | + query_text_by_lang=query_text_by_lang, | |
| 486 | + search_langs=ordered_search_langs, | |
| 487 | + index_languages=index_langs, | |
| 488 | + source_in_index_languages=source_in_index_languages, | |
| 433 | 489 | ) |
| 434 | 490 | |
| 435 | 491 | if context and hasattr(context, 'logger'): | ... | ... |
query/query_rewriter.py
| ... | ... | @@ -19,7 +19,7 @@ class QueryRewriter: |
| 19 | 19 | |
| 20 | 20 | Args: |
| 21 | 21 | rewrite_dict: Dictionary mapping exact query terms to rewrite expressions |
| 22 | - e.g., {"芭比": "brand:芭比 OR name:芭比娃娃"} | |
| 22 | + e.g., {"芭比": "芭比娃娃"} | |
| 23 | 23 | Only full word matches will be rewritten, no partial matching. |
| 24 | 24 | """ |
| 25 | 25 | self.rewrite_dict = rewrite_dict or {} |
| ... | ... | @@ -107,13 +107,13 @@ class QueryNormalizer: |
| 107 | 107 | return query |
| 108 | 108 | |
| 109 | 109 | @staticmethod |
| 110 | - def remove_punctuation(query: str, keep_operators: bool = True) -> str: | |
| 110 | + def remove_punctuation(query: str, keep_operators: bool = False) -> str: | |
| 111 | 111 | """ |
| 112 | 112 | Remove punctuation from query. |
| 113 | 113 | |
| 114 | 114 | Args: |
| 115 | 115 | query: Original query |
| 116 | - keep_operators: Whether to keep boolean operators (AND, OR, etc.) | |
| 116 | + keep_operators: Whether to keep symbols used by the legacy boolean query syntax (deprecated; now defaults to False). | |
| 117 | 117 | |
| 118 | 118 | Returns: |
| 119 | 119 | Query without punctuation | ... | ... |
search/__init__.py
| 1 | 1 | """Search package initialization.""" |
| 2 | 2 | |
| 3 | -from .boolean_parser import BooleanParser, QueryNode | |
| 4 | 3 | from .es_query_builder import ESQueryBuilder |
| 5 | 4 | from .searcher import Searcher, SearchResult |
| 6 | 5 | |
| 7 | 6 | __all__ = [ |
| 8 | - 'BooleanParser', | |
| 9 | - 'QueryNode', | |
| 10 | 7 | 'ESQueryBuilder', |
| 11 | 8 | 'Searcher', |
| 12 | 9 | 'SearchResult', | ... | ... |
search/boolean_parser.py deleted
| ... | ... | @@ -1,201 +0,0 @@ |
| 1 | -""" | |
| 2 | -Boolean expression parser for search queries. | |
| 3 | - | |
| 4 | -Supports: AND, OR, RANK, ANDNOT operators with parentheses. | |
| 5 | -Precedence (high to low): (), ANDNOT, AND, OR, RANK | |
| 6 | -""" | |
| 7 | - | |
| 8 | -import re | |
| 9 | -from typing import List, Tuple, Optional | |
| 10 | -from dataclasses import dataclass | |
| 11 | - | |
| 12 | - | |
| 13 | -@dataclass | |
| 14 | -class QueryNode: | |
| 15 | - """Represents a node in the parsed query tree.""" | |
| 16 | - operator: str # 'AND', 'OR', 'RANK', 'ANDNOT', 'TERM' | |
| 17 | - terms: List['QueryNode'] = None # Child nodes for operators | |
| 18 | - value: str = None # Value for leaf nodes (TERM) | |
| 19 | - | |
| 20 | - def __repr__(self): | |
| 21 | - if self.operator == 'TERM': | |
| 22 | - return f"TERM({self.value})" | |
| 23 | - else: | |
| 24 | - return f"{self.operator}({', '.join(str(t) for t in self.terms)})" | |
| 25 | - | |
| 26 | - | |
| 27 | -class BooleanParser: | |
| 28 | - """ | |
| 29 | - Parser for boolean search expressions. | |
| 30 | - | |
| 31 | - Operator precedence (high to low): | |
| 32 | - 1. () - Parentheses | |
| 33 | - 2. ANDNOT - AND NOT (exclusion) | |
| 34 | - 3. AND - All terms must match | |
| 35 | - 4. OR - Any term must match | |
| 36 | - 5. RANK - Scoring boost (like OR but affects ranking) | |
| 37 | - """ | |
| 38 | - | |
| 39 | - OPERATORS = {'AND', 'OR', 'RANK', 'ANDNOT'} | |
| 40 | - PRECEDENCE = { | |
| 41 | - 'ANDNOT': 3, | |
| 42 | - 'AND': 2, | |
| 43 | - 'OR': 1, | |
| 44 | - 'RANK': 0 | |
| 45 | - } | |
| 46 | - | |
| 47 | - def __init__(self): | |
| 48 | - """Initialize boolean parser.""" | |
| 49 | - pass | |
| 50 | - | |
| 51 | - def parse(self, expression: str) -> QueryNode: | |
| 52 | - """ | |
| 53 | - Parse boolean expression into query tree. | |
| 54 | - | |
| 55 | - Args: | |
| 56 | - expression: Boolean expression string | |
| 57 | - Example: "laptop AND (gaming OR professional) ANDNOT cheap" | |
| 58 | - | |
| 59 | - Returns: | |
| 60 | - Root QueryNode of parsed tree | |
| 61 | - """ | |
| 62 | - if not expression or not expression.strip(): | |
| 63 | - return QueryNode(operator='TERM', value='') | |
| 64 | - | |
| 65 | - # Tokenize | |
| 66 | - tokens = self._tokenize(expression) | |
| 67 | - | |
| 68 | - if not tokens: | |
| 69 | - return QueryNode(operator='TERM', value='') | |
| 70 | - | |
| 71 | - # Parse with precedence | |
| 72 | - return self._parse_expression(tokens) | |
| 73 | - | |
| 74 | - def _tokenize(self, expression: str) -> List[str]: | |
| 75 | - """ | |
| 76 | - Tokenize expression into terms and operators. | |
| 77 | - | |
| 78 | - Args: | |
| 79 | - expression: Expression string | |
| 80 | - | |
| 81 | - Returns: | |
| 82 | - List of tokens | |
| 83 | - """ | |
| 84 | - # Pattern to match: operators, parentheses, or terms (with domain prefix support) | |
| 85 | - pattern = r'\b(AND|OR|RANK|ANDNOT)\b|[()]|(?:\w+:)?[^\s()]+' | |
| 86 | - | |
| 87 | - tokens = [] | |
| 88 | - for match in re.finditer(pattern, expression): | |
| 89 | - token = match.group().strip() | |
| 90 | - if token: | |
| 91 | - tokens.append(token) | |
| 92 | - | |
| 93 | - return tokens | |
| 94 | - | |
| 95 | - def _parse_expression(self, tokens: List[str], start: int = 0) -> Tuple[QueryNode, int]: | |
| 96 | - """ | |
| 97 | - Parse expression with operator precedence. | |
| 98 | - | |
| 99 | - Args: | |
| 100 | - tokens: List of tokens | |
| 101 | - start: Starting index | |
| 102 | - | |
| 103 | - Returns: | |
| 104 | - Tuple of (QueryNode, next_index) | |
| 105 | - """ | |
| 106 | - # Start with lowest precedence (RANK) | |
| 107 | - return self._parse_rank(tokens, start) | |
| 108 | - | |
| 109 | - def _parse_rank(self, tokens: List[str], start: int) -> Tuple[QueryNode, int]: | |
| 110 | - """Parse RANK operator (lowest precedence).""" | |
| 111 | - left, pos = self._parse_or(tokens, start) | |
| 112 | - | |
| 113 | - while pos < len(tokens) and tokens[pos] == 'RANK': | |
| 114 | - pos += 1 # Skip 'RANK' | |
| 115 | - right, pos = self._parse_or(tokens, pos) | |
| 116 | - left = QueryNode(operator='RANK', terms=[left, right]) | |
| 117 | - | |
| 118 | - return left, pos | |
| 119 | - | |
| 120 | - def _parse_or(self, tokens: List[str], start: int) -> Tuple[QueryNode, int]: | |
| 121 | - """Parse OR operator.""" | |
| 122 | - left, pos = self._parse_and(tokens, start) | |
| 123 | - | |
| 124 | - while pos < len(tokens) and tokens[pos] == 'OR': | |
| 125 | - pos += 1 # Skip 'OR' | |
| 126 | - right, pos = self._parse_and(tokens, pos) | |
| 127 | - left = QueryNode(operator='OR', terms=[left, right]) | |
| 128 | - | |
| 129 | - return left, pos | |
| 130 | - | |
| 131 | - def _parse_and(self, tokens: List[str], start: int) -> Tuple[QueryNode, int]: | |
| 132 | - """Parse AND operator.""" | |
| 133 | - left, pos = self._parse_andnot(tokens, start) | |
| 134 | - | |
| 135 | - while pos < len(tokens) and tokens[pos] == 'AND': | |
| 136 | - pos += 1 # Skip 'AND' | |
| 137 | - right, pos = self._parse_andnot(tokens, pos) | |
| 138 | - left = QueryNode(operator='AND', terms=[left, right]) | |
| 139 | - | |
| 140 | - return left, pos | |
| 141 | - | |
| 142 | - def _parse_andnot(self, tokens: List[str], start: int) -> Tuple[QueryNode, int]: | |
| 143 | - """Parse ANDNOT operator (highest precedence).""" | |
| 144 | - left, pos = self._parse_primary(tokens, start) | |
| 145 | - | |
| 146 | - while pos < len(tokens) and tokens[pos] == 'ANDNOT': | |
| 147 | - pos += 1 # Skip 'ANDNOT' | |
| 148 | - right, pos = self._parse_primary(tokens, pos) | |
| 149 | - left = QueryNode(operator='ANDNOT', terms=[left, right]) | |
| 150 | - | |
| 151 | - return left, pos | |
| 152 | - | |
| 153 | - def _parse_primary(self, tokens: List[str], start: int) -> Tuple[QueryNode, int]: | |
| 154 | - """Parse primary expression (terms or parentheses).""" | |
| 155 | - if start >= len(tokens): | |
| 156 | - return QueryNode(operator='TERM', value=''), start | |
| 157 | - | |
| 158 | - token = tokens[start] | |
| 159 | - | |
| 160 | - # Handle parentheses | |
| 161 | - if token == '(': | |
| 162 | - # Find matching closing parenthesis | |
| 163 | - depth = 1 | |
| 164 | - pos = start + 1 | |
| 165 | - while pos < len(tokens) and depth > 0: | |
| 166 | - if tokens[pos] == '(': | |
| 167 | - depth += 1 | |
| 168 | - elif tokens[pos] == ')': | |
| 169 | - depth -= 1 | |
| 170 | - pos += 1 | |
| 171 | - | |
| 172 | - # Parse contents of parentheses | |
| 173 | - inner_tokens = tokens[start + 1:pos - 1] | |
| 174 | - if inner_tokens: | |
| 175 | - node, _ = self._parse_expression(inner_tokens, 0) | |
| 176 | - return node, pos | |
| 177 | - else: | |
| 178 | - return QueryNode(operator='TERM', value=''), pos | |
| 179 | - | |
| 180 | - # Handle term | |
| 181 | - if token not in self.OPERATORS and token not in ['(', ')']: | |
| 182 | - return QueryNode(operator='TERM', value=token), start + 1 | |
| 183 | - | |
| 184 | - # Unexpected token | |
| 185 | - return QueryNode(operator='TERM', value=''), start + 1 | |
| 186 | - | |
| 187 | - def is_simple_query(self, expression: str) -> bool: | |
| 188 | - """ | |
| 189 | - Check if query is simple (no boolean operators). | |
| 190 | - | |
| 191 | - Args: | |
| 192 | - expression: Query expression | |
| 193 | - | |
| 194 | - Returns: | |
| 195 | - True if simple query (no operators) | |
| 196 | - """ | |
| 197 | - tokens = self._tokenize(expression) | |
| 198 | - for token in tokens: | |
| 199 | - if token in self.OPERATORS: | |
| 200 | - return False | |
| 201 | - return True |
search/es_query_builder.py
| ... | ... | @@ -10,7 +10,6 @@ Simplified architecture: |
| 10 | 10 | |
| 11 | 11 | from typing import Dict, Any, List, Optional, Union, Tuple |
| 12 | 12 | import numpy as np |
| 13 | -from .boolean_parser import QueryNode | |
| 14 | 13 | from config import FunctionScoreConfig |
| 15 | 14 | |
| 16 | 15 | |
| ... | ... | @@ -20,18 +19,31 @@ class ESQueryBuilder: |
| 20 | 19 | def __init__( |
| 21 | 20 | self, |
| 22 | 21 | match_fields: List[str], |
| 22 | + field_boosts: Optional[Dict[str, float]] = None, | |
| 23 | + multilingual_fields: Optional[List[str]] = None, | |
| 24 | + shared_fields: Optional[List[str]] = None, | |
| 25 | + core_multilingual_fields: Optional[List[str]] = None, | |
| 23 | 26 | text_embedding_field: Optional[str] = None, |
| 24 | 27 | image_embedding_field: Optional[str] = None, |
| 25 | 28 | source_fields: Optional[List[str]] = None, |
| 26 | 29 | function_score_config: Optional[FunctionScoreConfig] = None, |
| 27 | 30 | default_language: str = "en", |
| 28 | - knn_boost: float = 0.25 | |
| 31 | + knn_boost: float = 0.25, | |
| 32 | + base_minimum_should_match: str = "75%", | |
| 33 | + translation_minimum_should_match: str = "75%", | |
| 34 | + translation_boost: float = 0.4, | |
| 35 | + translation_boost_when_source_missing: float = 1.0, | |
| 36 | + source_boost_when_missing: float = 0.6, | |
| 37 | + keywords_boost: float = 0.1, | |
| 38 | + enable_phrase_query: bool = True, | |
| 39 | + tie_breaker_base_query: float = 0.9, | |
| 40 | + tie_breaker_keywords: float = 0.9, | |
| 29 | 41 | ): |
| 30 | 42 | """ |
| 31 | 43 | Initialize query builder. |
| 32 | 44 | |
| 33 | 45 | Multi-language search (translation-based cross-language recall) is always enabled: |
| 34 | - queries are matched against both detected-language and translated zh/en clauses. | |
| 46 | + queries are matched against detected-language and translated target-language clauses. | |
| 35 | 47 | |
| 36 | 48 | Args: |
| 37 | 49 | match_fields: Fields to search for text matching |
| ... | ... | @@ -43,12 +55,27 @@ class ESQueryBuilder: |
| 43 | 55 | knn_boost: Boost value for KNN (embedding recall) |
| 44 | 56 | """ |
| 45 | 57 | self.match_fields = match_fields |
| 58 | + self.field_boosts = field_boosts or {} | |
| 59 | + self.multilingual_fields = multilingual_fields or [ | |
| 60 | + "title", "brief", "description", "vendor", "category_path", "category_name_text" | |
| 61 | + ] | |
| 62 | + self.shared_fields = shared_fields or ["tags", "option1_values", "option2_values", "option3_values"] | |
| 63 | + self.core_multilingual_fields = core_multilingual_fields or ["title", "brief", "vendor", "category_name_text"] | |
| 46 | 64 | self.text_embedding_field = text_embedding_field |
| 47 | 65 | self.image_embedding_field = image_embedding_field |
| 48 | 66 | self.source_fields = source_fields |
| 49 | 67 | self.function_score_config = function_score_config |
| 50 | 68 | self.default_language = default_language |
| 51 | 69 | self.knn_boost = knn_boost |
| 70 | + self.base_minimum_should_match = base_minimum_should_match | |
| 71 | + self.translation_minimum_should_match = translation_minimum_should_match | |
| 72 | + self.translation_boost = float(translation_boost) | |
| 73 | + self.translation_boost_when_source_missing = float(translation_boost_when_source_missing) | |
| 74 | + self.source_boost_when_missing = float(source_boost_when_missing) | |
| 75 | + self.keywords_boost = float(keywords_boost) | |
| 76 | + self.enable_phrase_query = bool(enable_phrase_query) | |
| 77 | + self.tie_breaker_base_query = float(tie_breaker_base_query) | |
| 78 | + self.tie_breaker_keywords = float(tie_breaker_keywords) | |
| 52 | 79 | |
| 53 | 80 | def _apply_source_filter(self, es_query: Dict[str, Any]) -> None: |
| 54 | 81 | """ |
| ... | ... | @@ -118,7 +145,6 @@ class ESQueryBuilder: |
| 118 | 145 | self, |
| 119 | 146 | query_text: str, |
| 120 | 147 | query_vector: Optional[np.ndarray] = None, |
| 121 | - query_node: Optional[QueryNode] = None, | |
| 122 | 148 | filters: Optional[Dict[str, Any]] = None, |
| 123 | 149 | range_filters: Optional[Dict[str, Any]] = None, |
| 124 | 150 | facet_configs: Optional[List[Any]] = None, |
| ... | ... | @@ -136,14 +162,13 @@ class ESQueryBuilder: |
| 136 | 162 | 结构:filters and (text_recall or embedding_recall) + post_filter |
| 137 | 163 | - conjunctive_filters: 应用在 query.bool.filter(影响结果和聚合) |
| 138 | 164 | - disjunctive_filters: 应用在 post_filter(只影响结果,不影响聚合) |
| 139 | - - text_recall: 文本相关性召回(中英文字段都用) | |
| 165 | + - text_recall: 文本相关性召回(按 search_langs 动态语言字段) | |
| 140 | 166 | - embedding_recall: 向量召回(KNN) |
| 141 | 167 | - function_score: 包装召回部分,支持提权字段 |
| 142 | 168 | |
| 143 | 169 | Args: |
| 144 | 170 | query_text: Query text for BM25 matching |
| 145 | 171 | query_vector: Query embedding for KNN search |
| 146 | - query_node: Parsed boolean expression tree | |
| 147 | 172 | filters: Exact match filters |
| 148 | 173 | range_filters: Range filters for numeric fields (always applied in query) |
| 149 | 174 | facet_configs: Facet configurations (used to identify multi-select facets) |
| ... | ... | @@ -157,6 +182,7 @@ class ESQueryBuilder: |
| 157 | 182 | Returns: |
| 158 | 183 | ES query DSL dictionary |
| 159 | 184 | """ |
| 185 | + # Boolean AST path has been removed; keep a single text strategy. | |
| 160 | 186 | es_query = { |
| 161 | 187 | "size": size, |
| 162 | 188 | "from": from_ |
| ... | ... | @@ -170,12 +196,8 @@ class ESQueryBuilder: |
| 170 | 196 | |
| 171 | 197 | # Text recall (always include if query_text exists) |
| 172 | 198 | if query_text: |
| 173 | - if query_node and query_node.operator != 'TERM': | |
| 174 | - # Complex boolean query | |
| 175 | - text_query = self._build_boolean_query(query_node) | |
| 176 | - else: | |
| 177 | - # Simple text query - use advanced should-based multi-query strategy | |
| 178 | - text_query = self._build_advanced_text_query(query_text, parsed_query) | |
| 199 | + # Unified text query strategy | |
| 200 | + text_query = self._build_advanced_text_query(query_text, parsed_query) | |
| 179 | 201 | recall_clauses.append(text_query) |
| 180 | 202 | |
| 181 | 203 | # Embedding recall (KNN - separate from query, handled below) |
| ... | ... | @@ -379,50 +401,49 @@ class ESQueryBuilder: |
| 379 | 401 | } |
| 380 | 402 | } |
| 381 | 403 | |
| 404 | + def _format_field_with_boost(self, field_name: str, boost: float) -> str: | |
| 405 | + if abs(float(boost) - 1.0) < 1e-9: | |
| 406 | + return field_name | |
| 407 | + return f"{field_name}^{boost}" | |
| 408 | + | |
| 409 | + def _get_field_boost(self, base_field: str, language: Optional[str] = None) -> float: | |
| 410 | + # Language-specific override first (e.g. title.de), then base field (e.g. title) | |
| 411 | + if language: | |
| 412 | + lang_key = f"{base_field}.{language}" | |
| 413 | + if lang_key in self.field_boosts: | |
| 414 | + return float(self.field_boosts[lang_key]) | |
| 415 | + if base_field in self.field_boosts: | |
| 416 | + return float(self.field_boosts[base_field]) | |
| 417 | + return 1.0 | |
| 418 | + | |
| 382 | 419 | def _get_match_fields(self, language: str) -> Tuple[List[str], List[str]]: |
| 383 | 420 | """ |
| 384 | - Get match fields for a specific language. | |
| 421 | + Build dynamic match fields for one language. | |
| 385 | 422 | |
| 386 | 423 | Args: |
| 387 | - language: Language code ('zh' or 'en') | |
| 424 | + language: Language code (e.g. zh/en/de/fr/...) | |
| 388 | 425 | |
| 389 | 426 | Returns: |
| 390 | 427 | (all_fields, core_fields) - core_fields are for phrase/keyword queries |
| 391 | 428 | """ |
| 392 | - if language == 'zh': | |
| 393 | - all_fields = [ | |
| 394 | - "title.zh^3.0", | |
| 395 | - "brief.zh^1.5", | |
| 396 | - "description.zh", | |
| 397 | - "vendor.zh^1.5", | |
| 398 | - "tags", | |
| 399 | - "category_path.zh^1.5", | |
| 400 | - "category_name_text.zh^1.5", | |
| 401 | - "option1_values^0.5" | |
| 402 | - ] | |
| 403 | - core_fields = [ | |
| 404 | - "title.zh^3.0", | |
| 405 | - "brief.zh^1.5", | |
| 406 | - "vendor.zh^1.5", | |
| 407 | - "category_name_text.zh^1.5" | |
| 408 | - ] | |
| 409 | - else: # en | |
| 410 | - all_fields = [ | |
| 411 | - "title.en^3.0", | |
| 412 | - "brief.en^1.5", | |
| 413 | - "description.en", | |
| 414 | - "vendor.en^1.5", | |
| 415 | - "tags", | |
| 416 | - "category_path.en^1.5", | |
| 417 | - "category_name_text.en^1.5", | |
| 418 | - "option1_values^0.5" | |
| 419 | - ] | |
| 420 | - core_fields = [ | |
| 421 | - "title.en^3.0", | |
| 422 | - "brief.en^1.5", | |
| 423 | - "vendor.en^1.5", | |
| 424 | - "category_name_text.en^1.5" | |
| 425 | - ] | |
| 429 | + lang = (language or "").strip().lower() | |
| 430 | + all_fields: List[str] = [] | |
| 431 | + core_fields: List[str] = [] | |
| 432 | + | |
| 433 | + for base in self.multilingual_fields: | |
| 434 | + field = f"{base}.{lang}" | |
| 435 | + boost = self._get_field_boost(base, lang) | |
| 436 | + all_fields.append(self._format_field_with_boost(field, boost)) | |
| 437 | + | |
| 438 | + for shared in self.shared_fields: | |
| 439 | + boost = self._get_field_boost(shared, None) | |
| 440 | + all_fields.append(self._format_field_with_boost(shared, boost)) | |
| 441 | + | |
| 442 | + for base in self.core_multilingual_fields: | |
| 443 | + field = f"{base}.{lang}" | |
| 444 | + boost = self._get_field_boost(base, lang) | |
| 445 | + core_fields.append(self._format_field_with_boost(field, boost)) | |
| 446 | + | |
| 426 | 447 | return all_fields, core_fields |
| 427 | 448 | |
| 428 | 449 | def _get_embedding_field(self, language: str) -> str: |
| ... | ... | @@ -434,9 +455,9 @@ class ESQueryBuilder: |
| 434 | 455 | """ |
| 435 | 456 | Build advanced text query using should clauses with multiple query strategies. |
| 436 | 457 | |
| 437 | - Reference implementation: | |
| 438 | - - base_query: main query with AND operator and 75% minimum_should_match | |
| 439 | - - translation queries: lower boost (0.4) for other languages | |
| 458 | + Unified implementation: | |
| 459 | + - base_query: source-language clause | |
| 460 | + - translation queries: target-language clauses from search_langs/query_text_by_lang | |
| 440 | 461 | - phrase query: for short queries (2+ tokens) |
| 441 | 462 | - keywords query: extracted nouns from query |
| 442 | 463 | - KNN query: added separately in build_query |
| ... | ... | @@ -451,94 +472,89 @@ class ESQueryBuilder: |
| 451 | 472 | should_clauses = [] |
| 452 | 473 | |
| 453 | 474 | # Get query analysis from parsed_query |
| 454 | - translations = {} | |
| 455 | - language = self.default_language | |
| 475 | + query_text_by_lang: Dict[str, str] = {} | |
| 476 | + search_langs: List[str] = [] | |
| 477 | + source_lang = self.default_language | |
| 478 | + source_in_index_languages = True | |
| 456 | 479 | keywords = "" |
| 457 | 480 | query_tokens = [] |
| 458 | 481 | token_count = 0 |
| 459 | 482 | |
| 460 | 483 | if parsed_query: |
| 461 | - translations = parsed_query.translations or {} | |
| 462 | - # Use default language if detected_language is None or "unknown" | |
| 463 | - detected_lang = parsed_query.detected_language | |
| 464 | - if not detected_lang or detected_lang == "unknown": | |
| 465 | - language = self.default_language | |
| 466 | - else: | |
| 467 | - language = detected_lang | |
| 484 | + query_text_by_lang = getattr(parsed_query, "query_text_by_lang", None) or {} | |
| 485 | + search_langs = getattr(parsed_query, "search_langs", None) or [] | |
| 486 | + detected_lang = getattr(parsed_query, "detected_language", None) | |
| 487 | + source_lang = detected_lang if detected_lang and detected_lang != "unknown" else self.default_language | |
| 488 | + source_in_index_languages = bool( | |
| 489 | + getattr(parsed_query, "source_in_index_languages", True) | |
| 490 | + ) | |
| 468 | 491 | keywords = getattr(parsed_query, 'keywords', '') or "" |
| 469 | 492 | query_tokens = getattr(parsed_query, 'query_tokens', None) or [] |
| 470 | 493 | token_count = len(query_tokens) or getattr(parsed_query, 'token_count', 0) or 0 |
| 471 | 494 | |
| 472 | - # Get match fields for the detected language | |
| 473 | - match_fields, core_fields = self._get_match_fields(language) | |
| 474 | - | |
| 475 | - # Tie breaker values | |
| 476 | - tie_breaker_base_query = 0.9 | |
| 477 | - tie_breaker_keywords = 0.9 | |
| 478 | - | |
| 479 | - # 1. Base query - main query with AND operator | |
| 480 | - should_clauses.append({ | |
| 481 | - "multi_match": { | |
| 482 | - "_name": "base_query", | |
| 483 | - "fields": match_fields, | |
| 484 | - "minimum_should_match": "75%", | |
| 485 | - # "operator": "AND", | |
| 486 | - "query": query_text, | |
| 487 | - "tie_breaker": tie_breaker_base_query | |
| 488 | - } | |
| 489 | - }) | |
| 490 | - | |
| 491 | - # 2. Translation queries - lower boost (0.4) for other languages (multi-language search always on) | |
| 492 | - if language != 'zh' and translations.get('zh'): | |
| 493 | - zh_fields, _ = self._get_match_fields('zh') | |
| 494 | - should_clauses.append({ | |
| 495 | - "multi_match": { | |
| 496 | - "query": translations['zh'], | |
| 497 | - "fields": zh_fields, | |
| 498 | - "minimum_should_match": "75%", | |
| 499 | - "tie_breaker": tie_breaker_base_query, | |
| 500 | - "boost": 0.4, | |
| 501 | - "_name": "base_query_trans_zh" | |
| 502 | - } | |
| 503 | - }) | |
| 504 | - if language != 'en' and translations.get('en'): | |
| 505 | - en_fields, _ = self._get_match_fields('en') | |
| 506 | - should_clauses.append({ | |
| 507 | - "multi_match": { | |
| 508 | - "query": translations['en'], | |
| 509 | - "fields": en_fields, | |
| 510 | - "minimum_should_match": "75%", | |
| 511 | - "tie_breaker": tie_breaker_base_query, | |
| 512 | - "boost": 0.4, | |
| 513 | - "_name": "base_query_trans_en" | |
| 514 | - } | |
| 515 | - }) | |
| 495 | + if not query_text_by_lang: | |
| 496 | + query_text_by_lang = {source_lang: query_text} | |
| 497 | + if source_lang not in query_text_by_lang and query_text: | |
| 498 | + query_text_by_lang[source_lang] = query_text | |
| 499 | + if not search_langs: | |
| 500 | + search_langs = list(query_text_by_lang.keys()) | |
| 501 | + | |
| 502 | + # Core fields for phrase/keyword based on source language. | |
| 503 | + _, core_fields = self._get_match_fields(source_lang) | |
| 504 | + if not core_fields and search_langs: | |
| 505 | + _, core_fields = self._get_match_fields(search_langs[0]) | |
| 506 | + | |
| 507 | + # Base + translated clauses based on language plan. | |
| 508 | + for lang in search_langs: | |
| 509 | + lang_query = query_text_by_lang.get(lang) | |
| 510 | + if not lang_query: | |
| 511 | + continue | |
| 512 | + match_fields, _ = self._get_match_fields(lang) | |
| 513 | + if not match_fields: | |
| 514 | + continue | |
| 516 | 515 | |
| 517 | - if False and is_long_query: | |
| 518 | - boost = 0.5 * pow(min(1.0, token_count / 10.0), 0.9) | |
| 519 | - minimum_should_match = "70%" | |
| 520 | - should_clauses.append({ | |
| 516 | + is_source = (lang == source_lang) | |
| 517 | + clause_boost = 1.0 | |
| 518 | + clause_name = "base_query" if is_source else f"base_query_trans_{lang}" | |
| 519 | + minimum_should_match = ( | |
| 520 | + self.base_minimum_should_match if is_source else self.translation_minimum_should_match | |
| 521 | + ) | |
| 522 | + if is_source and not source_in_index_languages: | |
| 523 | + clause_boost = self.source_boost_when_missing | |
| 524 | + elif not is_source: | |
| 525 | + clause_boost = ( | |
| 526 | + self.translation_boost | |
| 527 | + if source_in_index_languages | |
| 528 | + else self.translation_boost_when_source_missing | |
| 529 | + ) | |
| 530 | + | |
| 531 | + clause = { | |
| 521 | 532 | "multi_match": { |
| 522 | - "query": query_text, | |
| 533 | + "_name": clause_name, | |
| 523 | 534 | "fields": match_fields, |
| 524 | 535 | "minimum_should_match": minimum_should_match, |
| 525 | - "boost": boost, | |
| 526 | - "tie_breaker": tie_breaker_long_query, | |
| 527 | - "_name": "long_query" | |
| 536 | + "query": lang_query, | |
| 537 | + "tie_breaker": self.tie_breaker_base_query, | |
| 528 | 538 | } |
| 539 | + } | |
| 540 | + if abs(clause_boost - 1.0) > 1e-9: | |
| 541 | + clause["multi_match"]["boost"] = clause_boost | |
| 542 | + should_clauses.append({ | |
| 543 | + "multi_match": clause["multi_match"] | |
| 529 | 544 | }) |
| 530 | 545 | |
| 531 | 546 | # 3. Short query - add phrase query (derived from query_tokens) |
| 532 | 547 | # is_short: quoted or ((token_count <= 2 or len <= 4) and no space) |
| 533 | - ENABLE_PHRASE_QUERY = True | |
| 548 | + source_query_text = query_text_by_lang.get(source_lang) or query_text | |
| 549 | + ENABLE_PHRASE_QUERY = self.enable_phrase_query | |
| 534 | 550 | is_quoted = query_text.startswith('"') and query_text.endswith('"') |
| 535 | 551 | is_short = is_quoted or ((token_count <= 2 or len(query_text) <= 4) and ' ' not in query_text) |
| 536 | - if ENABLE_PHRASE_QUERY and token_count >= 2 and is_short: | |
| 552 | + if ENABLE_PHRASE_QUERY and core_fields and token_count >= 2 and is_short: | |
| 537 | 553 | query_length = len(query_text) |
| 538 | 554 | slop = 0 if query_length < 3 else 1 if query_length < 5 else 2 |
| 539 | 555 | should_clauses.append({ |
| 540 | 556 | "multi_match": { |
| 541 | - "query": query_text, | |
| 557 | + "query": source_query_text, | |
| 542 | 558 | "fields": core_fields, |
| 543 | 559 | "type": "phrase", |
| 544 | 560 | "slop": slop, |
| ... | ... | @@ -548,18 +564,31 @@ class ESQueryBuilder: |
| 548 | 564 | }) |
| 549 | 565 | |
| 550 | 566 | # 4. Keywords query - extracted nouns from query |
| 551 | - elif keywords and len(keywords.split()) <= 2 and 2 * len(keywords.replace(' ', '')) <= len(query_text): | |
| 567 | + elif core_fields and keywords and len(keywords.split()) <= 2 and 2 * len(keywords.replace(' ', '')) <= len(query_text): | |
| 552 | 568 | should_clauses.append({ |
| 553 | 569 | "multi_match": { |
| 554 | 570 | "query": keywords, |
| 555 | 571 | "fields": core_fields, |
| 556 | 572 | # "operator": "AND", |
| 557 | - "tie_breaker": tie_breaker_keywords, | |
| 558 | - "boost": 0.1, | |
| 573 | + "tie_breaker": self.tie_breaker_keywords, | |
| 574 | + "boost": self.keywords_boost, | |
| 559 | 575 | "_name": "keywords_query" |
| 560 | 576 | } |
| 561 | 577 | }) |
| 562 | 578 | |
| 579 | + # Fallback to a simple query when language fields cannot be resolved. | |
| 580 | + if not should_clauses: | |
| 581 | + fallback_fields = self.match_fields or ["title.en^1.0"] | |
| 582 | + return { | |
| 583 | + "multi_match": { | |
| 584 | + "_name": "base_query_fallback", | |
| 585 | + "query": query_text, | |
| 586 | + "fields": fallback_fields, | |
| 587 | + "minimum_should_match": self.base_minimum_should_match, | |
| 588 | + "tie_breaker": self.tie_breaker_base_query, | |
| 589 | + } | |
| 590 | + } | |
| 591 | + | |
| 563 | 592 | # Return bool query with should clauses |
| 564 | 593 | if len(should_clauses) == 1: |
| 565 | 594 | return should_clauses[0] |
| ... | ... | @@ -571,70 +600,6 @@ class ESQueryBuilder: |
| 571 | 600 | } |
| 572 | 601 | } |
| 573 | 602 | |
| 574 | - def _build_boolean_query(self, node: QueryNode) -> Dict[str, Any]: | |
| 575 | - """ | |
| 576 | - Build query from boolean expression tree. | |
| 577 | - | |
| 578 | - Args: | |
| 579 | - node: Query tree node | |
| 580 | - | |
| 581 | - Returns: | |
| 582 | - ES query clause | |
| 583 | - """ | |
| 584 | - if node.operator == 'TERM': | |
| 585 | - # Leaf node - simple text query | |
| 586 | - return self._build_text_query(node.value) | |
| 587 | - | |
| 588 | - elif node.operator == 'AND': | |
| 589 | - # All terms must match | |
| 590 | - return { | |
| 591 | - "bool": { | |
| 592 | - "must": [ | |
| 593 | - self._build_boolean_query(term) | |
| 594 | - for term in node.terms | |
| 595 | - ] | |
| 596 | - } | |
| 597 | - } | |
| 598 | - | |
| 599 | - elif node.operator == 'OR': | |
| 600 | - # Any term must match | |
| 601 | - return { | |
| 602 | - "bool": { | |
| 603 | - "should": [ | |
| 604 | - self._build_boolean_query(term) | |
| 605 | - for term in node.terms | |
| 606 | - ], | |
| 607 | - "minimum_should_match": 1 | |
| 608 | - } | |
| 609 | - } | |
| 610 | - | |
| 611 | - elif node.operator == 'ANDNOT': | |
| 612 | - # First term must match, second must not | |
| 613 | - if len(node.terms) >= 2: | |
| 614 | - return { | |
| 615 | - "bool": { | |
| 616 | - "must": [self._build_boolean_query(node.terms[0])], | |
| 617 | - "must_not": [self._build_boolean_query(node.terms[1])] | |
| 618 | - } | |
| 619 | - } | |
| 620 | - else: | |
| 621 | - return self._build_boolean_query(node.terms[0]) | |
| 622 | - | |
| 623 | - elif node.operator == 'RANK': | |
| 624 | - # Like OR but for ranking (all terms contribute to score) | |
| 625 | - return { | |
| 626 | - "bool": { | |
| 627 | - "should": [ | |
| 628 | - self._build_boolean_query(term) | |
| 629 | - for term in node.terms | |
| 630 | - ] | |
| 631 | - } | |
| 632 | - } | |
| 633 | - | |
| 634 | - else: | |
| 635 | - # Unknown operator | |
| 636 | - return {"match_all": {}} | |
| 637 | - | |
| 638 | 603 | def _build_filters( |
| 639 | 604 | self, |
| 640 | 605 | filters: Optional[Dict[str, Any]] = None, | ... | ... |
search/searcher.py
| 1 | 1 | """ |
| 2 | 2 | Main Searcher module - executes search queries against Elasticsearch. |
| 3 | 3 | |
| 4 | -Handles query parsing, boolean expressions, ranking, and result formatting. | |
| 4 | +Handles query parsing, ranking, and result formatting. | |
| 5 | 5 | """ |
| 6 | 6 | |
| 7 | 7 | from typing import Dict, Any, List, Optional, Union |
| ... | ... | @@ -12,11 +12,9 @@ import logging |
| 12 | 12 | from utils.es_client import ESClient |
| 13 | 13 | from query import QueryParser, ParsedQuery |
| 14 | 14 | from embeddings.image_encoder import CLIPImageEncoder |
| 15 | -from .boolean_parser import BooleanParser, QueryNode | |
| 16 | 15 | from .es_query_builder import ESQueryBuilder |
| 17 | 16 | from config import SearchConfig |
| 18 | 17 | from config.tenant_config_loader import get_tenant_config_loader |
| 19 | -from config.utils import get_match_fields_for_index | |
| 20 | 18 | from context.request_context import RequestContext, RequestContextStage |
| 21 | 19 | from api.models import FacetResult, FacetValue, FacetConfig |
| 22 | 20 | from api.result_formatter import ResultFormatter |
| ... | ... | @@ -73,7 +71,7 @@ class Searcher: |
| 73 | 71 | |
| 74 | 72 | Handles: |
| 75 | 73 | - Query parsing and translation |
| 76 | - - Boolean expression parsing | |
| 74 | + - Dynamic multi-language text recall planning | |
| 77 | 75 | - ES query building |
| 78 | 76 | - Result ranking and formatting |
| 79 | 77 | """ |
| ... | ... | @@ -98,12 +96,6 @@ class Searcher: |
| 98 | 96 | self.config = config |
| 99 | 97 | # Index name is now generated dynamically per tenant, no longer stored here |
| 100 | 98 | self.query_parser = query_parser or QueryParser(config) |
| 101 | - | |
| 102 | - # Initialize components | |
| 103 | - self.boolean_parser = BooleanParser() | |
| 104 | - | |
| 105 | - # Get match fields from config | |
| 106 | - self.match_fields = get_match_fields_for_index(config, "default") | |
| 107 | 99 | self.text_embedding_field = config.query_config.text_embedding_field or "title_embedding" |
| 108 | 100 | self.image_embedding_field = config.query_config.image_embedding_field |
| 109 | 101 | if self.image_embedding_field and image_encoder is None: |
| ... | ... | @@ -114,13 +106,26 @@ class Searcher: |
| 114 | 106 | |
| 115 | 107 | # Query builder - simplified single-layer architecture |
| 116 | 108 | self.query_builder = ESQueryBuilder( |
| 117 | - match_fields=self.match_fields, | |
| 109 | + match_fields=[], | |
| 110 | + field_boosts=self.config.field_boosts, | |
| 111 | + multilingual_fields=self.config.query_config.multilingual_fields, | |
| 112 | + shared_fields=self.config.query_config.shared_fields, | |
| 113 | + core_multilingual_fields=self.config.query_config.core_multilingual_fields, | |
| 118 | 114 | text_embedding_field=self.text_embedding_field, |
| 119 | 115 | image_embedding_field=self.image_embedding_field, |
| 120 | 116 | source_fields=self.source_fields, |
| 121 | 117 | function_score_config=self.config.function_score, |
| 122 | 118 | default_language=self.config.query_config.default_language, |
| 123 | - knn_boost=self.config.query_config.knn_boost | |
| 119 | + knn_boost=self.config.query_config.knn_boost, | |
| 120 | + base_minimum_should_match=self.config.query_config.base_minimum_should_match, | |
| 121 | + translation_minimum_should_match=self.config.query_config.translation_minimum_should_match, | |
| 122 | + translation_boost=self.config.query_config.translation_boost, | |
| 123 | + translation_boost_when_source_missing=self.config.query_config.translation_boost_when_source_missing, | |
| 124 | + source_boost_when_missing=self.config.query_config.source_boost_when_missing, | |
| 125 | + keywords_boost=self.config.query_config.keywords_boost, | |
| 126 | + enable_phrase_query=self.config.query_config.enable_phrase_query, | |
| 127 | + tie_breaker_base_query=self.config.query_config.tie_breaker_base_query, | |
| 128 | + tie_breaker_keywords=self.config.query_config.tie_breaker_keywords, | |
| 124 | 129 | ) |
| 125 | 130 | |
| 126 | 131 | def _apply_source_filter(self, es_query: Dict[str, Any]) -> None: |
| ... | ... | @@ -250,7 +255,7 @@ class Searcher: |
| 250 | 255 | translations=parsed_query.translations, |
| 251 | 256 | query_vector=parsed_query.query_vector.tolist() if parsed_query.query_vector is not None else None, |
| 252 | 257 | domain=parsed_query.domain, |
| 253 | - is_simple_query=self.boolean_parser.is_simple_query(parsed_query.rewritten_query) | |
| 258 | + is_simple_query=True | |
| 254 | 259 | ) |
| 255 | 260 | |
| 256 | 261 | context.logger.info( |
| ... | ... | @@ -271,38 +276,7 @@ class Searcher: |
| 271 | 276 | finally: |
| 272 | 277 | context.end_stage(RequestContextStage.QUERY_PARSING) |
| 273 | 278 | |
| 274 | - # Step 2: Boolean parsing | |
| 275 | - context.start_stage(RequestContextStage.BOOLEAN_PARSING) | |
| 276 | - try: | |
| 277 | - query_node = None | |
| 278 | - if self.boolean_parser.is_simple_query(parsed_query.rewritten_query): | |
| 279 | - # Simple query | |
| 280 | - query_text = parsed_query.rewritten_query | |
| 281 | - context.logger.debug( | |
| 282 | - f"简单查询 | 无布尔表达式", | |
| 283 | - extra={'reqid': context.reqid, 'uid': context.uid} | |
| 284 | - ) | |
| 285 | - else: | |
| 286 | - # Complex boolean query | |
| 287 | - query_node = self.boolean_parser.parse(parsed_query.rewritten_query) | |
| 288 | - query_text = parsed_query.rewritten_query | |
| 289 | - context.store_intermediate_result('query_node', query_node) | |
| 290 | - context.store_intermediate_result('boolean_ast', str(query_node)) | |
| 291 | - context.logger.info( | |
| 292 | - f"布尔表达式解析 | AST: {query_node}", | |
| 293 | - extra={'reqid': context.reqid, 'uid': context.uid} | |
| 294 | - ) | |
| 295 | - except Exception as e: | |
| 296 | - context.set_error(e) | |
| 297 | - context.logger.error( | |
| 298 | - f"布尔表达式解析失败 | 错误: {str(e)}", | |
| 299 | - extra={'reqid': context.reqid, 'uid': context.uid} | |
| 300 | - ) | |
| 301 | - raise | |
| 302 | - finally: | |
| 303 | - context.end_stage(RequestContextStage.BOOLEAN_PARSING) | |
| 304 | - | |
| 305 | - # Step 3: Query building | |
| 279 | + # Step 2: Query building | |
| 306 | 280 | context.start_stage(RequestContextStage.QUERY_BUILDING) |
| 307 | 281 | try: |
| 308 | 282 | # Generate tenant-specific index name |
| ... | ... | @@ -314,7 +288,6 @@ class Searcher: |
| 314 | 288 | es_query = self.query_builder.build_query( |
| 315 | 289 | query_text=parsed_query.rewritten_query or parsed_query.query_normalized, |
| 316 | 290 | query_vector=parsed_query.query_vector if enable_embedding else None, |
| 317 | - query_node=query_node, | |
| 318 | 291 | filters=filters, |
| 319 | 292 | range_filters=range_filters, |
| 320 | 293 | facet_configs=facets, |
| ... | ... | @@ -529,7 +502,6 @@ class Searcher: |
| 529 | 502 | "translations": context.query_analysis.translations, |
| 530 | 503 | "has_vector": context.query_analysis.query_vector is not None, |
| 531 | 504 | "is_simple_query": context.query_analysis.is_simple_query, |
| 532 | - "boolean_ast": context.get_intermediate_result('boolean_ast'), | |
| 533 | 505 | "domain": context.query_analysis.domain |
| 534 | 506 | }, |
| 535 | 507 | "es_query": context.get_intermediate_result('es_query', {}), |
| ... | ... | @@ -666,12 +638,18 @@ class Searcher: |
| 666 | 638 | |
| 667 | 639 | def get_domain_summary(self) -> Dict[str, Any]: |
| 668 | 640 | """ |
| 669 | - Get summary of all configured domains. | |
| 641 | + Get summary of dynamic text retrieval configuration. | |
| 670 | 642 | |
| 671 | 643 | Returns: |
| 672 | - Dictionary with domain information | |
| 644 | + Dictionary with language-aware field information | |
| 673 | 645 | """ |
| 674 | - return self.query_builder.get_domain_summary() | |
| 646 | + return { | |
| 647 | + "mode": "dynamic_language_fields", | |
| 648 | + "multilingual_fields": self.config.query_config.multilingual_fields, | |
| 649 | + "shared_fields": self.config.query_config.shared_fields, | |
| 650 | + "core_multilingual_fields": self.config.query_config.core_multilingual_fields, | |
| 651 | + "field_boosts": self.config.field_boosts, | |
| 652 | + } | |
| 675 | 653 | |
| 676 | 654 | def get_document(self, tenant_id: str, doc_id: str) -> Optional[Dict[str, Any]]: |
| 677 | 655 | """ | ... | ... |
suggestion/service.py
| ... | ... | @@ -33,6 +33,68 @@ class SuggestionService: |
| 33 | 33 | return primary |
| 34 | 34 | return index_languages[0] |
| 35 | 35 | |
| 36 | + def _completion_suggest( | |
| 37 | + self, | |
| 38 | + index_name: str, | |
| 39 | + query: str, | |
| 40 | + lang: str, | |
| 41 | + size: int, | |
| 42 | + ) -> List[Dict[str, Any]]: | |
| 43 | + """ | |
| 44 | + Query ES completion suggester from `completion.<lang>`. | |
| 45 | + | |
| 46 | + Returns items in the same shape as search hits -> dicts with "text"/"lang"/"score"/"rank_score"/"sources". | |
| 47 | + """ | |
| 48 | + field_name = f"completion.{lang}" | |
| 49 | + body = { | |
| 50 | + "suggest": { | |
| 51 | + "s": { | |
| 52 | + "prefix": query, | |
| 53 | + "completion": { | |
| 54 | + "field": field_name, | |
| 55 | + "size": size, | |
| 56 | + "skip_duplicates": True, | |
| 57 | + }, | |
| 58 | + } | |
| 59 | + }, | |
| 60 | + "_source": [ | |
| 61 | + "text", | |
| 62 | + "lang", | |
| 63 | + "rank_score", | |
| 64 | + "sources", | |
| 65 | + "lang_source", | |
| 66 | + "lang_confidence", | |
| 67 | + "lang_conflict", | |
| 68 | + ], | |
| 69 | + } | |
| 70 | + try: | |
| 71 | + resp = self.es_client.client.search(index=index_name, body=body) | |
| 72 | + except Exception as e: | |
| 73 | + # completion is an optimization path; never hard-fail the whole endpoint | |
| 74 | + logger.warning("Completion suggest failed for index=%s field=%s: %s", index_name, field_name, e) | |
| 75 | + return [] | |
| 76 | + | |
| 77 | + entries = (resp.get("suggest", {}) or {}).get("s", []) or [] | |
| 78 | + if not entries: | |
| 79 | + return [] | |
| 80 | + options = entries[0].get("options", []) or [] | |
| 81 | + out: List[Dict[str, Any]] = [] | |
| 82 | + for opt in options: | |
| 83 | + src = opt.get("_source", {}) or {} | |
| 84 | + out.append( | |
| 85 | + { | |
| 86 | + "text": src.get("text") or opt.get("text"), | |
| 87 | + "lang": src.get("lang") or lang, | |
| 88 | + "score": opt.get("_score", 0.0), | |
| 89 | + "rank_score": src.get("rank_score"), | |
| 90 | + "sources": src.get("sources", []), | |
| 91 | + "lang_source": src.get("lang_source"), | |
| 92 | + "lang_confidence": src.get("lang_confidence"), | |
| 93 | + "lang_conflict": src.get("lang_conflict", False), | |
| 94 | + } | |
| 95 | + ) | |
| 96 | + return out | |
| 97 | + | |
| 36 | 98 | def _search_products_for_suggestion( |
| 37 | 99 | self, |
| 38 | 100 | tenant_id: str, |
| ... | ... | @@ -95,6 +157,17 @@ class SuggestionService: |
| 95 | 157 | start = time.time() |
| 96 | 158 | resolved_lang = self._resolve_language(tenant_id, language) |
| 97 | 159 | index_name = get_suggestion_index_name(tenant_id) |
| 160 | + if not self.es_client.index_exists(index_name): | |
| 161 | + # On a fresh ES cluster the suggestion index might not be built yet. | |
| 162 | + # Keep endpoint stable for frontend autocomplete: return empty list instead of 500. | |
| 163 | + took_ms = int((time.time() - start) * 1000) | |
| 164 | + return { | |
| 165 | + "query": query, | |
| 166 | + "language": language, | |
| 167 | + "resolved_language": resolved_lang, | |
| 168 | + "suggestions": [], | |
| 169 | + "took_ms": took_ms, | |
| 170 | + } | |
| 98 | 171 | |
| 99 | 172 | sat_field = f"sat.{resolved_lang}" |
| 100 | 173 | dsl = { |
| ... | ... | @@ -139,14 +212,42 @@ class SuggestionService: |
| 139 | 212 | "lang_conflict", |
| 140 | 213 | ], |
| 141 | 214 | } |
| 215 | + # Recall path A: bool_prefix on search_as_you_type | |
| 142 | 216 | es_resp = self.es_client.search(index_name=index_name, body=dsl, size=size, from_=0) |
| 143 | 217 | hits = es_resp.get("hits", {}).get("hits", []) or [] |
| 144 | 218 | |
| 219 | + # Recall path B: completion suggester (optional optimization) | |
| 220 | + completion_items = self._completion_suggest( | |
| 221 | + index_name=index_name, | |
| 222 | + query=query, | |
| 223 | + lang=resolved_lang, | |
| 224 | + size=size, | |
| 225 | + ) | |
| 226 | + | |
| 145 | 227 | suggestions: List[Dict[str, Any]] = [] |
| 228 | + seen_text_norm: set = set() | |
| 229 | + | |
| 230 | + def _norm_text(v: Any) -> str: | |
| 231 | + return str(v or "").strip().lower() | |
| 232 | + | |
| 233 | + # Put completion results first (usually better prefix UX), then fill with sat results. | |
| 234 | + for item in completion_items: | |
| 235 | + text_val = item.get("text") | |
| 236 | + norm = _norm_text(text_val) | |
| 237 | + if not norm or norm in seen_text_norm: | |
| 238 | + continue | |
| 239 | + seen_text_norm.add(norm) | |
| 240 | + suggestions.append(dict(item)) | |
| 241 | + | |
| 146 | 242 | for hit in hits: |
| 147 | 243 | src = hit.get("_source", {}) or {} |
| 244 | + text_val = src.get("text") | |
| 245 | + norm = _norm_text(text_val) | |
| 246 | + if not norm or norm in seen_text_norm: | |
| 247 | + continue | |
| 248 | + seen_text_norm.add(norm) | |
| 148 | 249 | item = { |
| 149 | - "text": src.get("text"), | |
| 250 | + "text": text_val, | |
| 150 | 251 | "lang": src.get("lang"), |
| 151 | 252 | "score": hit.get("_score", 0.0), |
| 152 | 253 | "rank_score": src.get("rank_score"), |
| ... | ... | @@ -173,7 +274,7 @@ class SuggestionService: |
| 173 | 274 | "query": query, |
| 174 | 275 | "language": language, |
| 175 | 276 | "resolved_language": resolved_lang, |
| 176 | - "suggestions": suggestions, | |
| 277 | + "suggestions": suggestions[:size], | |
| 177 | 278 | "took_ms": took_ms, |
| 178 | 279 | } |
| 179 | 280 | ... | ... |