From bd96ceadef76dc327afcd2d07a023f4902d2f9f5 Mon Sep 17 00:00:00 2001 From: tangwang Date: Tue, 10 Mar 2026 16:06:31 +0800 Subject: [PATCH] 1. 动态多语言字段与统一策略配置 - 配置改为“字段基名 + 动态语言后缀”方案,已不再依赖旧 `indexes`。 [config.yaml](/data/saas-search/config/config.yaml#L17) - `search_fields` / `text_query_strategy` 已进入强校验与解析流程。 [config_loader.py](/data/saas-search/config/config_loader.py#L254) --- .env | 4 ++-- api/models.py | 8 ++++---- api/result_formatter.py | 29 ++++++++++++++++++++--------- api/routes/admin.py | 4 +++- api/routes/search.py | 2 +- config/config.yaml | 102 +++++++++++++++++++++++++++++++++++++++++++----------------------------------------------------------- config/config_loader.py | 109 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------- config/utils.py | 84 +++++++++++++++++++++++++++++++++++++----------------------------------------------- docs/DEVELOPER_GUIDE.md | 6 +++--- docs/QUICKSTART.md | 11 ++++++----- docs/搜索API对接指南.md | 55 ++++++------------------------------------------------- docs/搜索API速查表.md | 12 ------------ query/language_detector.py | 229 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------------------------------------------------------------------------------------- query/query_parser.py | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----- query/query_rewriter.py | 6 +++--- search/__init__.py | 3 --- search/boolean_parser.py | 201 --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- search/es_query_builder.py | 329 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- search/searcher.py | 78 ++++++++++++++++++++++++++++-------------------------------------------------- suggestion/service.py | 105 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 20 files changed, 691 insertions(+), 752 deletions(-) delete mode 100644 search/boolean_parser.py diff --git a/.env b/.env index 3aafe7d..3d775d1 100644 --- a/.env +++ b/.env @@ -1,6 +1,6 @@ # Elasticsearch Configuration -ES_HOST=http://localhost:9200 -ES_USERNAME=saas +ES_HOST=http://120.76.41.98:9200 +ES_USERNAME=essa ES_PASSWORD=4hOaLaf41y2VuI8y # Redis Configuration (Optional) - AI 生产 10.200.16.14:6479 diff --git a/api/models.py b/api/models.py index 94028fa..e1e2d2a 100644 --- a/api/models.py +++ b/api/models.py @@ -70,12 +70,12 @@ class SearchRequest(BaseModel): """搜索请求模型(重构版)""" # 基础搜索参数 - query: str = Field(..., description="搜索查询字符串,支持布尔表达式(AND, OR, RANK, ANDNOT)") + query: str = Field(..., description="搜索查询字符串(统一文本检索策略)") size: int = Field(10, ge=1, le=1000, description="返回结果数量") from_: int = Field(0, ge=0, alias="from", description="分页偏移量") - language: Literal["zh", "en"] = Field( - "zh", - description="响应语言:'zh'(中文)或 'en'(英文),用于选择 title/description/vendor 等多语言字段" + language: str = Field( + "en", + description="响应语言代码(如 zh/en/de/fr/ar/ru),用于多语言字段返回优先级" ) # 过滤器 - 精确匹配和多值匹配 diff --git a/api/result_formatter.py b/api/result_formatter.py index 830d38e..c1d5910 100644 --- a/api/result_formatter.py +++ b/api/result_formatter.py @@ -27,20 +27,32 @@ class ResultFormatter: List of SpuResult objects """ results = [] - lang = (language or "en").lower() - if lang not in ("zh", "en"): - lang = "en" + lang = (language or "en").lower().replace("-", "_") + lang_base = lang.split("_")[0] if lang else "en" def pick_lang_field(src: Dict[str, Any], base: str) -> Optional[str]: """从多语言对象字段中按语言选择一个值:{base: {"zh": "...", "en": "...", ...}}""" obj = src.get(base) if not isinstance(obj, dict): return None - zh_val = obj.get("zh") - en_val = obj.get("en") - if lang == "zh": - return zh_val or en_val - return en_val or zh_val + candidates = [ + lang, + lang_base, + "en", + "zh", + ] + seen = set() + for cand in candidates: + if not cand or cand in seen: + continue + seen.add(cand) + value = obj.get(cand) + if value: + return value + for value in obj.values(): + if value: + return value + return None for hit in es_hits: source = hit.get('_source', {}) @@ -434,4 +446,3 @@ class ResultFormatter: """ # TODO: Implement related search generation logic return [] - diff --git a/api/routes/admin.py b/api/routes/admin.py index e4a015e..2ff27db 100644 --- a/api/routes/admin.py +++ b/api/routes/admin.py @@ -52,7 +52,9 @@ async def get_configuration(): return { "es_index_name": config.es_index_name, "num_field_boosts": len(config.field_boosts), - "num_indexes": len(config.indexes), + "multilingual_fields": config.query_config.multilingual_fields, + "shared_fields": config.query_config.shared_fields, + "core_multilingual_fields": config.query_config.core_multilingual_fields, "supported_languages": config.query_config.supported_languages, "ranking_expression": config.ranking.expression, "spu_enabled": config.spu_config.enabled diff --git a/api/routes/search.py b/api/routes/search.py index eb7de96..1650953 100644 --- a/api/routes/search.py +++ b/api/routes/search.py @@ -37,7 +37,7 @@ async def search(request: SearchRequest, http_request: Request): Supports: - Multi-language query processing - - Boolean operators (AND, OR, RANK, ANDNOT) + - Unified text retrieval strategy (no boolean AST parsing) - Semantic search with embeddings - Custom ranking functions - Exact match filters and range filters diff --git a/config/config.yaml b/config/config.yaml index 2bf6ddc..f55a157 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -12,71 +12,20 @@ es_settings: refresh_interval: "30s" # 字段权重配置(用于搜索时的字段boost) -# 只配置权重,不配置字段结构(字段结构由 mappings/search_products.json 定义) +# 统一按“字段基名”配置;查询时按 search_langs 动态拼接 .{lang}。 +# 若需要按某个语言单独调权,也可以加显式 key(例如 title.de: 3.2)。 field_boosts: - # 文本相关性字段 - "title.zh": 3.0 - "brief.zh": 1.5 - "description.zh": 1.0 - "vendor.zh": 1.5 - "title.en": 3.0 - "brief.en": 1.5 - "description.en": 1.0 - "vendor.en": 1.5 - - # 分类相关字段 - "category_path.zh": 1.5 - "category_name_text.zh": 1.5 - "category_path.en": 1.5 - "category_name_text.en": 1.5 - - # 标签和属性值字段 + title: 3.0 + brief: 1.5 + description: 1.0 + vendor: 1.5 + category_path: 1.5 + category_name_text: 1.5 tags: 1.0 option1_values: 0.5 option2_values: 0.5 option3_values: 0.5 -# 搜索域配置(Query Domains) -# 定义不同的搜索策略,指定哪些字段组合在一起搜索 -indexes: - - name: "default" - label: "默认搜索" - fields: - - "title.zh" - - "brief.zh" - - "description.zh" - - "vendor.zh" - - "tags" - - "category_path.zh" - - "category_name_text.zh" - - "option1_values" - boost: 1.0 - - - name: "title" - label: "标题搜索" - fields: - - "title.zh" - boost: 2.0 - - - name: "vendor" - label: "品牌搜索" - fields: - - "vendor.zh" - boost: 1.5 - - - name: "category" - label: "类目搜索" - fields: - - "category_path.zh" - - "category_name_text.zh" - boost: 1.5 - - - name: "tags" - label: "标签搜索" - fields: - - "tags" - boost: 1.0 - # Query Configuration(查询配置) query_config: # 支持的语言 @@ -89,6 +38,41 @@ query_config: enable_text_embedding: true enable_query_rewrite: true + # 动态多语言检索字段配置 + # multilingual_fields 会被拼成 title.{lang}/brief.{lang}/... 形式; + # shared_fields 为无语言后缀字段。 + search_fields: + multilingual_fields: + - "title" + - "brief" + - "description" + - "vendor" + - "category_path" + - "category_name_text" + shared_fields: + - "tags" + - "option1_values" + - "option2_values" + - "option3_values" + core_multilingual_fields: + - "title" + - "brief" + - "vendor" + - "category_name_text" + + # 统一文本召回策略(主查询 + 翻译查询 + phrase/keywords) + text_query_strategy: + base_minimum_should_match: "75%" + translation_minimum_should_match: "75%" + translation_boost: 0.4 + translation_boost_when_source_missing: 1.0 + source_boost_when_missing: 0.6 + original_query_fallback_boost_when_translation_missing: 0.2 + keywords_boost: 0.1 + enable_phrase_query: true + tie_breaker_base_query: 0.9 + tie_breaker_keywords: 0.9 + # Embedding字段名称 text_embedding_field: "title_embedding" image_embedding_field: null diff --git a/config/config_loader.py b/config/config_loader.py index b53f8e1..fbd5aad 100644 --- a/config/config_loader.py +++ b/config/config_loader.py @@ -57,6 +57,28 @@ class QueryConfig: # KNN boost configuration knn_boost: float = 0.25 # Boost value for KNN (embedding recall) + + # Dynamic text fields for multi-language retrieval + multilingual_fields: List[str] = field( + default_factory=lambda: ["title", "brief", "description", "vendor", "category_path", "category_name_text"] + ) + shared_fields: List[str] = field( + default_factory=lambda: ["tags", "option1_values", "option2_values", "option3_values"] + ) + core_multilingual_fields: List[str] = field( + default_factory=lambda: ["title", "brief", "vendor", "category_name_text"] + ) + + # Unified text strategy tuning + base_minimum_should_match: str = "75%" + translation_minimum_should_match: str = "75%" + translation_boost: float = 0.4 + translation_boost_when_source_missing: float = 1.0 + source_boost_when_missing: float = 0.6 + keywords_boost: float = 0.1 + enable_phrase_query: bool = True + tie_breaker_base_query: float = 0.9 + tie_breaker_keywords: float = 0.9 @dataclass @@ -102,7 +124,7 @@ class SearchConfig: # 字段权重配置(用于搜索) field_boosts: Dict[str, float] - # Index structure (query domains) + # Legacy index domains (deprecated; kept for compatibility) indexes: List[IndexConfig] # Query processing @@ -218,7 +240,7 @@ class ConfigLoader: if not isinstance(field_boosts, dict): raise ConfigurationError("field_boosts must be a dictionary") - # Parse indexes + # Parse indexes (deprecated; compatibility only) indexes = [] for index_data in config_data.get("indexes", []): indexes.append(self._parse_index_config(index_data)) @@ -228,6 +250,8 @@ class ConfigLoader: services_data = config_data.get("services", {}) if isinstance(config_data.get("services", {}), dict) else {} rewrite_dictionary = self._load_rewrite_dictionary() embedding_thresholds = query_config_data.get("embedding_disable_thresholds", {}) + search_fields_cfg = query_config_data.get("search_fields", {}) + text_strategy_cfg = query_config_data.get("text_query_strategy", {}) query_config = QueryConfig( supported_languages=query_config_data.get("supported_languages") or ["zh", "en"], @@ -245,7 +269,30 @@ class ConfigLoader: embedding_disable_chinese_char_limit=embedding_thresholds.get("chinese_char_limit", 4), embedding_disable_english_word_limit=embedding_thresholds.get("english_word_limit", 3), source_fields=query_config_data.get("source_fields"), - knn_boost=query_config_data.get("knn_boost", 0.25) + knn_boost=query_config_data.get("knn_boost", 0.25), + multilingual_fields=search_fields_cfg.get( + "multilingual_fields", + ["title", "brief", "description", "vendor", "category_path", "category_name_text"], + ), + shared_fields=search_fields_cfg.get( + "shared_fields", + ["tags", "option1_values", "option2_values", "option3_values"], + ), + core_multilingual_fields=search_fields_cfg.get( + "core_multilingual_fields", + ["title", "brief", "vendor", "category_name_text"], + ), + base_minimum_should_match=str(text_strategy_cfg.get("base_minimum_should_match", "75%")), + translation_minimum_should_match=str(text_strategy_cfg.get("translation_minimum_should_match", "75%")), + translation_boost=float(text_strategy_cfg.get("translation_boost", 0.4)), + translation_boost_when_source_missing=float( + text_strategy_cfg.get("translation_boost_when_source_missing", 1.0) + ), + source_boost_when_missing=float(text_strategy_cfg.get("source_boost_when_missing", 0.6)), + keywords_boost=float(text_strategy_cfg.get("keywords_boost", 0.1)), + enable_phrase_query=bool(text_strategy_cfg.get("enable_phrase_query", True)), + tie_breaker_base_query=float(text_strategy_cfg.get("tie_breaker_base_query", 0.9)), + tie_breaker_keywords=float(text_strategy_cfg.get("tie_breaker_keywords", 0.9)), ) # Parse ranking config @@ -336,10 +383,7 @@ class ConfigLoader: elif boost < 0: errors.append(f"field_boosts['{field_name}']: boost must be non-negative") - # Validate indexes - if not config.indexes: - errors.append("At least one index domain must be defined") - + # Validate indexes (deprecated, optional) index_names = set() for index in config.indexes: # Check for duplicate index names @@ -365,6 +409,39 @@ class ConfigLoader: f"Default language '{config.query_config.default_language}' " f"not in supported languages: {config.query_config.supported_languages}" ) + + # Validate dynamic search fields + def _validate_str_list(name: str, values: List[str]) -> None: + if not isinstance(values, list) or not values: + errors.append(f"query_config.{name} must be a non-empty list[str]") + return + for i, val in enumerate(values): + if not isinstance(val, str) or not val.strip(): + errors.append(f"query_config.{name}[{i}] must be a non-empty string") + + _validate_str_list("multilingual_fields", config.query_config.multilingual_fields) + _validate_str_list("shared_fields", config.query_config.shared_fields) + _validate_str_list("core_multilingual_fields", config.query_config.core_multilingual_fields) + + core_set = set(config.query_config.core_multilingual_fields) + multi_set = set(config.query_config.multilingual_fields) + if not core_set.issubset(multi_set): + errors.append("query_config.core_multilingual_fields must be subset of multilingual_fields") + + # Validate text query strategy numbers + for name in ( + "translation_boost", + "translation_boost_when_source_missing", + "source_boost_when_missing", + "keywords_boost", + "tie_breaker_base_query", + "tie_breaker_keywords", + ): + value = getattr(config.query_config, name, None) + if not isinstance(value, (int, float)): + errors.append(f"query_config.{name} must be a number") + elif value < 0: + errors.append(f"query_config.{name} must be non-negative") # Validate source_fields tri-state semantics source_fields = config.query_config.source_fields @@ -409,7 +486,23 @@ class ConfigLoader: "chinese_char_limit": config.query_config.embedding_disable_chinese_char_limit, "english_word_limit": config.query_config.embedding_disable_english_word_limit }, - "source_fields": config.query_config.source_fields + "source_fields": config.query_config.source_fields, + "search_fields": { + "multilingual_fields": config.query_config.multilingual_fields, + "shared_fields": config.query_config.shared_fields, + "core_multilingual_fields": config.query_config.core_multilingual_fields, + }, + "text_query_strategy": { + "base_minimum_should_match": config.query_config.base_minimum_should_match, + "translation_minimum_should_match": config.query_config.translation_minimum_should_match, + "translation_boost": config.query_config.translation_boost, + "translation_boost_when_source_missing": config.query_config.translation_boost_when_source_missing, + "source_boost_when_missing": config.query_config.source_boost_when_missing, + "keywords_boost": config.query_config.keywords_boost, + "enable_phrase_query": config.query_config.enable_phrase_query, + "tie_breaker_base_query": config.query_config.tie_breaker_base_query, + "tie_breaker_keywords": config.query_config.tie_breaker_keywords, + } } return { diff --git a/config/utils.py b/config/utils.py index b0247e0..baa878d 100644 --- a/config/utils.py +++ b/config/utils.py @@ -1,62 +1,52 @@ -""" -Configuration utility functions. - -Helper functions for working with SearchConfig objects. -""" +"""Configuration helper functions for dynamic multi-language search fields.""" from typing import Dict, List from .config_loader import SearchConfig +def _format_field_with_boost(field_name: str, boost: float) -> str: + if abs(float(boost) - 1.0) < 1e-9: + return field_name + return f"{field_name}^{boost}" + + +def _get_boost(config: SearchConfig, base_field: str, language: str = "") -> float: + lang = (language or "").strip().lower() + if lang: + lang_key = f"{base_field}.{lang}" + if lang_key in config.field_boosts: + return float(config.field_boosts[lang_key]) + if base_field in config.field_boosts: + return float(config.field_boosts[base_field]) + return 1.0 + + def get_match_fields_for_index(config: SearchConfig, index_name: str = "default") -> List[str]: """ - Generate match fields list with boost from field_boosts. - - Args: - config: SearchConfig instance - index_name: Name of the index domain (default: "default") - - Returns: - List of field names with boost, e.g., ["title.zh^3.0", "brief.zh^1.5"] + Deprecated compatibility wrapper. + + `indexes` is no longer used by runtime query building. This function now returns + dynamic match fields for the default language based on query_config.search_fields. """ - # Find the index config - index_config = None - for idx in config.indexes: - if idx.name == index_name: - index_config = idx - break - - if not index_config: - return [] - - # Generate match fields with boost - match_fields = [] - for field_name in index_config.fields: - # Get field boost from field_boosts dictionary - field_boost = config.field_boosts.get(field_name, 1.0) - - # Combine index boost and field boost - total_boost = index_config.boost * field_boost - - if total_boost != 1.0: - match_fields.append(f"{field_name}^{total_boost}") - else: - match_fields.append(field_name) - + del index_name + lang = (config.query_config.default_language or "en").strip().lower() + match_fields: List[str] = [] + + for base_field in config.query_config.multilingual_fields: + field_name = f"{base_field}.{lang}" + match_fields.append(_format_field_with_boost(field_name, _get_boost(config, base_field, lang))) + + for shared_field in config.query_config.shared_fields: + match_fields.append(_format_field_with_boost(shared_field, _get_boost(config, shared_field))) + return match_fields def get_domain_fields(config: SearchConfig) -> Dict[str, List[str]]: """ - Generate domain-specific match fields from all index configs. - - Args: - config: SearchConfig instance - + Get dynamic domain fields for compatibility with old diagnostics endpoints. + Returns: - Dictionary mapping domain name to list of match fields + A single `default` domain entry generated from dynamic search_fields. """ - domain_fields = {} - for index_config in config.indexes: - domain_fields[index_config.name] = get_match_fields_for_index(config, index_config.name) - return domain_fields + return {"default": get_match_fields_for_index(config)} diff --git a/docs/DEVELOPER_GUIDE.md b/docs/DEVELOPER_GUIDE.md index a82413c..55df4e5 100644 --- a/docs/DEVELOPER_GUIDE.md +++ b/docs/DEVELOPER_GUIDE.md @@ -105,7 +105,7 @@ MySQL (店匠 SPU/SKU) api/ # FastAPI 应用:搜索路由、管理路由、索引路由(indexer_app) config/ # 配置加载与解析:config.yaml、services、env indexer/ # MySQL → ES 管道:mapping、transformer、bulk、增量、build-docs -query/ # 查询解析:规范化、改写、翻译、embedding 调用、布尔解析 +query/ # 查询解析:规范化、改写、翻译、embedding 调用、语言计划生成 search/ # 搜索执行:多语言查询构建、Searcher、重排客户端、分数融合 embeddings/ # 向量化:服务端(server)、文本/图像后端、协议与配置 reranker/ # 重排:服务端(server)、后端(backends)、配置 @@ -144,7 +144,7 @@ docs/ # 文档(含本指南) ### 4.4 query -- **职责**:查询解析与预处理:规范化、语言检测、改写(词典)、翻译、文本向量化、布尔表达式解析;输出可供 Searcher 使用的结构化查询信息。 +- **职责**:查询解析与预处理:规范化、语言检测、改写(词典)、翻译、文本向量化;输出可供 Searcher 使用的结构化查询信息(含 search_langs 语言计划)。 - **原则**:翻译/向量通过 `providers` 获取,不直接依赖具体服务 URL 或实现;支持按配置关闭翻译/向量(如短查询、typing 场景)。 ### 4.5 search @@ -241,7 +241,7 @@ docs/ # 文档(含本指南) ### 6.1 主配置文件 -- **config/config.yaml**:搜索行为(field_boosts、indexes、query_config、ranking、function_score、rerank 融合参数)、SPU 配置、**services**(翻译/向量/重排的 provider 与 backends)、tenant_config 等。 +- **config/config.yaml**:搜索行为(field_boosts、query_config.search_fields、query_config.text_query_strategy、ranking、function_score、rerank 融合参数)、SPU 配置、**services**(翻译/向量/重排的 provider 与 backends)、tenant_config 等。 - **.env**:敏感信息与部署态变量(DB、ES、Redis、API Key、端口等);不提交敏感值,可提供 `.env.example` 模板。 ### 6.2 services 块结构(能力统一约定) diff --git a/docs/QUICKSTART.md b/docs/QUICKSTART.md index 4c3f159..a3aa0d3 100644 --- a/docs/QUICKSTART.md +++ b/docs/QUICKSTART.md @@ -329,7 +329,7 @@ saas-search 以 MySQL 中的店匠标准表为权威数据源: - **统一索引结构**:所有租户使用同一套 mapping(按租户数据分索引名 + 文档内 `tenant_id` 隔离) - **SPU 级索引**:每个文档是一个 SPU,包含嵌套 `skus`、`specifications` -- **配置文件驱动**:搜索权重、搜索域、重排融合、provider 全在 `config/config.yaml`,不再以“硬编码配置”为主 +- **配置文件驱动**:搜索权重、动态多语言字段、重排融合、provider 全在 `config/config.yaml`,不再以“硬编码配置”为主 ### 2.2 索引结构(Mapping) @@ -338,7 +338,7 @@ saas-search 以 MySQL 中的店匠标准表为权威数据源: 核心字段可分为: - 标识字段:`tenant_id`, `spu_id` -- 多语言文本:`title.zh/en`, `brief.zh/en`, `description.zh/en`, `vendor.zh/en`, `category_path.zh/en`, `category_name_text.zh/en` +- 多语言文本:`title.`, `brief.`, `description.`, `vendor.`, `category_path.`, `category_name_text.` - 类目过滤:`category1_name`, `category2_name`, `category3_name` 等 - 规格/变体:`specifications`(nested)、`skus`(nested) - 价格库存:`min_price`, `max_price`, `total_inventory` 等 @@ -346,8 +346,9 @@ saas-search 以 MySQL 中的店匠标准表为权威数据源: ### 2.3 查询、权重、排序(`config/config.yaml`) -- `field_boosts`:字段权重(如标题、品牌、类目) -- `indexes`:搜索域(default/title/vendor/category/tags) +- `field_boosts`:字段权重(统一按字段基名配置,运行时按 `.{lang}` 动态组装) +- `query_config.search_fields`:动态多语言检索字段(multilingual/shared/core) +- `query_config.text_query_strategy`:文本召回策略参数(minimum_should_match、翻译boost等) - `query_config`:语言、embedding 开关、source_fields、knn_boost、翻译提示词等 - `ranking.expression`:融合表达式(例如 `bm25() + 0.25*text_embedding_relevance()`) - `function_score`:ES 层加权函数 @@ -364,7 +365,7 @@ saas-search 以 MySQL 中的店匠标准表为权威数据源: | 修改项 | 操作 | |--------|------| | 索引结构(mapping) | 修改 `mappings/search_products.json` → `./scripts/create_tenant_index.sh ` → 重新导入 | -| 搜索域/权重/排序/重排 | 修改 `config/config.yaml` 对应块 | +| 搜索字段/权重/排序/重排 | 修改 `config/config.yaml` 对应块 | | provider 与服务 URL | 修改 `config/config.yaml` 的 `services` 块,或用环境变量覆盖 | --- diff --git a/docs/搜索API对接指南.md b/docs/搜索API对接指南.md index e3b8f65..5b2fbb9 100644 --- a/docs/搜索API对接指南.md +++ b/docs/搜索API对接指南.md @@ -18,10 +18,9 @@ - 3.3 [过滤器详解](#33-过滤器详解) - 3.4 [分面配置](#34-分面配置) - 3.5 [SKU筛选维度](#35-sku筛选维度) - - 3.6 [布尔表达式语法](#36-布尔表达式语法) - - 3.7 [搜索建议接口](#37-搜索建议接口) - - 3.8 [即时搜索接口](#38-即时搜索接口) - - 3.9 [获取单个文档](#39-获取单个文档) + - 3.6 [搜索建议接口](#37-搜索建议接口) + - 3.7 [即时搜索接口](#38-即时搜索接口) + - 3.8 [获取单个文档](#39-获取单个文档) 4. [响应格式说明](#响应格式说明) - 4.1 [标准响应结构](#41-标准响应结构) @@ -56,8 +55,7 @@ - 8.3 [分面搜索](#83-分面搜索) - 8.4 [规格过滤与分面](#84-规格过滤与分面) - 8.5 [SKU筛选](#85-sku筛选) - - 8.6 [布尔表达式搜索](#86-布尔表达式搜索) - - 8.7 [分页查询](#87-分页查询) + - 8.6 [分页查询](#87-分页查询) 9. [数据模型](#9-数据模型) - 9.1 [商品字段定义](#91-商品字段定义) @@ -167,7 +165,7 @@ curl -X POST "http://43.166.252.75:6002/search/" \ ### 3.1 接口信息 - **端点**: `POST /search/` -- **描述**: 执行文本搜索查询,支持多语言、布尔表达式、过滤器和分面搜索 +- **描述**: 执行文本搜索查询,支持多语言、过滤器和分面搜索 - **租户标识**:`tenant_id` 通过 HTTP 请求头 **`X-Tenant-ID`** 传递(推荐);也可通过 URL query 参数 **`tenant_id`** 传递。**不要放在请求体中。** **请求示例(推荐)**: @@ -210,7 +208,7 @@ response = requests.post(url, headers=headers, json={"query": "芭比娃娃"}) | 参数 | 类型 | 必填 | 默认值 | 说明 | |------|------|------|--------|------| -| `query` | string | Y | - | 搜索查询字符串,支持布尔表达式(AND, OR, RANK, ANDNOT) | +| `query` | string | Y | - | 搜索查询字符串(统一文本检索策略) | | `size` | integer | N | 10 | 返回结果数量(1-100) | | `from` | integer | N | 0 | 分页偏移量(用于分页) | | `language` | string | N | "zh" | 返回语言:`zh`(中文)或 `en`(英文)。后端会根据此参数选择对应的中英文字段返回 | @@ -544,36 +542,6 @@ response = requests.post(url, headers=headers, json={"query": "芭比娃娃"}) } ``` -### 3.6 布尔表达式语法 - -搜索查询支持布尔表达式,提供更灵活的搜索能力。 - -**支持的操作符**: - -| 操作符 | 描述 | 示例 | -|--------|------|------| -| `AND` | 所有词必须匹配 | `玩具 AND 乐高` | -| `OR` | 任意词匹配 | `芭比 OR 娃娃` | -| `ANDNOT` | 排除特定词 | `玩具 ANDNOT 电动` | -| `RANK` | 排序加权(不强制匹配) | `玩具 RANK 乐高` | -| `()` | 分组 | `玩具 AND (乐高 OR 芭比)` | - -**操作符优先级**(从高到低): -1. `()` - 括号 -2. `ANDNOT` - 排除 -3. `AND` - 与 -4. `OR` - 或 -5. `RANK` - 排序 - -**示例**: -``` -"芭比娃娃" // 简单查询 -"玩具 AND 乐高" // AND 查询 -"芭比 OR 娃娃" // OR 查询 -"玩具 ANDNOT 电动" // 排除查询 -"玩具 AND (乐高 OR 芭比)" // 复杂查询 -``` - ### 3.7 搜索建议接口 - **端点**: `GET /search/suggestions` @@ -2020,17 +1988,6 @@ curl "http://localhost:6006/health" - 每个SPU下,每种颜色只会返回第一个SKU - 如果维度不匹配,返回所有SKU(不进行过滤) -### 8.6 布尔表达式搜索 - -**需求**: 搜索包含"手机"和"智能"的商品,排除"二手" - -```json -{ - "query": "手机 AND 智能 ANDNOT 二手", - "size": 20 -} -``` - ### 8.7 分页查询 **需求**: 获取第2页结果(每页20条) diff --git a/docs/搜索API速查表.md b/docs/搜索API速查表.md index 67f75ec..61179ef 100644 --- a/docs/搜索API速查表.md +++ b/docs/搜索API速查表.md @@ -165,18 +165,6 @@ POST /search/ --- -## 布尔表达式 - -```bash -{ - "query": "玩具 AND (乐高 OR 芭比) ANDNOT 电动" -} -``` - -**操作符优先级**: `()` > `ANDNOT` > `AND` > `OR` > `RANK` - ---- - ## 分页 ```bash diff --git a/query/language_detector.py b/query/language_detector.py index f00ba4f..44fc041 100644 --- a/query/language_detector.py +++ b/query/language_detector.py @@ -1,133 +1,150 @@ """ Language detection utility. -Detects the language of a query string. +Detects language of short e-commerce queries with script checks + lightweight +Latin-language scoring (de/fr/es/it/pt/nl/en). """ -from typing import Optional +from typing import Dict, List import re class LanguageDetector: - """Simple rule-based language detector for common e-commerce languages.""" - - # Unicode ranges for different scripts - CJK_RANGES = [ - (0x4E00, 0x9FFF), # CJK Unified Ideographs - (0x3400, 0x4DBF), # CJK Extension A - (0x20000, 0x2A6DF), # CJK Extension B - (0x3040, 0x309F), # Hiragana - (0x30A0, 0x30FF), # Katakana - ] - - CYRILLIC_RANGE = (0x0400, 0x04FF) - ARABIC_RANGE = (0x0600, 0x06FF) - LATIN_RANGE = (0x0041, 0x007A) + """Rule-based language detector for common e-commerce query languages.""" def __init__(self): - """Initialize language detector.""" - self.chinese_pattern = re.compile(r'[\u4e00-\u9fff]+') - self.russian_pattern = re.compile(r'[\u0400-\u04ff]+') - self.arabic_pattern = re.compile(r'[\u0600-\u06ff]+') - self.japanese_pattern = re.compile(r'[\u3040-\u309f\u30a0-\u30ff]+') + self._re_zh = re.compile(r"[\u4e00-\u9fff]") + self._re_ja_kana = re.compile(r"[\u3040-\u30ff]") + self._re_ko = re.compile(r"[\uac00-\ud7af]") + self._re_ru = re.compile(r"[\u0400-\u04ff]") + self._re_ar = re.compile(r"[\u0600-\u06ff]") + self._re_hi = re.compile(r"[\u0900-\u097f]") + self._re_he = re.compile(r"[\u0590-\u05ff]") + self._re_th = re.compile(r"[\u0e00-\u0e7f]") + self._re_latin_word = re.compile(r"[A-Za-zÀ-ÖØ-öø-ÿ]+") + + # Stopwords + e-commerce terms for Latin-family disambiguation. + self._latin_lexicons: Dict[str, set] = { + "en": { + "the", "and", "for", "with", "new", "women", "men", "kids", + "shoe", "shoes", "dress", "shirt", "jacket", "bag", "wireless", + }, + "de": { + "der", "die", "das", "und", "mit", "für", "damen", "herren", + "kinder", "schuhe", "kleid", "hemd", "jacke", "tasche", + }, + "fr": { + "le", "la", "les", "et", "avec", "pour", "femme", "homme", + "enfant", "chaussures", "robe", "chemise", "veste", "sac", + }, + "es": { + "el", "la", "los", "las", "y", "con", "para", "mujer", "hombre", + "niño", "niña", "zapatos", "vestido", "camisa", "chaqueta", "bolso", + }, + "it": { + "il", "lo", "la", "gli", "le", "e", "con", "per", "donna", "uomo", + "bambino", "scarpe", "abito", "camicia", "giacca", "borsa", + }, + "pt": { + "o", "a", "os", "as", "e", "com", "para", "mulher", "homem", + "criança", "sapatos", "vestido", "camisa", "jaqueta", "bolsa", + }, + "nl": { + "de", "het", "en", "met", "voor", "dames", "heren", "kinderen", + "schoenen", "jurk", "overhemd", "jas", "tas", + }, + } + self._diacritic_weights: Dict[str, Dict[str, int]] = { + "de": {"ä": 3, "ö": 3, "ü": 3, "ß": 4}, + "fr": {"é": 2, "è": 2, "ê": 2, "à": 2, "ç": 2, "ù": 2, "ô": 2}, + "es": {"ñ": 3, "á": 2, "é": 2, "í": 2, "ó": 2, "ú": 2}, + "it": {"à": 2, "è": 2, "é": 2, "ì": 2, "ò": 2, "ù": 2}, + "pt": {"ã": 3, "õ": 3, "ç": 2, "á": 2, "â": 2, "ê": 2, "ô": 2}, + "nl": {"ij": 2}, + } def detect(self, text: str) -> str: """ - Detect language of text. - - Args: - text: Input text + Detect language code for text. - Returns: - Language code: 'zh', 'en', 'ru', 'ar', 'ja', or 'unknown' + Returns one of: zh/en/ru/ar/ja/ko/de/fr/es/it/pt/nl/hi/he/th/unknown """ if not text or not text.strip(): - return 'unknown' - - text = text.strip() - - # Count characters in each script - char_counts = { - 'chinese': 0, - 'russian': 0, - 'arabic': 0, - 'japanese': 0, - 'latin': 0 - } - - for char in text: - code_point = ord(char) - - # Check CJK (Chinese/Japanese) - is_cjk = any(start <= code_point <= end for start, end in self.CJK_RANGES) - if is_cjk: - char_counts['chinese'] += 1 - - # Check Hiragana/Katakana (Japanese) - if 0x3040 <= code_point <= 0x30FF: - char_counts['japanese'] += 1 - - # Check Cyrillic (Russian) - if self.CYRILLIC_RANGE[0] <= code_point <= self.CYRILLIC_RANGE[1]: - char_counts['russian'] += 1 - - # Check Arabic - if self.ARABIC_RANGE[0] <= code_point <= self.ARABIC_RANGE[1]: - char_counts['arabic'] += 1 - - # Check Latin - if (0x0041 <= code_point <= 0x005A) or (0x0061 <= code_point <= 0x007A): - char_counts['latin'] += 1 - - # Determine dominant script - total_chars = sum(char_counts.values()) - if total_chars == 0: - return 'unknown' - - # Calculate percentages - percentages = { - script: count / total_chars - for script, count in char_counts.items() - } - - # Japanese has both Hiragana/Katakana and CJK - if percentages['japanese'] > 0.1: - return 'ja' - - # Russian (Cyrillic) - if percentages['russian'] > 0.5: - return 'ru' - - # Arabic - if percentages['arabic'] > 0.5: - return 'ar' - - # Chinese (CJK without Japanese kana) - if percentages['chinese'] > 0.3: - return 'zh' - - # English/Latin - if percentages['latin'] > 0.5: - return 'en' - - return 'unknown' + return "unknown" + q = text.strip().lower() + + # Script-first detection for non-Latin languages. + if self._re_ja_kana.search(q): + return "ja" + if self._re_ko.search(q): + return "ko" + if self._re_zh.search(q): + return "zh" + if self._re_ru.search(q): + return "ru" + if self._re_ar.search(q): + return "ar" + if self._re_hi.search(q): + return "hi" + if self._re_he.search(q): + return "he" + if self._re_th.search(q): + return "th" + + # Latin-family scoring. + tokens = self._re_latin_word.findall(q) + if not tokens: + return "unknown" + + scores: Dict[str, float] = {k: 0.0 for k in self._latin_lexicons.keys()} + scores["en"] = scores.get("en", 0.0) + token_set = set(tokens) + + # Lexicon matches + for lang, lex in self._latin_lexicons.items(): + overlap = len(token_set & lex) + if overlap: + scores[lang] += overlap * 2.0 + + # Diacritics / orthographic hints + for lang, hints in self._diacritic_weights.items(): + for marker, weight in hints.items(): + if marker in q: + scores[lang] += weight + + # Light suffix hints for common product words + for t in tokens: + if t.endswith("ung") or t.endswith("chen"): + scores["de"] += 0.6 + if t.endswith("ción") or t.endswith("ado") or t.endswith("ada"): + scores["es"] += 0.6 + if t.endswith("zione") or t.endswith("etto") or t.endswith("ella"): + scores["it"] += 0.6 + if t.endswith("ção") or t.endswith("mente"): + scores["pt"] += 0.6 + if t.endswith("ment") or t.endswith("eau"): + scores["fr"] += 0.5 + + # Fallback preference: English for pure Latin short tokens. + scores["en"] += 0.2 + + best_lang = max(scores.items(), key=lambda x: x[1])[0] + best_score = scores[best_lang] + if best_score <= 0: + return "en" + return best_lang def is_chinese(self, text: str) -> bool: - """Check if text is primarily Chinese.""" - return self.detect(text) == 'zh' + return self.detect(text) == "zh" def is_english(self, text: str) -> bool: - """Check if text is primarily English.""" - return self.detect(text) == 'en' + return self.detect(text) == "en" def is_russian(self, text: str) -> bool: - """Check if text is primarily Russian.""" - return self.detect(text) == 'ru' + return self.detect(text) == "ru" def is_arabic(self, text: str) -> bool: - """Check if text is primarily Arabic.""" - return self.detect(text) == 'ar' + return self.detect(text) == "ar" def is_japanese(self, text: str) -> bool: - """Check if text is primarily Japanese.""" - return self.detect(text) == 'ja' + return self.detect(text) == "ja" diff --git a/query/query_parser.py b/query/query_parser.py index 007361b..2c81891 100644 --- a/query/query_parser.py +++ b/query/query_parser.py @@ -37,7 +37,11 @@ class ParsedQuery: domain: str = "default", keywords: str = "", token_count: int = 0, - query_tokens: Optional[List[str]] = None + query_tokens: Optional[List[str]] = None, + query_text_by_lang: Optional[Dict[str, str]] = None, + search_langs: Optional[List[str]] = None, + index_languages: Optional[List[str]] = None, + source_in_index_languages: bool = True, ): self.original_query = original_query self.query_normalized = query_normalized @@ -50,6 +54,10 @@ class ParsedQuery: self.keywords = keywords self.token_count = token_count self.query_tokens = query_tokens or [] + self.query_text_by_lang = query_text_by_lang or {} + self.search_langs = search_langs or [] + self.index_languages = index_languages or [] + self.source_in_index_languages = bool(source_in_index_languages) def to_dict(self) -> Dict[str, Any]: """Convert to dictionary representation.""" @@ -61,6 +69,10 @@ class ParsedQuery: "translations": self.translations, "domain": self.domain } + result["query_text_by_lang"] = self.query_text_by_lang + result["search_langs"] = self.search_langs + result["index_languages"] = self.index_languages + result["source_in_index_languages"] = self.source_in_index_languages return result @@ -253,12 +265,21 @@ class QueryParser: # Stage 4: Translation (with async support and conditional waiting) translations = {} translation_futures = {} + index_langs = ["en", "zh"] try: # 根据租户配置的 index_languages 决定翻译目标语言 from config.tenant_config_loader import get_tenant_config_loader tenant_loader = get_tenant_config_loader() tenant_cfg = tenant_loader.get_tenant_config(tenant_id or "default") - index_langs = tenant_cfg.get("index_languages") or ["en", "zh"] + raw_index_langs = tenant_cfg.get("index_languages") or ["en", "zh"] + index_langs = [] + seen_langs = set() + for lang in raw_index_langs: + norm_lang = str(lang or "").strip().lower() + if not norm_lang or norm_lang in seen_langs: + continue + seen_langs.add(norm_lang) + index_langs.append(norm_lang) target_langs_for_translation = [lang for lang in index_langs if lang != detected_lang] @@ -269,8 +290,12 @@ class QueryParser: # Use e-commerce context for better disambiguation translation_context = self.config.query_config.translation_context # For query translation, we use a general prompt (not language-specific) - query_prompt = self.config.query_config.translation_prompts.get('query_zh') or \ - self.config.query_config.translation_prompts.get('default_zh') + query_prompt = ( + self.config.query_config.translation_prompts.get(f"query_{detected_lang}") + or self.config.query_config.translation_prompts.get("query_en") + or self.config.query_config.translation_prompts.get("default_en") + or self.config.query_config.translation_prompts.get("default_zh") + ) # Determine if we need to wait for translation results # If detected_lang is not in index_languages, we must wait for translation @@ -417,6 +442,33 @@ class QueryParser: # Update translations in context after all are complete if translations and context: context.store_intermediate_result('translations', translations) + + # Build language-scoped query plan: source language + available translations + query_text_by_lang: Dict[str, str] = {} + if query_text: + query_text_by_lang[detected_lang] = query_text + for lang, translated_text in (translations or {}).items(): + if translated_text and str(translated_text).strip(): + query_text_by_lang[str(lang).strip().lower()] = str(translated_text) + + source_in_index_languages = detected_lang in index_langs + ordered_search_langs: List[str] = [] + seen_order = set() + if detected_lang in query_text_by_lang: + ordered_search_langs.append(detected_lang) + seen_order.add(detected_lang) + for lang in index_langs: + if lang in query_text_by_lang and lang not in seen_order: + ordered_search_langs.append(lang) + seen_order.add(lang) + for lang in query_text_by_lang.keys(): + if lang not in seen_order: + ordered_search_langs.append(lang) + seen_order.add(lang) + + if context: + context.store_intermediate_result("search_langs", ordered_search_langs) + context.store_intermediate_result("query_text_by_lang", query_text_by_lang) # Build result result = ParsedQuery( @@ -429,7 +481,11 @@ class QueryParser: domain=domain, keywords=keywords, token_count=token_count, - query_tokens=query_tokens + query_tokens=query_tokens, + query_text_by_lang=query_text_by_lang, + search_langs=ordered_search_langs, + index_languages=index_langs, + source_in_index_languages=source_in_index_languages, ) if context and hasattr(context, 'logger'): diff --git a/query/query_rewriter.py b/query/query_rewriter.py index b460770..0107dd9 100644 --- a/query/query_rewriter.py +++ b/query/query_rewriter.py @@ -19,7 +19,7 @@ class QueryRewriter: Args: rewrite_dict: Dictionary mapping exact query terms to rewrite expressions - e.g., {"芭比": "brand:芭比 OR name:芭比娃娃"} + e.g., {"芭比": "芭比娃娃"} Only full word matches will be rewritten, no partial matching. """ self.rewrite_dict = rewrite_dict or {} @@ -107,13 +107,13 @@ class QueryNormalizer: return query @staticmethod - def remove_punctuation(query: str, keep_operators: bool = True) -> str: + def remove_punctuation(query: str, keep_operators: bool = False) -> str: """ Remove punctuation from query. Args: query: Original query - keep_operators: Whether to keep boolean operators (AND, OR, etc.) + keep_operators: Whether to keep symbols used in old query syntax. Returns: Query without punctuation diff --git a/search/__init__.py b/search/__init__.py index a2bf707..b1fa389 100644 --- a/search/__init__.py +++ b/search/__init__.py @@ -1,12 +1,9 @@ """Search package initialization.""" -from .boolean_parser import BooleanParser, QueryNode from .es_query_builder import ESQueryBuilder from .searcher import Searcher, SearchResult __all__ = [ - 'BooleanParser', - 'QueryNode', 'ESQueryBuilder', 'Searcher', 'SearchResult', diff --git a/search/boolean_parser.py b/search/boolean_parser.py deleted file mode 100644 index 41f4007..0000000 --- a/search/boolean_parser.py +++ /dev/null @@ -1,201 +0,0 @@ -""" -Boolean expression parser for search queries. - -Supports: AND, OR, RANK, ANDNOT operators with parentheses. -Precedence (high to low): (), ANDNOT, AND, OR, RANK -""" - -import re -from typing import List, Tuple, Optional -from dataclasses import dataclass - - -@dataclass -class QueryNode: - """Represents a node in the parsed query tree.""" - operator: str # 'AND', 'OR', 'RANK', 'ANDNOT', 'TERM' - terms: List['QueryNode'] = None # Child nodes for operators - value: str = None # Value for leaf nodes (TERM) - - def __repr__(self): - if self.operator == 'TERM': - return f"TERM({self.value})" - else: - return f"{self.operator}({', '.join(str(t) for t in self.terms)})" - - -class BooleanParser: - """ - Parser for boolean search expressions. - - Operator precedence (high to low): - 1. () - Parentheses - 2. ANDNOT - AND NOT (exclusion) - 3. AND - All terms must match - 4. OR - Any term must match - 5. RANK - Scoring boost (like OR but affects ranking) - """ - - OPERATORS = {'AND', 'OR', 'RANK', 'ANDNOT'} - PRECEDENCE = { - 'ANDNOT': 3, - 'AND': 2, - 'OR': 1, - 'RANK': 0 - } - - def __init__(self): - """Initialize boolean parser.""" - pass - - def parse(self, expression: str) -> QueryNode: - """ - Parse boolean expression into query tree. - - Args: - expression: Boolean expression string - Example: "laptop AND (gaming OR professional) ANDNOT cheap" - - Returns: - Root QueryNode of parsed tree - """ - if not expression or not expression.strip(): - return QueryNode(operator='TERM', value='') - - # Tokenize - tokens = self._tokenize(expression) - - if not tokens: - return QueryNode(operator='TERM', value='') - - # Parse with precedence - return self._parse_expression(tokens) - - def _tokenize(self, expression: str) -> List[str]: - """ - Tokenize expression into terms and operators. - - Args: - expression: Expression string - - Returns: - List of tokens - """ - # Pattern to match: operators, parentheses, or terms (with domain prefix support) - pattern = r'\b(AND|OR|RANK|ANDNOT)\b|[()]|(?:\w+:)?[^\s()]+' - - tokens = [] - for match in re.finditer(pattern, expression): - token = match.group().strip() - if token: - tokens.append(token) - - return tokens - - def _parse_expression(self, tokens: List[str], start: int = 0) -> Tuple[QueryNode, int]: - """ - Parse expression with operator precedence. - - Args: - tokens: List of tokens - start: Starting index - - Returns: - Tuple of (QueryNode, next_index) - """ - # Start with lowest precedence (RANK) - return self._parse_rank(tokens, start) - - def _parse_rank(self, tokens: List[str], start: int) -> Tuple[QueryNode, int]: - """Parse RANK operator (lowest precedence).""" - left, pos = self._parse_or(tokens, start) - - while pos < len(tokens) and tokens[pos] == 'RANK': - pos += 1 # Skip 'RANK' - right, pos = self._parse_or(tokens, pos) - left = QueryNode(operator='RANK', terms=[left, right]) - - return left, pos - - def _parse_or(self, tokens: List[str], start: int) -> Tuple[QueryNode, int]: - """Parse OR operator.""" - left, pos = self._parse_and(tokens, start) - - while pos < len(tokens) and tokens[pos] == 'OR': - pos += 1 # Skip 'OR' - right, pos = self._parse_and(tokens, pos) - left = QueryNode(operator='OR', terms=[left, right]) - - return left, pos - - def _parse_and(self, tokens: List[str], start: int) -> Tuple[QueryNode, int]: - """Parse AND operator.""" - left, pos = self._parse_andnot(tokens, start) - - while pos < len(tokens) and tokens[pos] == 'AND': - pos += 1 # Skip 'AND' - right, pos = self._parse_andnot(tokens, pos) - left = QueryNode(operator='AND', terms=[left, right]) - - return left, pos - - def _parse_andnot(self, tokens: List[str], start: int) -> Tuple[QueryNode, int]: - """Parse ANDNOT operator (highest precedence).""" - left, pos = self._parse_primary(tokens, start) - - while pos < len(tokens) and tokens[pos] == 'ANDNOT': - pos += 1 # Skip 'ANDNOT' - right, pos = self._parse_primary(tokens, pos) - left = QueryNode(operator='ANDNOT', terms=[left, right]) - - return left, pos - - def _parse_primary(self, tokens: List[str], start: int) -> Tuple[QueryNode, int]: - """Parse primary expression (terms or parentheses).""" - if start >= len(tokens): - return QueryNode(operator='TERM', value=''), start - - token = tokens[start] - - # Handle parentheses - if token == '(': - # Find matching closing parenthesis - depth = 1 - pos = start + 1 - while pos < len(tokens) and depth > 0: - if tokens[pos] == '(': - depth += 1 - elif tokens[pos] == ')': - depth -= 1 - pos += 1 - - # Parse contents of parentheses - inner_tokens = tokens[start + 1:pos - 1] - if inner_tokens: - node, _ = self._parse_expression(inner_tokens, 0) - return node, pos - else: - return QueryNode(operator='TERM', value=''), pos - - # Handle term - if token not in self.OPERATORS and token not in ['(', ')']: - return QueryNode(operator='TERM', value=token), start + 1 - - # Unexpected token - return QueryNode(operator='TERM', value=''), start + 1 - - def is_simple_query(self, expression: str) -> bool: - """ - Check if query is simple (no boolean operators). - - Args: - expression: Query expression - - Returns: - True if simple query (no operators) - """ - tokens = self._tokenize(expression) - for token in tokens: - if token in self.OPERATORS: - return False - return True diff --git a/search/es_query_builder.py b/search/es_query_builder.py index c767374..4e9f5cd 100644 --- a/search/es_query_builder.py +++ b/search/es_query_builder.py @@ -10,7 +10,6 @@ Simplified architecture: from typing import Dict, Any, List, Optional, Union, Tuple import numpy as np -from .boolean_parser import QueryNode from config import FunctionScoreConfig @@ -20,18 +19,31 @@ class ESQueryBuilder: def __init__( self, match_fields: List[str], + field_boosts: Optional[Dict[str, float]] = None, + multilingual_fields: Optional[List[str]] = None, + shared_fields: Optional[List[str]] = None, + core_multilingual_fields: Optional[List[str]] = None, text_embedding_field: Optional[str] = None, image_embedding_field: Optional[str] = None, source_fields: Optional[List[str]] = None, function_score_config: Optional[FunctionScoreConfig] = None, default_language: str = "en", - knn_boost: float = 0.25 + knn_boost: float = 0.25, + base_minimum_should_match: str = "75%", + translation_minimum_should_match: str = "75%", + translation_boost: float = 0.4, + translation_boost_when_source_missing: float = 1.0, + source_boost_when_missing: float = 0.6, + keywords_boost: float = 0.1, + enable_phrase_query: bool = True, + tie_breaker_base_query: float = 0.9, + tie_breaker_keywords: float = 0.9, ): """ Initialize query builder. Multi-language search (translation-based cross-language recall) is always enabled: - queries are matched against both detected-language and translated zh/en clauses. + queries are matched against detected-language and translated target-language clauses. Args: match_fields: Fields to search for text matching @@ -43,12 +55,27 @@ class ESQueryBuilder: knn_boost: Boost value for KNN (embedding recall) """ self.match_fields = match_fields + self.field_boosts = field_boosts or {} + self.multilingual_fields = multilingual_fields or [ + "title", "brief", "description", "vendor", "category_path", "category_name_text" + ] + self.shared_fields = shared_fields or ["tags", "option1_values", "option2_values", "option3_values"] + self.core_multilingual_fields = core_multilingual_fields or ["title", "brief", "vendor", "category_name_text"] self.text_embedding_field = text_embedding_field self.image_embedding_field = image_embedding_field self.source_fields = source_fields self.function_score_config = function_score_config self.default_language = default_language self.knn_boost = knn_boost + self.base_minimum_should_match = base_minimum_should_match + self.translation_minimum_should_match = translation_minimum_should_match + self.translation_boost = float(translation_boost) + self.translation_boost_when_source_missing = float(translation_boost_when_source_missing) + self.source_boost_when_missing = float(source_boost_when_missing) + self.keywords_boost = float(keywords_boost) + self.enable_phrase_query = bool(enable_phrase_query) + self.tie_breaker_base_query = float(tie_breaker_base_query) + self.tie_breaker_keywords = float(tie_breaker_keywords) def _apply_source_filter(self, es_query: Dict[str, Any]) -> None: """ @@ -118,7 +145,6 @@ class ESQueryBuilder: self, query_text: str, query_vector: Optional[np.ndarray] = None, - query_node: Optional[QueryNode] = None, filters: Optional[Dict[str, Any]] = None, range_filters: Optional[Dict[str, Any]] = None, facet_configs: Optional[List[Any]] = None, @@ -136,14 +162,13 @@ class ESQueryBuilder: 结构:filters and (text_recall or embedding_recall) + post_filter - conjunctive_filters: 应用在 query.bool.filter(影响结果和聚合) - disjunctive_filters: 应用在 post_filter(只影响结果,不影响聚合) - - text_recall: 文本相关性召回(中英文字段都用) + - text_recall: 文本相关性召回(按 search_langs 动态语言字段) - embedding_recall: 向量召回(KNN) - function_score: 包装召回部分,支持提权字段 Args: query_text: Query text for BM25 matching query_vector: Query embedding for KNN search - query_node: Parsed boolean expression tree filters: Exact match filters range_filters: Range filters for numeric fields (always applied in query) facet_configs: Facet configurations (used to identify multi-select facets) @@ -157,6 +182,7 @@ class ESQueryBuilder: Returns: ES query DSL dictionary """ + # Boolean AST path has been removed; keep a single text strategy. es_query = { "size": size, "from": from_ @@ -170,12 +196,8 @@ class ESQueryBuilder: # Text recall (always include if query_text exists) if query_text: - if query_node and query_node.operator != 'TERM': - # Complex boolean query - text_query = self._build_boolean_query(query_node) - else: - # Simple text query - use advanced should-based multi-query strategy - text_query = self._build_advanced_text_query(query_text, parsed_query) + # Unified text query strategy + text_query = self._build_advanced_text_query(query_text, parsed_query) recall_clauses.append(text_query) # Embedding recall (KNN - separate from query, handled below) @@ -379,50 +401,49 @@ class ESQueryBuilder: } } + def _format_field_with_boost(self, field_name: str, boost: float) -> str: + if abs(float(boost) - 1.0) < 1e-9: + return field_name + return f"{field_name}^{boost}" + + def _get_field_boost(self, base_field: str, language: Optional[str] = None) -> float: + # Language-specific override first (e.g. title.de), then base field (e.g. title) + if language: + lang_key = f"{base_field}.{language}" + if lang_key in self.field_boosts: + return float(self.field_boosts[lang_key]) + if base_field in self.field_boosts: + return float(self.field_boosts[base_field]) + return 1.0 + def _get_match_fields(self, language: str) -> Tuple[List[str], List[str]]: """ - Get match fields for a specific language. + Build dynamic match fields for one language. Args: - language: Language code ('zh' or 'en') + language: Language code (e.g. zh/en/de/fr/...) Returns: (all_fields, core_fields) - core_fields are for phrase/keyword queries """ - if language == 'zh': - all_fields = [ - "title.zh^3.0", - "brief.zh^1.5", - "description.zh", - "vendor.zh^1.5", - "tags", - "category_path.zh^1.5", - "category_name_text.zh^1.5", - "option1_values^0.5" - ] - core_fields = [ - "title.zh^3.0", - "brief.zh^1.5", - "vendor.zh^1.5", - "category_name_text.zh^1.5" - ] - else: # en - all_fields = [ - "title.en^3.0", - "brief.en^1.5", - "description.en", - "vendor.en^1.5", - "tags", - "category_path.en^1.5", - "category_name_text.en^1.5", - "option1_values^0.5" - ] - core_fields = [ - "title.en^3.0", - "brief.en^1.5", - "vendor.en^1.5", - "category_name_text.en^1.5" - ] + lang = (language or "").strip().lower() + all_fields: List[str] = [] + core_fields: List[str] = [] + + for base in self.multilingual_fields: + field = f"{base}.{lang}" + boost = self._get_field_boost(base, lang) + all_fields.append(self._format_field_with_boost(field, boost)) + + for shared in self.shared_fields: + boost = self._get_field_boost(shared, None) + all_fields.append(self._format_field_with_boost(shared, boost)) + + for base in self.core_multilingual_fields: + field = f"{base}.{lang}" + boost = self._get_field_boost(base, lang) + core_fields.append(self._format_field_with_boost(field, boost)) + return all_fields, core_fields def _get_embedding_field(self, language: str) -> str: @@ -434,9 +455,9 @@ class ESQueryBuilder: """ Build advanced text query using should clauses with multiple query strategies. - Reference implementation: - - base_query: main query with AND operator and 75% minimum_should_match - - translation queries: lower boost (0.4) for other languages + Unified implementation: + - base_query: source-language clause + - translation queries: target-language clauses from search_langs/query_text_by_lang - phrase query: for short queries (2+ tokens) - keywords query: extracted nouns from query - KNN query: added separately in build_query @@ -451,94 +472,89 @@ class ESQueryBuilder: should_clauses = [] # Get query analysis from parsed_query - translations = {} - language = self.default_language + query_text_by_lang: Dict[str, str] = {} + search_langs: List[str] = [] + source_lang = self.default_language + source_in_index_languages = True keywords = "" query_tokens = [] token_count = 0 if parsed_query: - translations = parsed_query.translations or {} - # Use default language if detected_language is None or "unknown" - detected_lang = parsed_query.detected_language - if not detected_lang or detected_lang == "unknown": - language = self.default_language - else: - language = detected_lang + query_text_by_lang = getattr(parsed_query, "query_text_by_lang", None) or {} + search_langs = getattr(parsed_query, "search_langs", None) or [] + detected_lang = getattr(parsed_query, "detected_language", None) + source_lang = detected_lang if detected_lang and detected_lang != "unknown" else self.default_language + source_in_index_languages = bool( + getattr(parsed_query, "source_in_index_languages", True) + ) keywords = getattr(parsed_query, 'keywords', '') or "" query_tokens = getattr(parsed_query, 'query_tokens', None) or [] token_count = len(query_tokens) or getattr(parsed_query, 'token_count', 0) or 0 - # Get match fields for the detected language - match_fields, core_fields = self._get_match_fields(language) - - # Tie breaker values - tie_breaker_base_query = 0.9 - tie_breaker_keywords = 0.9 - - # 1. Base query - main query with AND operator - should_clauses.append({ - "multi_match": { - "_name": "base_query", - "fields": match_fields, - "minimum_should_match": "75%", - # "operator": "AND", - "query": query_text, - "tie_breaker": tie_breaker_base_query - } - }) - - # 2. Translation queries - lower boost (0.4) for other languages (multi-language search always on) - if language != 'zh' and translations.get('zh'): - zh_fields, _ = self._get_match_fields('zh') - should_clauses.append({ - "multi_match": { - "query": translations['zh'], - "fields": zh_fields, - "minimum_should_match": "75%", - "tie_breaker": tie_breaker_base_query, - "boost": 0.4, - "_name": "base_query_trans_zh" - } - }) - if language != 'en' and translations.get('en'): - en_fields, _ = self._get_match_fields('en') - should_clauses.append({ - "multi_match": { - "query": translations['en'], - "fields": en_fields, - "minimum_should_match": "75%", - "tie_breaker": tie_breaker_base_query, - "boost": 0.4, - "_name": "base_query_trans_en" - } - }) + if not query_text_by_lang: + query_text_by_lang = {source_lang: query_text} + if source_lang not in query_text_by_lang and query_text: + query_text_by_lang[source_lang] = query_text + if not search_langs: + search_langs = list(query_text_by_lang.keys()) + + # Core fields for phrase/keyword based on source language. + _, core_fields = self._get_match_fields(source_lang) + if not core_fields and search_langs: + _, core_fields = self._get_match_fields(search_langs[0]) + + # Base + translated clauses based on language plan. + for lang in search_langs: + lang_query = query_text_by_lang.get(lang) + if not lang_query: + continue + match_fields, _ = self._get_match_fields(lang) + if not match_fields: + continue - if False and is_long_query: - boost = 0.5 * pow(min(1.0, token_count / 10.0), 0.9) - minimum_should_match = "70%" - should_clauses.append({ + is_source = (lang == source_lang) + clause_boost = 1.0 + clause_name = "base_query" if is_source else f"base_query_trans_{lang}" + minimum_should_match = ( + self.base_minimum_should_match if is_source else self.translation_minimum_should_match + ) + if is_source and not source_in_index_languages: + clause_boost = self.source_boost_when_missing + elif not is_source: + clause_boost = ( + self.translation_boost + if source_in_index_languages + else self.translation_boost_when_source_missing + ) + + clause = { "multi_match": { - "query": query_text, + "_name": clause_name, "fields": match_fields, "minimum_should_match": minimum_should_match, - "boost": boost, - "tie_breaker": tie_breaker_long_query, - "_name": "long_query" + "query": lang_query, + "tie_breaker": self.tie_breaker_base_query, } + } + if abs(clause_boost - 1.0) > 1e-9: + clause["multi_match"]["boost"] = clause_boost + should_clauses.append({ + "multi_match": clause["multi_match"] }) # 3. Short query - add phrase query (derived from query_tokens) # is_short: quoted or ((token_count <= 2 or len <= 4) and no space) - ENABLE_PHRASE_QUERY = True + source_query_text = query_text_by_lang.get(source_lang) or query_text + ENABLE_PHRASE_QUERY = self.enable_phrase_query is_quoted = query_text.startswith('"') and query_text.endswith('"') is_short = is_quoted or ((token_count <= 2 or len(query_text) <= 4) and ' ' not in query_text) - if ENABLE_PHRASE_QUERY and token_count >= 2 and is_short: + if ENABLE_PHRASE_QUERY and core_fields and token_count >= 2 and is_short: query_length = len(query_text) slop = 0 if query_length < 3 else 1 if query_length < 5 else 2 should_clauses.append({ "multi_match": { - "query": query_text, + "query": source_query_text, "fields": core_fields, "type": "phrase", "slop": slop, @@ -548,18 +564,31 @@ class ESQueryBuilder: }) # 4. Keywords query - extracted nouns from query - elif keywords and len(keywords.split()) <= 2 and 2 * len(keywords.replace(' ', '')) <= len(query_text): + elif core_fields and keywords and len(keywords.split()) <= 2 and 2 * len(keywords.replace(' ', '')) <= len(query_text): should_clauses.append({ "multi_match": { "query": keywords, "fields": core_fields, # "operator": "AND", - "tie_breaker": tie_breaker_keywords, - "boost": 0.1, + "tie_breaker": self.tie_breaker_keywords, + "boost": self.keywords_boost, "_name": "keywords_query" } }) + # Fallback to a simple query when language fields cannot be resolved. + if not should_clauses: + fallback_fields = self.match_fields or ["title.en^1.0"] + return { + "multi_match": { + "_name": "base_query_fallback", + "query": query_text, + "fields": fallback_fields, + "minimum_should_match": self.base_minimum_should_match, + "tie_breaker": self.tie_breaker_base_query, + } + } + # Return bool query with should clauses if len(should_clauses) == 1: return should_clauses[0] @@ -571,70 +600,6 @@ class ESQueryBuilder: } } - def _build_boolean_query(self, node: QueryNode) -> Dict[str, Any]: - """ - Build query from boolean expression tree. - - Args: - node: Query tree node - - Returns: - ES query clause - """ - if node.operator == 'TERM': - # Leaf node - simple text query - return self._build_text_query(node.value) - - elif node.operator == 'AND': - # All terms must match - return { - "bool": { - "must": [ - self._build_boolean_query(term) - for term in node.terms - ] - } - } - - elif node.operator == 'OR': - # Any term must match - return { - "bool": { - "should": [ - self._build_boolean_query(term) - for term in node.terms - ], - "minimum_should_match": 1 - } - } - - elif node.operator == 'ANDNOT': - # First term must match, second must not - if len(node.terms) >= 2: - return { - "bool": { - "must": [self._build_boolean_query(node.terms[0])], - "must_not": [self._build_boolean_query(node.terms[1])] - } - } - else: - return self._build_boolean_query(node.terms[0]) - - elif node.operator == 'RANK': - # Like OR but for ranking (all terms contribute to score) - return { - "bool": { - "should": [ - self._build_boolean_query(term) - for term in node.terms - ] - } - } - - else: - # Unknown operator - return {"match_all": {}} - def _build_filters( self, filters: Optional[Dict[str, Any]] = None, diff --git a/search/searcher.py b/search/searcher.py index 4949da9..1d974a4 100644 --- a/search/searcher.py +++ b/search/searcher.py @@ -1,7 +1,7 @@ """ Main Searcher module - executes search queries against Elasticsearch. -Handles query parsing, boolean expressions, ranking, and result formatting. +Handles query parsing, ranking, and result formatting. """ from typing import Dict, Any, List, Optional, Union @@ -12,11 +12,9 @@ import logging from utils.es_client import ESClient from query import QueryParser, ParsedQuery from embeddings.image_encoder import CLIPImageEncoder -from .boolean_parser import BooleanParser, QueryNode from .es_query_builder import ESQueryBuilder from config import SearchConfig from config.tenant_config_loader import get_tenant_config_loader -from config.utils import get_match_fields_for_index from context.request_context import RequestContext, RequestContextStage from api.models import FacetResult, FacetValue, FacetConfig from api.result_formatter import ResultFormatter @@ -73,7 +71,7 @@ class Searcher: Handles: - Query parsing and translation - - Boolean expression parsing + - Dynamic multi-language text recall planning - ES query building - Result ranking and formatting """ @@ -98,12 +96,6 @@ class Searcher: self.config = config # Index name is now generated dynamically per tenant, no longer stored here self.query_parser = query_parser or QueryParser(config) - - # Initialize components - self.boolean_parser = BooleanParser() - - # Get match fields from config - self.match_fields = get_match_fields_for_index(config, "default") self.text_embedding_field = config.query_config.text_embedding_field or "title_embedding" self.image_embedding_field = config.query_config.image_embedding_field if self.image_embedding_field and image_encoder is None: @@ -114,13 +106,26 @@ class Searcher: # Query builder - simplified single-layer architecture self.query_builder = ESQueryBuilder( - match_fields=self.match_fields, + match_fields=[], + field_boosts=self.config.field_boosts, + multilingual_fields=self.config.query_config.multilingual_fields, + shared_fields=self.config.query_config.shared_fields, + core_multilingual_fields=self.config.query_config.core_multilingual_fields, text_embedding_field=self.text_embedding_field, image_embedding_field=self.image_embedding_field, source_fields=self.source_fields, function_score_config=self.config.function_score, default_language=self.config.query_config.default_language, - knn_boost=self.config.query_config.knn_boost + knn_boost=self.config.query_config.knn_boost, + base_minimum_should_match=self.config.query_config.base_minimum_should_match, + translation_minimum_should_match=self.config.query_config.translation_minimum_should_match, + translation_boost=self.config.query_config.translation_boost, + translation_boost_when_source_missing=self.config.query_config.translation_boost_when_source_missing, + source_boost_when_missing=self.config.query_config.source_boost_when_missing, + keywords_boost=self.config.query_config.keywords_boost, + enable_phrase_query=self.config.query_config.enable_phrase_query, + tie_breaker_base_query=self.config.query_config.tie_breaker_base_query, + tie_breaker_keywords=self.config.query_config.tie_breaker_keywords, ) def _apply_source_filter(self, es_query: Dict[str, Any]) -> None: @@ -250,7 +255,7 @@ class Searcher: translations=parsed_query.translations, query_vector=parsed_query.query_vector.tolist() if parsed_query.query_vector is not None else None, domain=parsed_query.domain, - is_simple_query=self.boolean_parser.is_simple_query(parsed_query.rewritten_query) + is_simple_query=True ) context.logger.info( @@ -271,38 +276,7 @@ class Searcher: finally: context.end_stage(RequestContextStage.QUERY_PARSING) - # Step 2: Boolean parsing - context.start_stage(RequestContextStage.BOOLEAN_PARSING) - try: - query_node = None - if self.boolean_parser.is_simple_query(parsed_query.rewritten_query): - # Simple query - query_text = parsed_query.rewritten_query - context.logger.debug( - f"简单查询 | 无布尔表达式", - extra={'reqid': context.reqid, 'uid': context.uid} - ) - else: - # Complex boolean query - query_node = self.boolean_parser.parse(parsed_query.rewritten_query) - query_text = parsed_query.rewritten_query - context.store_intermediate_result('query_node', query_node) - context.store_intermediate_result('boolean_ast', str(query_node)) - context.logger.info( - f"布尔表达式解析 | AST: {query_node}", - extra={'reqid': context.reqid, 'uid': context.uid} - ) - except Exception as e: - context.set_error(e) - context.logger.error( - f"布尔表达式解析失败 | 错误: {str(e)}", - extra={'reqid': context.reqid, 'uid': context.uid} - ) - raise - finally: - context.end_stage(RequestContextStage.BOOLEAN_PARSING) - - # Step 3: Query building + # Step 2: Query building context.start_stage(RequestContextStage.QUERY_BUILDING) try: # Generate tenant-specific index name @@ -314,7 +288,6 @@ class Searcher: es_query = self.query_builder.build_query( query_text=parsed_query.rewritten_query or parsed_query.query_normalized, query_vector=parsed_query.query_vector if enable_embedding else None, - query_node=query_node, filters=filters, range_filters=range_filters, facet_configs=facets, @@ -529,7 +502,6 @@ class Searcher: "translations": context.query_analysis.translations, "has_vector": context.query_analysis.query_vector is not None, "is_simple_query": context.query_analysis.is_simple_query, - "boolean_ast": context.get_intermediate_result('boolean_ast'), "domain": context.query_analysis.domain }, "es_query": context.get_intermediate_result('es_query', {}), @@ -666,12 +638,18 @@ class Searcher: def get_domain_summary(self) -> Dict[str, Any]: """ - Get summary of all configured domains. + Get summary of dynamic text retrieval configuration. Returns: - Dictionary with domain information + Dictionary with language-aware field information """ - return self.query_builder.get_domain_summary() + return { + "mode": "dynamic_language_fields", + "multilingual_fields": self.config.query_config.multilingual_fields, + "shared_fields": self.config.query_config.shared_fields, + "core_multilingual_fields": self.config.query_config.core_multilingual_fields, + "field_boosts": self.config.field_boosts, + } def get_document(self, tenant_id: str, doc_id: str) -> Optional[Dict[str, Any]]: """ diff --git a/suggestion/service.py b/suggestion/service.py index 88ec5f6..4816c04 100644 --- a/suggestion/service.py +++ b/suggestion/service.py @@ -33,6 +33,68 @@ class SuggestionService: return primary return index_languages[0] + def _completion_suggest( + self, + index_name: str, + query: str, + lang: str, + size: int, + ) -> List[Dict[str, Any]]: + """ + Query ES completion suggester from `completion.`. + + Returns items in the same shape as search hits -> dicts with "text"/"lang"/"score"/"rank_score"/"sources". + """ + field_name = f"completion.{lang}" + body = { + "suggest": { + "s": { + "prefix": query, + "completion": { + "field": field_name, + "size": size, + "skip_duplicates": True, + }, + } + }, + "_source": [ + "text", + "lang", + "rank_score", + "sources", + "lang_source", + "lang_confidence", + "lang_conflict", + ], + } + try: + resp = self.es_client.client.search(index=index_name, body=body) + except Exception as e: + # completion is an optimization path; never hard-fail the whole endpoint + logger.warning("Completion suggest failed for index=%s field=%s: %s", index_name, field_name, e) + return [] + + entries = (resp.get("suggest", {}) or {}).get("s", []) or [] + if not entries: + return [] + options = entries[0].get("options", []) or [] + out: List[Dict[str, Any]] = [] + for opt in options: + src = opt.get("_source", {}) or {} + out.append( + { + "text": src.get("text") or opt.get("text"), + "lang": src.get("lang") or lang, + "score": opt.get("_score", 0.0), + "rank_score": src.get("rank_score"), + "sources": src.get("sources", []), + "lang_source": src.get("lang_source"), + "lang_confidence": src.get("lang_confidence"), + "lang_conflict": src.get("lang_conflict", False), + } + ) + return out + def _search_products_for_suggestion( self, tenant_id: str, @@ -95,6 +157,17 @@ class SuggestionService: start = time.time() resolved_lang = self._resolve_language(tenant_id, language) index_name = get_suggestion_index_name(tenant_id) + if not self.es_client.index_exists(index_name): + # On a fresh ES cluster the suggestion index might not be built yet. + # Keep endpoint stable for frontend autocomplete: return empty list instead of 500. + took_ms = int((time.time() - start) * 1000) + return { + "query": query, + "language": language, + "resolved_language": resolved_lang, + "suggestions": [], + "took_ms": took_ms, + } sat_field = f"sat.{resolved_lang}" dsl = { @@ -139,14 +212,42 @@ class SuggestionService: "lang_conflict", ], } + # Recall path A: bool_prefix on search_as_you_type es_resp = self.es_client.search(index_name=index_name, body=dsl, size=size, from_=0) hits = es_resp.get("hits", {}).get("hits", []) or [] + # Recall path B: completion suggester (optional optimization) + completion_items = self._completion_suggest( + index_name=index_name, + query=query, + lang=resolved_lang, + size=size, + ) + suggestions: List[Dict[str, Any]] = [] + seen_text_norm: set = set() + + def _norm_text(v: Any) -> str: + return str(v or "").strip().lower() + + # Put completion results first (usually better prefix UX), then fill with sat results. + for item in completion_items: + text_val = item.get("text") + norm = _norm_text(text_val) + if not norm or norm in seen_text_norm: + continue + seen_text_norm.add(norm) + suggestions.append(dict(item)) + for hit in hits: src = hit.get("_source", {}) or {} + text_val = src.get("text") + norm = _norm_text(text_val) + if not norm or norm in seen_text_norm: + continue + seen_text_norm.add(norm) item = { - "text": src.get("text"), + "text": text_val, "lang": src.get("lang"), "score": hit.get("_score", 0.0), "rank_score": src.get("rank_score"), @@ -173,7 +274,7 @@ class SuggestionService: "query": query, "language": language, "resolved_language": resolved_lang, - "suggestions": suggestions, + "suggestions": suggestions[:size], "took_ms": took_ms, } -- libgit2 0.21.2