diff --git a/api/routes/suggestion_indexer.py b/api/routes/suggestion_indexer.py index 447ffac..643a736 100644 --- a/api/routes/suggestion_indexer.py +++ b/api/routes/suggestion_indexer.py @@ -20,7 +20,10 @@ class FullBuildRequest(BaseModel): tenant_id: str = Field(..., description="租户 ID") days: int = Field(360, description="查询日志回溯天数") batch_size: int = Field(500, description="商品扫描 batch 大小") - min_query_len: int = Field(1, description="最小查询长度过滤") + min_query_len: int = Field( + 3, + description="最小查询长度过滤(中文字符按 2 计数,其余字符按 1 计数)", + ) publish_alias: bool = Field( True, description="是否在构建完成后发布 alias 到新版本索引", @@ -35,7 +38,10 @@ class IncrementalBuildRequest(BaseModel): """增量更新 suggestion 索引""" tenant_id: str = Field(..., description="租户 ID") - min_query_len: int = Field(1, description="最小查询长度过滤") + min_query_len: int = Field( + 3, + description="最小查询长度过滤(中文字符按 2 计数,其余字符按 1 计数)", + ) fallback_days: int = Field( 7, description="当没有增量水位线时,默认从最近多少天的查询日志开始补", diff --git a/docs/suggestion索引构建.md b/docs/suggestion索引构建.md index 19ba9b7..0dad214 100644 --- a/docs/suggestion索引构建.md +++ b/docs/suggestion索引构建.md @@ -23,7 +23,7 @@ 示例:`search_suggestions_tenant_1_current` -- **元信息索引(所有租户共用)** +- **元信息索引(所有租户共用一个索引,每个租户一条文档)** - 名称: @@ -31,7 +31,27 @@ search_suggestions\_meta \] - - 用于记录每个租户的: + - 用于记录每个租户的元信息(`_id = tenant_id`): + 该索引全局创建一次,每新增一个租户,插入一行,每次为一个租户做完全量,如果成功,更新一下对应的信息。 + ``` +{ +"settings": { + "number_of_shards": 1, + "number_of_replicas": 0, + "refresh_interval": "1s", +}, +"mappings": { + "properties": { + "tenant_id": {"type": "keyword"}, + "active_alias": {"type": "keyword"}, + "active_index": {"type": "keyword"}, + "last_full_build_at": {"type": "date"}, + "last_incremental_build_at": {"type": "date"}, + "last_incremental_watermark": {"type": "date"}, + "updated_at": {"type": "date"}, + } +} +``` - `active_alias`(当前 alias 名) - `active_index`(当前实际索引名) - `last_full_build_at` @@ -137,38 +157,19 @@ - `min_query_len: int`:过滤短查询 - `publish_alias: bool`:是否构建完成后切 alias(只在 versioned 模式下起作用) - `keep_versions: int`:保留多少个最新版本索引 -- `use_versioned_index: bool`:true 使用 `*_v{timestamp}` 版本索引;false 使用 legacy 索引 - -#### 2. 每租户语言配置 - -- 通过 `tenant_config`(`get_tenant_config_loader().get_tenant_config(tenant_id)`)拿到: - - `index_languages: List[str]`,默认 `["en", "zh"]` - - `primary_language: str`,默认 `"en"` - -Java 端需要自己的 tenant 配置源,但**要保持同样字段含义与默认值**。 #### 3. 构建目标索引 -1. 如果 `use_versioned_index = true`: - - 新索引名:`get_suggestion_versioned_index_name(tenant_id)`,即带时间戳后缀 -2. 否则(legacy 模式): - - 索引名:`get_suggestion_legacy_index_name(tenant_id)` - - 若 `recreate = true` 且索引已存在,先删除再重建 -3. 若目标索引已经存在 → 抛错(防止误覆盖) - -4. 按上文 mapping 创建索引。 +1. 创建索引: + - 索引名称:search_suggestions_tenant_{tenant\_id}\_v{yyyyMMddHHmmss} + - 结构: search_suggestions.json #### 4. 构建候选词 ##### 4.1 从商品索引收集 title / qanchors(Step 1) -对应 `_iter_products` 与 `_build_full_candidates` 的前半段。 - -- 数据源:**商品 ES 索引**(SPU 索引) - - 获取当前租户商品索引名称:通过 `indexer.mapping_generator.get_tenant_index_name(tenant_id)` 获取 - 遍历店铺的所有商品:获取每个商品的 `"spu_id"`, `"title"`, `"qanchors"` 3个字段(按`spu_id`升序) - - 对每个商品文档: 1. 确定 `product_id`: diff --git a/suggestion/builder.py b/suggestion/builder.py index 96f27c0..2914c2d 100644 --- a/suggestion/builder.py +++ b/suggestion/builder.py @@ -260,10 +260,8 @@ class SuggestionIndexBuilder: body: Dict[str, Any] = { "size": batch_size, "_source": ["id", "spu_id", "title", "qanchors"], - # Prefer spu_id when present; fall back to id.keyword for current mappings. "sort": [ {"spu_id": {"order": "asc", "missing": "_last"}}, - {"id.keyword": {"order": "asc", "missing": "_last"}}, ], "query": {"match_all": {}}, } @@ -817,8 +815,7 @@ class SuggestionIndexBuilder: days=bootstrap_days, batch_size=batch_size, min_query_len=min_query_len, - publish_alias=True, - use_versioned_index=True, + publish_alias=True ) return { "mode": "incremental", -- libgit2 0.21.2