Commit 1cca75c8a5e01c59a3ccd526d143305f95318576
1 parent
5b8f58c0
sugg 索引文档
Showing
3 changed files
with
34 additions
and
30 deletions
Show diff stats
api/routes/suggestion_indexer.py
| ... | ... | @@ -20,7 +20,10 @@ class FullBuildRequest(BaseModel): |
| 20 | 20 | tenant_id: str = Field(..., description="租户 ID") |
| 21 | 21 | days: int = Field(360, description="查询日志回溯天数") |
| 22 | 22 | batch_size: int = Field(500, description="商品扫描 batch 大小") |
| 23 | - min_query_len: int = Field(1, description="最小查询长度过滤") | |
| 23 | + min_query_len: int = Field( | |
| 24 | + 3, | |
| 25 | + description="最小查询长度过滤(中文字符按 2 计数,其余字符按 1 计数)", | |
| 26 | + ) | |
| 24 | 27 | publish_alias: bool = Field( |
| 25 | 28 | True, |
| 26 | 29 | description="是否在构建完成后发布 alias 到新版本索引", |
| ... | ... | @@ -35,7 +38,10 @@ class IncrementalBuildRequest(BaseModel): |
| 35 | 38 | """增量更新 suggestion 索引""" |
| 36 | 39 | |
| 37 | 40 | tenant_id: str = Field(..., description="租户 ID") |
| 38 | - min_query_len: int = Field(1, description="最小查询长度过滤") | |
| 41 | + min_query_len: int = Field( | |
| 42 | + 3, | |
| 43 | + description="最小查询长度过滤(中文字符按 2 计数,其余字符按 1 计数)", | |
| 44 | + ) | |
| 39 | 45 | fallback_days: int = Field( |
| 40 | 46 | 7, |
| 41 | 47 | description="当没有增量水位线时,默认从最近多少天的查询日志开始补", | ... | ... |
docs/suggestion索引构建.md
| ... | ... | @@ -23,7 +23,7 @@ |
| 23 | 23 | |
| 24 | 24 | 示例:`search_suggestions_tenant_1_current` |
| 25 | 25 | |
| 26 | -- **元信息索引(所有租户共用)** | |
| 26 | +- **元信息索引(所有租户共用一个索引,每个租户一条文档)** | |
| 27 | 27 | |
| 28 | 28 | - 名称: |
| 29 | 29 | |
| ... | ... | @@ -31,7 +31,27 @@ |
| 31 | 31 | search_suggestions\_meta |
| 32 | 32 | \] |
| 33 | 33 | |
| 34 | - - 用于记录每个租户的: | |
| 34 | + - 用于记录每个租户的元信息(`_id = tenant_id`): | |
| 35 | + 该索引全局只创建一次;每新增一个租户,写入一条文档;每次某个租户的全量构建成功后,更新该租户对应文档中的信息。 | |
| 36 | + ``` | |
| 37 | +{ | |
| 38 | +"settings": { | |
| 39 | + "number_of_shards": 1, | |
| 40 | + "number_of_replicas": 0, | |
| 41 | + "refresh_interval": "1s" | |
| 42 | +}, | |
| 43 | +"mappings": { | |
| 44 | + "properties": { | |
| 45 | + "tenant_id": {"type": "keyword"}, | |
| 46 | + "active_alias": {"type": "keyword"}, | |
| 47 | + "active_index": {"type": "keyword"}, | |
| 48 | + "last_full_build_at": {"type": "date"}, | |
| 49 | + "last_incremental_build_at": {"type": "date"}, | |
| 50 | + "last_incremental_watermark": {"type": "date"}, | |
| 51 | + "updated_at": {"type": "date"} | |
| 52 | + } | |
| 53 | +} | |
| 54 | +``` | |
| 35 | 55 | - `active_alias`(当前 alias 名) |
| 36 | 56 | - `active_index`(当前实际索引名) |
| 37 | 57 | - `last_full_build_at` |
| ... | ... | @@ -137,38 +157,19 @@ |
| 137 | 157 | - `min_query_len: int`:过滤短查询 |
| 138 | 158 | - `publish_alias: bool`:是否构建完成后切 alias(只在 versioned 模式下起作用) |
| 139 | 159 | - `keep_versions: int`:保留多少个最新版本索引 |
| 140 | -- `use_versioned_index: bool`:true 使用 `*_v{timestamp}` 版本索引;false 使用 legacy 索引 | |
| 141 | - | |
| 142 | -#### 2. 每租户语言配置 | |
| 143 | - | |
| 144 | -- 通过 `tenant_config`(`get_tenant_config_loader().get_tenant_config(tenant_id)`)拿到: | |
| 145 | - - `index_languages: List[str]`,默认 `["en", "zh"]` | |
| 146 | - - `primary_language: str`,默认 `"en"` | |
| 147 | - | |
| 148 | -Java 端需要自己的 tenant 配置源,但**要保持同样字段含义与默认值**。 | |
| 149 | 160 | |
| 150 | 161 | #### 3. 构建目标索引 |
| 151 | 162 | |
| 152 | -1. 如果 `use_versioned_index = true`: | |
| 153 | - - 新索引名:`get_suggestion_versioned_index_name(tenant_id)`,即带时间戳后缀 | |
| 154 | -2. 否则(legacy 模式): | |
| 155 | - - 索引名:`get_suggestion_legacy_index_name(tenant_id)` | |
| 156 | - - 若 `recreate = true` 且索引已存在,先删除再重建 | |
| 157 | -3. 若目标索引已经存在 → 抛错(防止误覆盖) | |
| 158 | - | |
| 159 | -4. 按上文 mapping 创建索引。 | |
| 163 | +1. 创建索引: | |
| 164 | + - 索引名称:`search_suggestions_tenant_{tenant_id}_v{yyyyMMddHHmmss}` | |
| 165 | + - 结构: search_suggestions.json | |
| 160 | 166 | |
| 161 | 167 | #### 4. 构建候选词 |
| 162 | 168 | |
| 163 | 169 | ##### 4.1 从商品索引收集 title / qanchors(Step 1) |
| 164 | 170 | |
| 165 | -对应 `_iter_products` 与 `_build_full_candidates` 的前半段。 | |
| 166 | - | |
| 167 | -- 数据源:**商品 ES 索引**(SPU 索引) | |
| 168 | - - 获取当前租户商品索引名称:通过 `indexer.mapping_generator.get_tenant_index_name(tenant_id)` 获取 | |
| 169 | 171 | - 遍历店铺的所有商品:获取每个商品的 `"spu_id"`, `"title"`, `"qanchors"` 3个字段(按`spu_id`升序) |
| 170 | 172 | |
| 171 | - | |
| 172 | 173 | - 对每个商品文档: |
| 173 | 174 | |
| 174 | 175 | 1. 确定 `product_id`: | ... | ... |
suggestion/builder.py
| ... | ... | @@ -260,10 +260,8 @@ class SuggestionIndexBuilder: |
| 260 | 260 | body: Dict[str, Any] = { |
| 261 | 261 | "size": batch_size, |
| 262 | 262 | "_source": ["id", "spu_id", "title", "qanchors"], |
| 263 | - # Prefer spu_id when present; fall back to id.keyword for current mappings. | |
| 264 | 263 | "sort": [ |
| 265 | 264 | {"spu_id": {"order": "asc", "missing": "_last"}}, |
| 266 | - {"id.keyword": {"order": "asc", "missing": "_last"}}, | |
| 267 | 265 | ], |
| 268 | 266 | "query": {"match_all": {}}, |
| 269 | 267 | } |
| ... | ... | @@ -817,8 +815,7 @@ class SuggestionIndexBuilder: |
| 817 | 815 | days=bootstrap_days, |
| 818 | 816 | batch_size=batch_size, |
| 819 | 817 | min_query_len=min_query_len, |
| 820 | - publish_alias=True, | |
| 821 | - use_versioned_index=True, | |
| 818 | + publish_alias=True | |
| 822 | 819 | ) |
| 823 | 820 | return { |
| 824 | 821 | "mode": "incremental", | ... | ... |