Compare View
Commits (5)
-
…nt.py)、[search/searcher.py](/data/saas-search/search/searcher.py)、[frontend/static/js/app.js](/data/saas-search/frontend/static/js/app.js) 以及 [tests/test_rerank_client.py](/data/saas-search/tests/test_rerank_client.py)。 主要修复内容如下: - 精排现依据融合阶段得分进行排序,而非仅依据原始的 `fine_score`。 - 最终重排不再依赖独立的 `fine_scores` 数组(该数组在精排排序后可能产生同步偏差),而是直接读取命中结果附带的 `_fine_score`。 - 精排与最终重排现均通过同一计算路径生成融合调试信息,该路径同时也决定实际排序结果,从而保证记录逻辑与生产逻辑保持一致。 - 调试信息载荷更加清晰:精排和最终重排阶段都会暴露融合输入/因子以及规范的 `fusion_summary`,前端界面现在会渲染该摘要信息。 主要问题:阶段逻辑重复且存在并行的数据通道:一个通道用于计算排序,另一个通道用于组装调试字段,还有第三个通道用于传递辅助数组。这造成了潜在的差异风险。本次重构通过将阶段得分作为唯一事实来源,并让调试/前端直接消费其输出而非事后重构,降低了该风险。 验证结果: - `./.venv/bin/python -m pytest -q tests/test_rerank_client.py tests/test_search_rerank_window.py` - `./.venv/bin/python -m py_compile search/rerank_client.py search/searcher.py` 结果:`22 passed`。 当前的主流程: 1. Query 解析 2. ES 召回 3. 粗排:只用 ES 内部文本/KNN 信号 4. 款式 SKU 选择 + title suffix 5. 精排:轻量 reranker + 文本/KNN 融合 6. 最终 rerank:重 reranker + fine score + 文本/KNN 融合 7. 分页、补全字段、格式化返回 主控代码在 [searcher.py](/data/saas-search/search/searcher.py),打分与 rerank 细节在 [rerank_client.py](/data/saas-search/search/rerank_client.py),配置定义在 [schema.py](/data/saas-search/config/schema.py) 和 [config.yaml](/data/saas-search/config/config.yaml)。 **先看入口怎么决定走哪条路** 在 [searcher.py:348](/data/saas-search/search/searcher.py#L348) 开始,`search()` 先读租户语言、开关、窗口大小。 关键判断在 [searcher.py:364](/data/saas-search/search/searcher.py#L364) 到 [searcher.py:372](/data/saas-search/search/searcher.py#L372): - `rerank_window` 现在是 80,见 [config.yaml:256](/data/saas-search/config/config.yaml#L256) - `coarse_rank.input_window` 是 700,`output_window` 是 240,见 [config.yaml:231](/data/saas-search/config/config.yaml#L231) - `fine_rank.input_window` 是 240,`output_window` 是 80,见 [config.yaml:245](/data/saas-search/config/config.yaml#L245) 所以如果请求满足 `from_ + size <= rerank_window`,就进入完整漏斗: - ES 实际取前 `700` - 粗排后留 `240` - 精排后留 `80` - 最终 rerank 也只处理这 `80` - 最后再做分页切片 如果请求页超出 80,就不走后面的多阶段漏斗,直接按 ES 原逻辑返回。
-
一、tags字段改支持多语言: spu表tags字段,跟title走一样的翻译逻辑,填入原始语言、zh、en。 检查以下字段,都跟title一样走翻译逻辑 title keywords tags brief description vendor category_path category_name_text 二、/indexer/enrich-content接口的修改 1. 请求参数,把language去掉,因为我返回的内容直接对应索引结构,不用你做处理了,因此不需要指定语言,降低耦合。 2. 返回 enriched_attributes enriched_tags qanchors三个字段,按原始内容填入。 3. enriched_tags是本次新增的,注意区别于tags字段。tags字段来源于mysql spu表,enriched_tags是本接口返回的。 三、specifications的value,需要翻译,也是需要填中英文: { "specifications": [ { "sku_id": "sku-red-s", "name": "color", "value_keyword": "красный", "value_text": { "zh": "红色", "en": "red" } } ] }
Showing
28 changed files
Show diff stats
api/routes/indexer.py
| ... | ... | @@ -80,7 +80,7 @@ class BuildDocsFromDbRequest(BaseModel): |
| 80 | 80 | class EnrichContentItem(BaseModel): |
| 81 | 81 | """单条待生成内容理解字段的商品。""" |
| 82 | 82 | spu_id: str = Field(..., description="SPU ID") |
| 83 | - title: str = Field(..., description="商品标题,用于 LLM 分析生成 qanchors / tags 等") | |
| 83 | + title: str = Field(..., description="商品标题,用于 LLM 分析生成 qanchors / enriched_tags 等") | |
| 84 | 84 | image_url: Optional[str] = Field(None, description="商品主图 URL(预留给多模态/内容理解扩展)") |
| 85 | 85 | brief: Optional[str] = Field(None, description="商品简介/短描述") |
| 86 | 86 | description: Optional[str] = Field(None, description="商品详情/长描述") |
| ... | ... | @@ -88,15 +88,11 @@ class EnrichContentItem(BaseModel): |
| 88 | 88 | |
| 89 | 89 | class EnrichContentRequest(BaseModel): |
| 90 | 90 | """ |
| 91 | - 内容理解字段生成请求:根据商品标题批量生成 qanchors、semantic_attributes、tags。 | |
| 91 | + 内容理解字段生成请求:根据商品标题批量生成 qanchors、enriched_attributes、enriched_tags。 | |
| 92 | 92 | 供外部 indexer 在自行组织 doc 时调用,与翻译、向量化等微服务并列。 |
| 93 | 93 | """ |
| 94 | 94 | tenant_id: str = Field(..., description="租户 ID,用于请求路由与结果归属,不参与缓存键") |
| 95 | 95 | items: List[EnrichContentItem] = Field(..., description="待分析的 SPU 列表(spu_id + title,可附带 brief/description/image_url)") |
| 96 | - languages: List[str] = Field( | |
| 97 | - default_factory=lambda: ["zh", "en"], | |
| 98 | - description="目标语言列表,需在支持范围内(zh/en/de/ru/fr),默认 zh, en", | |
| 99 | - ) | |
| 100 | 96 | |
| 101 | 97 | |
| 102 | 98 | @router.post("/reindex") |
| ... | ... | @@ -444,103 +440,36 @@ async def build_docs_from_db(request: BuildDocsFromDbRequest): |
| 444 | 440 | raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}") |
| 445 | 441 | |
| 446 | 442 | |
| 447 | -def _run_enrich_content(tenant_id: str, items: List[Dict[str, str]], languages: List[str]) -> List[Dict[str, Any]]: | |
| 443 | +def _run_enrich_content(tenant_id: str, items: List[Dict[str, str]]) -> List[Dict[str, Any]]: | |
| 448 | 444 | """ |
| 449 | - 同步执行内容理解:调用 product_enrich.analyze_products,按语言批量跑 LLM, | |
| 450 | - 再聚合成每 SPU 的 qanchors、semantic_attributes、tags。供 run_in_executor 调用。 | |
| 445 | + 同步执行内容理解,返回与 ES mapping 对齐的字段结构。 | |
| 446 | + 语言策略由 product_enrich 内部统一决定,路由层不参与。 | |
| 451 | 447 | """ |
| 452 | - from indexer.product_enrich import analyze_products, split_multi_value_field | |
| 448 | + from indexer.product_enrich import build_index_content_fields | |
| 453 | 449 | |
| 454 | - llm_langs = list(dict.fromkeys(languages)) or ["en"] | |
| 455 | - | |
| 456 | - products = [ | |
| 450 | + results = build_index_content_fields(items=items, tenant_id=tenant_id) | |
| 451 | + return [ | |
| 457 | 452 | { |
| 458 | - "id": it["spu_id"], | |
| 459 | - "title": (it.get("title") or "").strip(), | |
| 460 | - "brief": (it.get("brief") or "").strip(), | |
| 461 | - "description": (it.get("description") or "").strip(), | |
| 462 | - "image_url": (it.get("image_url") or "").strip(), | |
| 453 | + "spu_id": item["id"], | |
| 454 | + "qanchors": item["qanchors"], | |
| 455 | + "enriched_attributes": item["enriched_attributes"], | |
| 456 | + "enriched_tags": item["enriched_tags"], | |
| 457 | + **({"error": item["error"]} if item.get("error") else {}), | |
| 463 | 458 | } |
| 464 | - for it in items | |
| 465 | - ] | |
| 466 | - dim_keys = [ | |
| 467 | - "tags", | |
| 468 | - "target_audience", | |
| 469 | - "usage_scene", | |
| 470 | - "season", | |
| 471 | - "key_attributes", | |
| 472 | - "material", | |
| 473 | - "features", | |
| 459 | + for item in results | |
| 474 | 460 | ] |
| 475 | 461 | |
| 476 | - # 按 spu_id 聚合:qanchors[lang], semantic_attributes[], tags[] | |
| 477 | - by_spu: Dict[str, Dict[str, Any]] = {} | |
| 478 | - for it in items: | |
| 479 | - sid = str(it["spu_id"]) | |
| 480 | - by_spu[sid] = {"qanchors": {}, "semantic_attributes": [], "tags": []} | |
| 481 | - | |
| 482 | - for lang in llm_langs: | |
| 483 | - try: | |
| 484 | - rows = analyze_products( | |
| 485 | - products=products, | |
| 486 | - target_lang=lang, | |
| 487 | - batch_size=20, | |
| 488 | - tenant_id=tenant_id, | |
| 489 | - ) | |
| 490 | - except Exception as e: | |
| 491 | - logger.warning("enrich-content analyze_products failed for lang=%s: %s", lang, e) | |
| 492 | - for it in items: | |
| 493 | - sid = str(it["spu_id"]) | |
| 494 | - if "error" not in by_spu[sid]: | |
| 495 | - by_spu[sid]["error"] = str(e) | |
| 496 | - continue | |
| 497 | - | |
| 498 | - for row in rows: | |
| 499 | - spu_id = str(row.get("id") or "") | |
| 500 | - if spu_id not in by_spu: | |
| 501 | - continue | |
| 502 | - rec = by_spu[spu_id] | |
| 503 | - if row.get("error"): | |
| 504 | - rec["error"] = row["error"] | |
| 505 | - continue | |
| 506 | - anchor_text = str(row.get("anchor_text") or "").strip() | |
| 507 | - if anchor_text: | |
| 508 | - rec["qanchors"][lang] = anchor_text | |
| 509 | - for name in dim_keys: | |
| 510 | - raw = row.get(name) | |
| 511 | - if not raw: | |
| 512 | - continue | |
| 513 | - for value in split_multi_value_field(str(raw)): | |
| 514 | - rec["semantic_attributes"].append({"lang": lang, "name": name, "value": value}) | |
| 515 | - if name == "tags": | |
| 516 | - rec["tags"].append(value) | |
| 517 | - | |
| 518 | - # 去重 tags(保持顺序) | |
| 519 | - out = [] | |
| 520 | - for it in items: | |
| 521 | - sid = str(it["spu_id"]) | |
| 522 | - rec = by_spu[sid] | |
| 523 | - tags = list(dict.fromkeys(rec["tags"])) | |
| 524 | - out.append({ | |
| 525 | - "spu_id": sid, | |
| 526 | - "qanchors": rec["qanchors"], | |
| 527 | - "semantic_attributes": rec["semantic_attributes"], | |
| 528 | - "tags": tags, | |
| 529 | - **({"error": rec["error"]} if rec.get("error") else {}), | |
| 530 | - }) | |
| 531 | - return out | |
| 532 | - | |
| 533 | 462 | |
| 534 | 463 | @router.post("/enrich-content") |
| 535 | 464 | async def enrich_content(request: EnrichContentRequest): |
| 536 | 465 | """ |
| 537 | - 内容理解字段生成接口:根据商品标题批量生成 qanchors、semantic_attributes、tags。 | |
| 466 | + 内容理解字段生成接口:根据商品标题批量生成 qanchors、enriched_attributes、enriched_tags。 | |
| 538 | 467 | |
| 539 | 468 | 使用场景: |
| 540 | 469 | - 外部 indexer 采用「微服务组合」方式自己组织 doc 时,可调用本接口获取 LLM 生成的 |
| 541 | 470 | 锚文本与语义属性,再与翻译、向量化结果合并写入 ES。 |
| 542 | 471 | - 与 /indexer/build-docs 解耦,避免 build-docs 因 LLM 耗时过长而阻塞;调用方可 |
| 543 | - 先拿不含 qanchors/tags 的 doc,再异步或离线补齐本接口结果后更新 ES。 | |
| 472 | + 先拿不含 qanchors/enriched_tags 的 doc,再异步或离线补齐本接口结果后更新 ES。 | |
| 544 | 473 | |
| 545 | 474 | 实现逻辑与 indexer.product_enrich.analyze_products 一致,支持多语言与 Redis 缓存。 |
| 546 | 475 | """ |
| ... | ... | @@ -568,8 +497,7 @@ async def enrich_content(request: EnrichContentRequest): |
| 568 | 497 | None, |
| 569 | 498 | lambda: _run_enrich_content( |
| 570 | 499 | tenant_id=request.tenant_id, |
| 571 | - items=items_payload, | |
| 572 | - languages=request.languages or ["zh", "en"], | |
| 500 | + items=items_payload | |
| 573 | 501 | ), |
| 574 | 502 | ) |
| 575 | 503 | return { | ... | ... |
config/config.yaml
docs/issue-2026-03-27-添加粗排精排-第2轮-done-0327.txt
| ... | ... | @@ -20,7 +20,7 @@ config.yaml |
| 20 | 20 | 现在链路是: |
| 21 | 21 | ES 拉取 coarse_rank.input_window 条 -> 粗排按 text/knn 融合裁到 coarse_rank.output_window -> 再做 SKU 选择和 title suffix -> 精排调用轻量 reranker 裁到 fine_rank.output_window -> 最终重排调用现有 reranker,并在最终融合里加入 fine_score。同时把 reranker client/provider 改成了按 service_profile 选不同 service_url,这样 fine/final 可以共用同一套服务代码,只起不同实例。 |
| 22 | 22 | |
| 23 | -你是用了./.venv/bin/pytest -q tests/test_rerank_client.py tests/test_search_rerank_window.py 进行验证,结果是 18 passed。也按要求尝试了 ./scripts/service_ctl.sh restart reranker backend 做联调,但当前环境里只有 6007 的现有 reranker,配置里的 fine 端口 6009 还没对应实例,而且服务重启后进程状态不太稳定,没能完成完整线上链路验证。当前最关键的落地前置是把轻量 bge 实例补起来,再用同样脚本重启联调。 | |
| 23 | +你用了./.venv/bin/pytest -q tests/test_rerank_client.py tests/test_search_rerank_window.py 进行验证,结果是 18 passed。也按要求尝试了 ./scripts/service_ctl.sh restart reranker backend 做联调,但当前环境里只有 6007 的现有 reranker,配置里的 fine 端口 6009 还没对应实例,而且服务重启后进程状态不太稳定,没能完成完整线上链路验证。当前最关键的落地前置是把轻量 bge 实例补起来,再用同样脚本重启联调。 | |
| 24 | 24 | |
| 25 | 25 | 因此,需要回顾这个漏斗的代码,进行进一步的优化: |
| 26 | 26 | 1. Fine Rank环节的信息都是N/A。fine 实例也需要在config.yaml中体现,并且需要被service_ctl.sh管理,参考现在是怎么管理reranker的。fine和reranker分别调用哪个模型(两者代码功能相同,只是启动不同的实例),请你深度思考后决定。这里需要参考当前代码,并根据实际需要(原来只有一个reranker,现在多了一个精排,也要调用一个重排模型)进行必要的部分重构。 | ... | ... |
| ... | ... | @@ -0,0 +1,98 @@ |
| 1 | +这是上一轮检索效果优化的需求: | |
| 2 | +参考 | |
| 3 | +searcher.py | |
| 4 | +rerank_client.py | |
| 5 | +schema.py | |
| 6 | +es_query_builder.py | |
| 7 | +config.yaml | |
| 8 | +相关性检索优化说明.md | |
| 9 | + | |
| 10 | +在ES返回到rerank期间增加一轮粗排+一轮精排。 | |
| 11 | +1. ES召回,600 | |
| 12 | +2. 粗排:600->240。配置文件增加粗排相关配置,包括输入条数(配置为700,ES拉取的条数改为粗排输入条数),然后增加粗排的融合公式配置,参考现有的reranker融合公式即可、只是去掉其中的重排模型项。 | |
| 13 | +3. 现在的sku选择、为reranker生成title后缀这一套逻辑,是放在粗排后,因为精排也是一个reranker模型(只不过是一个轻量级的,bge-reranker),需要用这个title后缀。 | |
| 14 | +4. 精排:240-80,使用bge-reranker,但是,因为reranker只能选一个backend,考虑如何重构。现在,精排也是一个独立的进程、独立提供端口,服务。但是,因为跟重排逻辑是一致的(即使有部分不一致也应该分离不一致的点进行配置化),所以共用代码,只是根据需要启动两个实例,避免代码冗余。 | |
| 15 | +5. 重排:80,也是用当前的重排代码,调用单独的实例(即现在使用的实例),返回后,经过融合公式,到分页,也参考现在的融合公式,但是,加入一项精排模型打分。 | |
| 16 | +测试时,使用跟我同样的环境,通过 ./scripts/service_ctl.sh restart reranker backend 重启相关服务进行测试。 | |
| 17 | + | |
| 18 | +你已经完成了一般修改,已把三段排序链路接上了,主改动在 search/searcher.py、search/rerank_client.py、config/schema.py、config/loader.py、config/services_config.py 和 config/config.yaml。 | |
| 19 | + | |
| 20 | +现在链路是: | |
| 21 | +ES 拉取 coarse_rank.input_window 条 -> 粗排按 text/knn 融合裁到 coarse_rank.output_window -> 再做 SKU 选择和 title suffix -> 精排调用轻量 reranker 裁到 fine_rank.output_window -> 最终重排调用现有 reranker,并在最终融合里加入 fine_score。同时把 reranker client/provider 改成了按 service_profile 选不同 service_url,这样 fine/final 可以共用同一套服务代码,只起不同实例。 | |
| 22 | + | |
| 23 | +并且,你对调试展示进行了重构。你已经把结果卡片和全局调试面板都改成按漏斗阶段取值和展示,在 app.js 里把 ES 召回、粗排、精排、最终 rerank 分开渲染了。 | |
| 24 | +现在每条结果的 debug 会按阶段展示: | |
| 25 | +ES 召回:rank、ES score、norm score、matched queries。 | |
| 26 | +粗排:rank/rank_change、coarse_score、text/knn 输入、text_source/text_translation/text_primary/text_support、text_knn/image_knn、factor。 | |
| 27 | +精排:rank/rank_change、fine_score、fine input。 | |
| 28 | +最终 rerank:rank/rank_change、rerank_score、text/knn score、各 factor、fused_score,以及完整 signals。 | |
| 29 | + | |
| 30 | +请你仔细阅读漏斗环节的这些代码,特别是关于打分、重排序、debug信息记录方面的。 | |
| 31 | + | |
| 32 | + | |
| 33 | +现在,请注意,需要优化的是: | |
| 34 | +1. Fine Rank环节似乎没有进行融合公式的计算、继而进行重排序,请修复。 | |
| 35 | +2.从软件工程的视角review代码: | |
| 36 | +因为增加了多重排序漏斗,数据的记录、传递,交互的接口,是否设计足够合理,存在哪些问题。 | |
| 37 | +请从软件工程的角度审视这些逻辑,是否有需要梳理、清理和重写的地方。 | |
| 38 | +3. Fine Rank和Final Rerank环节信息记录优化: | |
| 39 | +这两个环节都要体现融合公式的输入、关键因子、以及融合公式的得分。为了避免代码膨胀,Fine Rank和Final Rerank | |
| 40 | +都可以采用一个字符串记录这些关键信息,字符串内包括融合公式各项的名称和具体数值,以及最终结果。你也可以继续沿用当前的记录方式,需要你对比一下哪种代码量更少、更清晰简洁。 | |
| 41 | +也要仔细思考当前的代码,真实的计算和信息的记录,是否存在分离的情况,是否存在冗余和分叉。这种情况是不允许的,存在隐藏的风险,以后改了正式逻辑而没有改调试信息,将导致不一致。 | |
| 42 | +务必注意,当前已经有相关的信息记录逻辑,注意不要叠补丁,可以适当修改、或者清理重写,而不是新增,要使得代码更简洁和干净,并保证信息记录与真实逻辑一致。 | |
| 43 | + | |
| 44 | + | |
| 45 | +涉及代码较多,请耐心阅读,以上都是一些需要深度思考的任务,慢慢来,留足够多的时间来review和重新设计。 | |
| 46 | + | |
| 47 | + | |
| 48 | + | |
| 49 | + | |
| 50 | + | |
| 51 | + | |
| 52 | +因为增加了两个环节,多了很多变量。 | |
| 53 | +以下面这些 query 作为效果评估的测试集来调试参数。这次的调整范围是融合公式中的各个参数: | |
| 54 | +falda negra oficina | |
| 55 | +red fitted tee | |
| 56 | +黒いミディ丈スカート | |
| 57 | +黑色中长半身裙 | |
| 58 | +чёрное летнее платье | |
| 59 | +修身牛仔裤 | |
| 60 | +date night dress | |
| 61 | +vacation outfit dress | |
| 62 | +minimalist top | |
| 63 | + | |
| 64 | +仔细思考这些漏斗中重要的信息如何呈现,并对应地修改前端代码。 | |
| 65 | +注意既要包括整体漏斗信息的呈现,也要包括每条结果中各自的信息。 | |
| 66 | +我需要这些信息,辅助各环节融合公式的调参,根据我的需求,深度思考该如何设计,要呈现哪些信息,如何呈现。 | |
| 67 | +可以对现有的逻辑做适当的重构,重新整理。 | |
| 68 | + | |
| 69 | + | |
| 70 | + | |
| 71 | + | |
| 72 | +fine 实例也需要在config.yaml中体现,并且需要被service_ctl.sh管理,参考现在是怎么管理reranker的。fine和reranker分别调用哪个模型(两者代码功能相同,只是启动不同的实例),请你深度思考后决定。这里需要参考当前代码,并根据实际需要(原来只有一个reranker,现在多了一个精排,也要调用一个重排模型)进行必要的部分重构。 | |
| 73 | + | |
| 74 | +1. Fine Rank环节的信息都是N/A,是没有配置吗。fine rank是使用bge-reranker,复用当前reranker模型的代码,但是需要单独起一个服务、单独加载一个模型。 | |
| 75 | +2. Ranking Funnel、Fusion Factors、Signal Breakdown | |
| 76 | +这些是不是整合起来、按漏斗收集、整理信息、以及进行呈现比较好。 | |
| 77 | +ES 召回的环节,展示Matched Queries各项打分、ES的总分、norm后打分、排序位置,等等关键信息 | |
| 78 | +粗排:粗排融合公式的各项输入、重要中间结果和参数、最后得分,排序位置以及上升/下降了多少。等等关键信息。 | |
| 79 | +精排:同样例举关键的输入、中间过程、输出、排序和位置变化等。 | |
| 80 | +reranker:类似 | |
| 81 | + | |
| 82 | +因为涉及的环节较多,非常要注意的一个点是:不要每次修改都在原来的基础上,为实现目标而打补丁,应该观察一下所涉及的代码现在是怎么做的,务必注意如何适当的清理掉现有逻辑,该如何对其进行修改,来达到目的,以达到代码的精简,避免冗余、分叉。 | |
| 83 | + | |
| 84 | + | |
| 85 | + | |
| 86 | + | |
| 87 | + | |
| 88 | +1. Fine Rank 这个环节没有体现融合公式的输入、关键因子、以及融合公式的得分。为了避免代码膨胀,Fine Rank和Final Rerank | |
| 89 | +都可以采用一个字符串记录这些关键信息,字符串内包括融合公式各项的名称和具体数值,以及最终结果。 | |
| 90 | + | |
| 91 | + | |
| 92 | + | |
| 93 | + | |
| 94 | + | |
| 95 | + | |
| 96 | + | |
| 97 | + | |
| 98 | + | ... | ... |
| ... | ... | @@ -0,0 +1,314 @@ |
| 1 | +这是上一轮检索质量优化的需求说明: | |
| 2 | + | |
| 3 | +参考文件: | |
| 4 | +`searcher.py` | |
| 5 | +`rerank_client.py` | |
| 6 | +`schema.py` | |
| 7 | +`es_query_builder.py` | |
| 8 | +`config.yaml` | |
| 9 | +`相关性检索优化说明.md` | |
| 10 | + | |
| 11 | +在ES返回结果与重排之间增加一个粗排阶段和一个精排阶段。 | |
| 12 | + | |
| 13 | +1. ES召回:600条 | |
| 14 | +2. 粗排阶段:600 -> 240条。 | |
| 15 | + 在配置文件中增加粗排相关配置,包括输入大小(设置为700,即从ES获取的数量应改为粗排的输入大小)。 | |
| 16 | + 然后增加粗排的融合公式配置。可参考现有的重排融合公式,但需要去掉重排模型那一项。 | |
| 17 | +3. 目前重排的SKU选择和标题后缀生成逻辑,应放在粗排之后,因为精排也是一个重排模型(轻量级模型,`bge-reranker`),它也需要这个标题后缀。 | |
| 18 | +4. 精排阶段:240 -> 80条。 | |
| 19 | + 使用`bge-reranker`,但由于目前重排只能选择一个后端,请考虑如何重构。 | |
| 20 | + 现在精排也应该是一个独立的进程和服务,拥有自己的端口。 | |
| 21 | + 但由于其逻辑与重排逻辑一致(即便存在差异,这些差异也应抽离并配置化),代码应该共享。按需启动两个实例即可,避免代码重复。 | |
| 22 | +5. 最终重排:80条。 | |
| 23 | + 仍使用当前重排代码,调用独立的实例(即当前在用的那个)。 | |
| 24 | + 返回后,应用融合公式,再进行分页。 | |
| 25 | + 这里也应参考当前的融合公式,但需增加一项:精排模型得分。 | |
| 26 | + | |
| 27 | +测试时,请使用与我相同的环境,并使用以下命令重启相关服务: | |
| 28 | +`./scripts/service_ctl.sh restart reranker backend` | |
| 29 | + | |
| 30 | +你已经完成了整体修改,并将三级排序流水线串联起来了。 | |
| 31 | +主要改动在: | |
| 32 | +`search/searcher.py` | |
| 33 | +`search/rerank_client.py` | |
| 34 | +`config/schema.py` | |
| 35 | +`config/loader.py` | |
| 36 | +`config/services_config.py` | |
| 37 | +以及 `config/config.yaml`。 | |
| 38 | + | |
| 39 | +现在的流程是: | |
| 40 | + | |
| 41 | +ES获取 `coarse_rank.input_window` 条 -> | |
| 42 | +粗排通过文本/KNN融合裁剪至 `coarse_rank.output_window` -> | |
| 43 | +然后进行SKU选择和标题后缀处理 -> | |
| 44 | +精排调用轻量重排,裁剪至 `fine_rank.output_window` -> | |
| 45 | +最终重排调用现有重排,最终融合时也加入了 `fine_score`。 | |
| 46 | + | |
| 47 | +同时,重排客户端/提供者已改为通过 `service_profile` 选择不同的 `service_url`,因此精排和最终重排可以共享同一套服务代码,仅以不同实例运行。 | |
| 48 | + | |
| 49 | +你还重构了调试展示。 | |
| 50 | +你修改了结果卡片和全局调试面板,使其按漏斗阶段读取并渲染数值,在 `app.js` 中,你现在分别渲染ES召回、粗排、精排和最终重排。 | |
| 51 | + | |
| 52 | +现在,每个结果的调试信息按阶段展示: | |
| 53 | + | |
| 54 | +* ES召回:`rank`、ES得分、归一化得分、匹配查询 | |
| 55 | +* 粗排:`rank` / `rank_change`、`coarse_score`、文本/KNN输入、`text_source` / `text_translation` / `text_primary` / `text_support`、`text_knn` / `image_knn`、`factor` | |
| 56 | +* 精排:`rank` / `rank_change`、`fine_score`、`fine input` | |
| 57 | +* 最终重排:`rank` / `rank_change`、`rerank_score`、文本/KNN得分、各因子、`fused_score` 以及完整信号 | |
| 58 | + | |
| 59 | +请仔细阅读这些漏斗阶段的代码,特别是涉及打分、重排和调试信息记录的部分。 | |
| 60 | + | |
| 61 | +现在,请注意需要优化的部分: | |
| 62 | + | |
| 63 | +1. 精排阶段似乎没有计算融合公式并据此重排。请修复此问题。 | |
| 64 | +2. 从软件工程的角度审视代码: | |
| 65 | + 既然引入了多级排序漏斗,数据记录、传递和交互接口的设计是否足够合理?存在哪些问题? | |
| 66 | + 请从软件工程角度审视这一逻辑,判断是否有需要重新组织、清理或重写的部分。 | |
| 67 | +3. 优化精排和最终重排阶段的信息记录: | |
| 68 | + 这两个阶段都应体现融合公式的输入、关键因子以及融合公式计算出的得分。 | |
| 69 | + 为避免代码臃肿,精排和最终重排都可以使用一个字符串来记录这些关键信息。该字符串可以包含融合公式中各项的名称和值,以及最终结果。 | |
| 70 | + 你也可以继续使用当前的记录方式;请对比哪种方式代码更少、更清晰简洁。 | |
| 71 | + 同时请仔细思考当前代码:实际的计算过程和记录的信息是否分离?是否存在冗余或分歧? | |
| 72 | + 这是不可取的,因为会引入潜在风险:如果后续修改了生产逻辑但未更新调试信息,就会导致不一致。 | |
| 73 | + | |
| 74 | +请特别注意:现在已经存在相关的信息记录逻辑。不要只是层层打补丁。 | |
| 75 | +你可以适当修改,或者清理重写,而不仅仅是增加代码。 | |
| 76 | +目标是让代码更简单、更干净,同时确保记录的信息始终与实际逻辑保持一致。 | |
| 77 | + | |
| 78 | +涉及代码较多,请耐心阅读。 | |
| 79 | +以上所有任务都需要深入思考。请慢慢来,为全面的审查和重新设计留出足够空间。 | |
| 80 | + | |
| 81 | + | |
| 82 | + | |
| 83 | + | |
| 84 | + | |
| 85 | + | |
| 86 | +**整体图** | |
| 87 | +这个 pipeline 现在可以理解成一条“先广召回,再逐层收窄、逐层加贵信号”的漏斗: | |
| 88 | + | |
| 89 | +1. Query 解析 | |
| 90 | +2. ES 召回 | |
| 91 | +3. 粗排:只用 ES 内部文本/KNN 信号 | |
| 92 | +4. 款式 SKU 选择 + title suffix | |
| 93 | +5. 精排:轻量 reranker + 文本/KNN 融合 | |
| 94 | +6. 最终 rerank:重 reranker + fine score + 文本/KNN 融合 | |
| 95 | +7. 分页、补全字段、格式化返回 | |
| 96 | + | |
| 97 | +主控代码在 [searcher.py](/data/saas-search/search/searcher.py),打分与 rerank 细节在 [rerank_client.py](/data/saas-search/search/rerank_client.py),配置定义在 [schema.py](/data/saas-search/config/schema.py) 和 [config.yaml](/data/saas-search/config/config.yaml)。 | |
| 98 | + | |
| 99 | +**先看入口怎么决定走哪条路** | |
| 100 | +在 [searcher.py:348](/data/saas-search/search/searcher.py#L348) 开始,`search()` 先读租户语言、开关、窗口大小。 | |
| 101 | +关键判断在 [searcher.py:364](/data/saas-search/search/searcher.py#L364) 到 [searcher.py:372](/data/saas-search/search/searcher.py#L372): | |
| 102 | + | |
| 103 | +- `rerank_window` 现在是 80,见 [config.yaml:256](/data/saas-search/config/config.yaml#L256) | |
| 104 | +- `coarse_rank.input_window` 是 700,`output_window` 是 240,见 [config.yaml:231](/data/saas-search/config/config.yaml#L231) | |
| 105 | +- `fine_rank.input_window` 是 240,`output_window` 是 80,见 [config.yaml:245](/data/saas-search/config/config.yaml#L245) | |
| 106 | + | |
| 107 | +所以如果请求满足 `from_ + size <= rerank_window`,就进入完整漏斗: | |
| 108 | +- ES 实际取前 `700` | |
| 109 | +- 粗排后留 `240` | |
| 110 | +- 精排后留 `80` | |
| 111 | +- 最终 rerank 也只处理这 `80` | |
| 112 | +- 最后再做分页切片 | |
| 113 | + | |
| 114 | +如果请求页超出 80,就不走后面的多阶段漏斗,直接按 ES 原逻辑返回。 | |
| 115 | + | |
| 116 | +这点非常重要,因为它决定了“贵模型只服务头部结果”。 | |
| 117 | + | |
| 118 | +**Step 1:Query 解析阶段** | |
| 119 | +在 [searcher.py:432](/data/saas-search/search/searcher.py#L432) 到 [searcher.py:469](/data/saas-search/search/searcher.py#L469): | |
| 120 | +`query_parser.parse()` 做几件事: | |
| 121 | + | |
| 122 | +- 规范化 query | |
| 123 | +- 检测语言 | |
| 124 | +- 可能做 rewrite | |
| 125 | +- 生成文本向量 | |
| 126 | +- 如果有图搜,还会带图片向量 | |
| 127 | +- 生成翻译结果 | |
| 128 | +- 识别 style intent | |
| 129 | + | |
| 130 | +这一步的结果存在 `parsed_query` 里,后面 ES 查询、style SKU 选择、fine/final rerank 全都依赖它。 | |
| 131 | + | |
| 132 | +**Step 2:ES Query 构建** | |
| 133 | +ES DSL 在 [searcher.py:471](/data/saas-search/search/searcher.py#L471) 开始,通过 [es_query_builder.py:181](/data/saas-search/search/es_query_builder.py#L181) 的 `build_query()` 生成。 | |
| 134 | + | |
| 135 | +这里的核心结构是: | |
| 136 | +- 文本召回 clause | |
| 137 | +- 文本向量 KNN clause | |
| 138 | +- 图片向量 KNN clause | |
| 139 | +- 它们一起放进 `bool.should` | |
| 140 | +- 过滤条件放进 `filter` | |
| 141 | +- facet 的多选条件走 `post_filter` | |
| 142 | + | |
| 143 | +KNN 部分在 [es_query_builder.py:250](/data/saas-search/search/es_query_builder.py#L250) 之后: | |
| 144 | +- 文本向量 clause 名字固定叫 `knn_query` | |
| 145 | +- 图片向量 clause 名字固定叫 `image_knn_query` | |
| 146 | + | |
| 147 | +而文本召回那边,后续 fusion 代码约定会去读: | |
| 148 | +- 原始 query 的 named query:`base_query` | |
| 149 | +- 翻译 query 的 named query:`base_query_trans_*` | |
| 150 | + | |
| 151 | +也就是说,后面的粗排/精排/最终 rerank,并不是重新理解 ES score,而是从 `matched_queries` 里把这些命名子信号拆出来自己重算。 | |
| 152 | + | |
| 153 | +**Step 3:ES 召回** | |
| 154 | +在 [searcher.py:579](/data/saas-search/search/searcher.py#L579) 到 [searcher.py:627](/data/saas-search/search/searcher.py#L627)。 | |
| 155 | + | |
| 156 | +这里有个很关键的工程优化: | |
| 157 | +如果在 rerank window 内,第一次 ES 拉取时会把 `_source` 关掉,只取排序必需信号,见 [searcher.py:517](/data/saas-search/search/searcher.py#L517) 到 [searcher.py:523](/data/saas-search/search/searcher.py#L523)。 | |
| 158 | + | |
| 159 | +原因是: | |
| 160 | +- 粗排先只需要 `_score` 和 `matched_queries` | |
| 161 | +- 不需要一上来把 700 条完整商品详情都拉回来 | |
| 162 | +- 等粗排收窄后,再补 fine/final rerank 需要的字段 | |
| 163 | + | |
| 164 | +这是现在这条 pipeline 很核心的性能设计点。 | |
| 165 | + | |
| 166 | +**Step 4:粗排** | |
| 167 | +粗排入口在 [searcher.py:638](/data/saas-search/search/searcher.py#L638),真正的打分在 [rerank_client.py:348](/data/saas-search/search/rerank_client.py#L348) 的 `coarse_resort_hits()`。 | |
| 168 | + | |
| 169 | +粗排只看两类信号: | |
| 170 | +- `text_score` | |
| 171 | +- `knn_score` | |
| 172 | + | |
| 173 | +它们先都从统一 helper `_build_hit_signal_bundle()` 里拿,见 [rerank_client.py:246](/data/saas-search/search/rerank_client.py#L246)。 | |
| 174 | + | |
| 175 | +文本分怎么来,见 [rerank_client.py:200](/data/saas-search/search/rerank_client.py#L200): | |
| 176 | +- `source_score = matched_queries["base_query"]` | |
| 177 | +- `translation_score = max(base_query_trans_*)` | |
| 178 | +- `weighted_translation = 0.8 * translation_score` | |
| 179 | +- `primary_text = max(source, weighted_translation)` | |
| 180 | +- `support_text = 另一路` | |
| 181 | +- `text_score = primary_text + 0.25 * support_text` | |
| 182 | + | |
| 183 | +这就是一个 text dismax 思路: | |
| 184 | +原 query 是主路,翻译 query 是辅助路,但不是简单相加。 | |
| 185 | + | |
| 186 | +向量分怎么来,见 [rerank_client.py:156](/data/saas-search/search/rerank_client.py#L156): | |
| 187 | +- `text_knn_score` | |
| 188 | +- `image_knn_score` | |
| 189 | +- 分别乘自己的 weight | |
| 190 | +- 取强的一路做主路 | |
| 191 | +- 弱的一路按 `knn_tie_breaker` 做辅助 | |
| 192 | + | |
| 193 | +然后粗排融合公式在 [rerank_client.py:334](/data/saas-search/search/rerank_client.py#L334): | |
| 194 | +- `coarse_score = (text_score + text_bias)^text_exponent * (knn_score + knn_bias)^knn_exponent` | |
| 195 | + | |
| 196 | +配置定义在 [schema.py:124](/data/saas-search/config/schema.py#L124) 和 [config.yaml:231](/data/saas-search/config/config.yaml#L231)。 | |
| 197 | + | |
| 198 | +算完后: | |
| 199 | +- 写入 `hit["_coarse_score"]` | |
| 200 | +- 按 `_coarse_score` 排序 | |
| 201 | +- 留前 240,见 [searcher.py:645](/data/saas-search/search/searcher.py#L645) | |
| 202 | + | |
| 203 | +**Step 5:粗排后补字段 + SKU 选择** | |
| 204 | +粗排完以后,`searcher` 会按 doc template 反推 fine/final rerank 需要哪些 `_source` 字段,然后只补这些字段,见 [searcher.py:669](/data/saas-search/search/searcher.py#L669)。 | |
| 205 | + | |
| 206 | +之后才做 style SKU 选择,见 [searcher.py:696](/data/saas-search/search/searcher.py#L696)。 | |
| 207 | + | |
| 208 | +为什么放这里? | |
| 209 | +因为现在 fine rank 也是 reranker,它也要吃 title suffix。 | |
| 210 | +而 suffix 是 SKU 选择之后写到 hit 上的 `_style_rerank_suffix`。 | |
| 211 | +真正把 suffix 拼进 doc 文本的地方在 [rerank_client.py:65](/data/saas-search/search/rerank_client.py#L65) 到 [rerank_client.py:74](/data/saas-search/search/rerank_client.py#L74)。 | |
| 212 | + | |
| 213 | +所以顺序必须是: | |
| 214 | +- 先粗排 | |
| 215 | +- 再选 SKU | |
| 216 | +- 再用带 suffix 的 title 去跑 fine/final rerank | |
| 217 | + | |
| 218 | +**Step 6:精排** | |
| 219 | +入口在 [searcher.py:711](/data/saas-search/search/searcher.py#L711),实现是 [rerank_client.py:603](/data/saas-search/search/rerank_client.py#L603) 的 `run_lightweight_rerank()`。 | |
| 220 | + | |
| 221 | +它会做三件事: | |
| 222 | + | |
| 223 | +1. 用 `build_docs_from_hits()` 把每条商品变成 reranker 输入文本 | |
| 224 | +2. 用 `service_profile="fine"` 调轻量服务 | |
| 225 | +3. 不再只按 `fine_score` 排,而是按融合后的 `_fine_fused_score` 排 | |
| 226 | + | |
| 227 | +精排融合公式现在是: | |
| 228 | +- `fine_stage_score = fine_factor * text_factor * knn_factor * style_boost` | |
| 229 | + | |
| 230 | +具体公共计算在 [rerank_client.py:286](/data/saas-search/search/rerank_client.py#L286) 的 `_compute_multiplicative_fusion()`: | |
| 231 | +- `fine_factor = (fine_score + fine_bias)^fine_exponent` | |
| 232 | +- `text_factor = (text_score + text_bias)^text_exponent` | |
| 233 | +- `knn_factor = (knn_score + knn_bias)^knn_exponent` | |
| 234 | +- 如果命中了 selected SKU,再乘 style boost | |
| 235 | + | |
| 236 | +写回 hit 的字段见 [rerank_client.py:655](/data/saas-search/search/rerank_client.py#L655): | |
| 237 | +- `_fine_score` | |
| 238 | +- `_fine_fused_score` | |
| 239 | +- `_text_score` | |
| 240 | +- `_knn_score` | |
| 241 | + | |
| 242 | +排序逻辑在 [rerank_client.py:683](/data/saas-search/search/rerank_client.py#L683): | |
| 243 | +按 `_fine_fused_score` 降序排,然后留前 80,见 [searcher.py:727](/data/saas-search/search/searcher.py#L727)。 | |
| 244 | + | |
| 245 | +这就是你这次特别关心的点:现在 fine rank 已经不是“模型裸分排序”,而是“模型分 + ES 文本/KNN 信号融合后排序”。 | |
| 246 | + | |
| 247 | +**Step 7:最终 rerank** | |
| 248 | +入口在 [searcher.py:767](/data/saas-search/search/searcher.py#L767),实现是 [rerank_client.py:538](/data/saas-search/search/rerank_client.py#L538) 的 `run_rerank()`。 | |
| 249 | + | |
| 250 | +它和 fine rank 很像,但多了一个更重的模型分 `rerank_score`。 | |
| 251 | +最终公式是: | |
| 252 | + | |
| 253 | +- `final_score = rerank_factor * fine_factor * text_factor * knn_factor * style_boost` | |
| 254 | + | |
| 255 | +也就是: | |
| 256 | +- fine rank 产生的 `fine_score` 不会丢 | |
| 257 | +- 到最终 rerank 时,它会继续作为一个乘法项参与最终融合 | |
| 258 | + | |
| 259 | +这个逻辑在 [rerank_client.py:468](/data/saas-search/search/rerank_client.py#L468) 到 [rerank_client.py:476](/data/saas-search/search/rerank_client.py#L476)。 | |
| 260 | + | |
| 261 | +算完后写入: | |
| 262 | +- `_rerank_score` | |
| 263 | +- `_fused_score` | |
| 264 | + | |
| 265 | +然后按 `_fused_score` 排序,见 [rerank_client.py:531](/data/saas-search/search/rerank_client.py#L531)。 | |
| 266 | + | |
| 267 | +这里你可以把它理解成: | |
| 268 | +- fine rank 负责“轻量快速筛一遍,把 240 缩成 80” | |
| 269 | +- 最终 rerank 负责“用更贵模型做最终拍板” | |
| 270 | +- 但最终拍板时,不会忽略 fine rank 结果,而是把 fine score 当成一个先验信号保留进去 | |
| 271 | + | |
| 272 | +**Step 8:分页与字段补全** | |
| 273 | +多阶段排序只在头部窗口内完成。 | |
| 274 | +真正返回给用户前,在 [searcher.py:828](/data/saas-search/search/searcher.py#L828) 之后还会做两件事: | |
| 275 | + | |
| 276 | +- 先按 `from_:from_+size` 对最终 80 条切片 | |
| 277 | +- 再按用户原始 `_source` 需求补回页面真正要显示的字段,见 [searcher.py:859](/data/saas-search/search/searcher.py#L859) | |
| 278 | + | |
| 279 | +所以这条链路是“三次不同目的的数据访问”: | |
| 280 | + | |
| 281 | +- 第一次 ES:只要排序信号 | |
| 282 | +- 第二次按 id 回填:只要 fine/final rerank 需要字段 | |
| 283 | +- 第三次按页面 ids 回填:只要最终页面显示字段 | |
| 284 | + | |
| 285 | +这也是为什么它性能上比“一次全量拉 700 条完整文档”更合理。 | |
| 286 | + | |
| 287 | +**Step 9:结果格式化与 debug funnel** | |
| 288 | +最后在 [searcher.py:906](/data/saas-search/search/searcher.py#L906) 进入结果处理。 | |
| 289 | +这里会把每个商品的阶段信息组装成 `ranking_funnel`,见 [searcher.py:1068](/data/saas-search/search/searcher.py#L1068): | |
| 290 | + | |
| 291 | +- `es_recall` | |
| 292 | +- `coarse_rank` | |
| 293 | +- `fine_rank` | |
| 294 | +- `rerank` | |
| 295 | +- `final_page` | |
| 296 | + | |
| 297 | +其中: | |
| 298 | +- coarse stage 主要保留 text/translation/knn 的拆分信号 | |
| 299 | +- fine/rerank stage 现在都保留 `fusion_inputs`、`fusion_factors`、`fusion_summary` | |
| 300 | +- `fusion_summary` 来自真实计算过程本身,见 [rerank_client.py:265](/data/saas-search/search/rerank_client.py#L265) | |
| 301 | + | |
| 302 | +这点很重要,因为现在“实际排序逻辑”和“debug 展示逻辑”是同源的,不是两套各写一份。 | |
| 303 | + | |
| 304 | +**一句话总结这条 pipeline** | |
| 305 | +这条 pipeline 的本质是: | |
| 306 | + | |
| 307 | +- ES 负责便宜的大范围召回 | |
| 308 | +- 粗排负责只靠 ES 内置信号先做一次结构化筛选 | |
| 309 | +- style SKU 选择负责把商品文本改造成更适合 reranker 理解的输入 | |
| 310 | +- fine rank 负责用轻模型把候选进一步压缩 | |
| 311 | +- final rerank 负责用重模型做最终判定 | |
| 312 | +- 每一层都尽量复用前一层信号,而不是推翻重来 | |
| 313 | + | |
| 314 | +如果你愿意,我下一步可以继续按“一个具体 query 的真实流转样例”来讲,比如假设用户搜 `black dress`,我把它从 `parsed_query`、ES named queries、coarse/fine/final 的每个分数怎么出来,完整手推一遍。 | |
| 0 | 315 | \ No newline at end of file | ... | ... |
docs/suggestion索引构建.md
| ... | ... | @@ -169,7 +169,7 @@ |
| 169 | 169 | |
| 170 | 170 | ##### 4.1 从商品索引收集 title / qanchors / tags(Step 1) |
| 171 | 171 | |
| 172 | - - 遍历店铺的所有商品:获取每个商品的 `"spu_id"`, `"title"`, `"qanchors"`, `"tags"`(按 `spu_id`、`id.keyword` 升序,便于 `search_after` 稳定分页) | |
| 172 | + - 遍历店铺的所有商品:获取每个商品的 `"spu_id"`, `"title"`, `"qanchors"`, `"enriched_tags"`(按 `spu_id`、`id.keyword` 升序,便于 `search_after` 稳定分页) | |
| 173 | 173 | |
| 174 | 174 | - 对每个商品文档: |
| 175 | 175 | |
| ... | ... | @@ -207,7 +207,7 @@ |
| 207 | 207 | - **qanchors 处理**: |
| 208 | 208 | - `qanchors` 字段同样为多语言对象: |
| 209 | 209 | ```json |
| 210 | - "qanchors": { "en": "...", "zh": "..." } | |
| 210 | + "qanchors": { "en": ["slim fit", "sporty casual"], "zh": ["修身", "显瘦"] } | |
| 211 | 211 | ``` |
| 212 | 212 | - 取 `q_raw = qanchors[lang]` |
| 213 | 213 | - 通过 `_split_qanchors(q_raw)` 拆分为若干字符串: |
| ... | ... | @@ -217,10 +217,14 @@ |
| 217 | 217 | - `text_norm = _normalize_text(q_text)`,再用 `_looks_noise` 过滤 |
| 218 | 218 | - 同样按 `(lang, text_norm)` 合并为 `SuggestionCandidate`,调用 `add_product("qanchor", spu_id=product_id)`。 |
| 219 | 219 | |
| 220 | - 4. **tags 处理**(与 `index_languages` 循环并列,每个商品只做一次): | |
| 221 | - - `tags` 可为字符串数组,或逗号等分隔的单个字符串;经 `_iter_product_tags` 展开为若干条。 | |
| 222 | - - 每条 tag **无语言字段**:使用 `query.query_parser.detect_text_language_for_suggestions`(与 `QueryParser` 相同的 `LanguageDetector`)判定语言,并约束在租户的 `index_languages` 内。 | |
| 223 | - - 通过 `_looks_noise` 后按 `(detected_lang, text_norm)` 合并,调用 `add_product("tag", spu_id=product_id)`。 | |
| 220 | + 4. **enriched_tags 处理**(与 `index_languages` 循环并列): | |
| 221 | + - `enriched_tags` 现为多语言对象,例如: | |
| 222 | + ```json | |
| 223 | + "enriched_tags": { "en": ["Classic", "ribbed neckline"], "zh": ["辣妹风"] } | |
| 224 | + ``` | |
| 225 | + - 优先读取 `enriched_tags[lang]`,每个值可为字符串数组,或逗号等分隔的单个字符串;经 `_iter_product_tags` 展开为若干条。 | |
| 226 | + - 对历史旧数据,若 `enriched_tags` 仍是单层字符串 / 数组,则继续走语言检测兜底,并约束在租户的 `index_languages` 内。 | |
| 227 | + - 通过 `_looks_noise` 后按 `(lang, text_norm)` 合并,调用 `add_product("tag", spu_id=product_id)`。 | |
| 224 | 228 | |
| 225 | 229 | ##### 4.2 从查询日志收集用户 query(Step 2) |
| 226 | 230 | ... | ... |
docs/工作总结-微服务性能优化与架构.md
| ... | ... | @@ -95,10 +95,10 @@ instruction: "Given a shopping query, rank product titles by relevance" |
| 95 | 95 | |
| 96 | 96 | ### 5. 内容理解字段(支撑 Suggest) |
| 97 | 97 | |
| 98 | -**能力**:支持根据商品标题批量生成 **qanchors**(锚文本)、**semantic_attributes**、**tags**,供索引与 suggest 使用。 | |
| 98 | +**能力**:支持根据商品标题批量生成 **qanchors**(锚文本)、**enriched_attributes**、**enriched_tags**,供索引与 suggest 使用。 | |
| 99 | 99 | |
| 100 | 100 | **具体内容**: |
| 101 | -- **接口**:`POST /indexer/enrich-content`(Indexer 服务端口 **6004**)。请求体为 `items` 数组,每项含 `spu_id`、`title`(必填)及可选多语言标题等;单次请求最多 **50 条**,建议批量调用。响应 `results` 与 `items` 一一对应,每项含 `spu_id`、`qanchors`(按语言键,如 `qanchors.zh`、`qanchors.en`,逗号分隔短语)、`semantic_attributes`、`tags`。 | |
| 101 | +- **接口**:`POST /indexer/enrich-content`(Indexer 服务端口 **6004**)。请求体为 `items` 数组,每项含 `spu_id`、`title`(必填)及可选多语言标题等;单次请求最多 **50 条**,建议批量调用。响应 `results` 与 `items` 一一对应,每项含 `spu_id`、`qanchors`(按语言键,如 `qanchors.zh`、`qanchors.en`,逗号分隔短语)、`enriched_attributes`、`tags`。 | |
| 102 | 102 | - **索引侧**:微服务组合方式下,调用方先拿不含 qanchors/tags 的 doc,再调用本接口补齐后写入 ES 的 `qanchors.{lang}` 等字段;索引 transformer(`indexer/document_transformer.py`、`indexer/product_enrich.py`)内也可在构建 doc 时调用内容理解逻辑,写入 `qanchors.{lang}`。 |
| 103 | 103 | - **Suggest 侧**:`suggestion/builder.py` 从 ES 商品索引读取 `_source: ["id", "spu_id", "title", "qanchors"]`,对 `qanchors.{lang}` 用 `_split_qanchors` 拆成词条,以 `source="qanchor"` 加入候选,排序时 `qanchor` 权重大于纯 title(`add_product("qanchor", ...)`);suggest 配置中 `sources: ["query_log", "qanchor"]` 表示候选来源包含 qanchor。 |
| 104 | 104 | - **实现与依赖**:内容理解内部使用大模型(需 `DASHSCOPE_API_KEY`),支持多语言与 Redis 缓存(如 `product_anchors`);逻辑与 `indexer/product_enrich` 一致。 | ... | ... |
docs/常用查询 - ES.md
| 1 | 1 | |
| 2 | 2 | |
| 3 | -# 查看所有租户索引 | |
| 4 | - curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/_cat/indices/search_products_tenant_*?v' | |
| 3 | +## Elasticsearch 排查流程 | |
| 5 | 4 | |
| 6 | -# ====================================== | |
| 7 | -# 租户相关 | |
| 8 | -# ====================================== | |
| 9 | -# | |
| 10 | -# 说明:索引已按租户拆分为 search_products_tenant_{tenant_id}, | |
| 11 | -# 一般情况下不需要在查询中再按 tenant_id 过滤(可选保留用于排查)。 | |
| 5 | +### 1. 集群健康状态 | |
| 6 | + | |
| 7 | +```bash | |
| 8 | +# 集群整体健康(green / yellow / red) | |
| 9 | +curl -s -u 'saas:4hOaLaf41y2VuI8y' 'http://127.0.0.1:9200/_cluster/health?pretty' | |
| 10 | +``` | |
| 11 | + | |
| 12 | +### 2. 索引概览 | |
| 13 | + | |
| 14 | +```bash | |
| 15 | +# 查看所有租户索引状态与体积 | |
| 16 | +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/_cat/indices/search_products_tenant_*?v' | |
| 17 | + | |
| 18 | +# 或查看全部索引 | |
| 19 | +curl -s -u 'saas:4hOaLaf41y2VuI8y' 'http://127.0.0.1:9200/_cat/indices?v' | |
| 20 | +``` | |
| 21 | + | |
| 22 | +### 3. 分片分布 | |
| 23 | + | |
| 24 | +```bash | |
| 25 | +# 查看分片在各节点的分布情况 | |
| 26 | +curl -s -u 'saas:4hOaLaf41y2VuI8y' 'http://127.0.0.1:9200/_cat/shards?v' | |
| 27 | +``` | |
| 28 | + | |
| 29 | +### 4. 分配诊断(如有异常) | |
| 30 | + | |
| 31 | +```bash | |
| 32 | +# 当 health 非 green 或 shards 状态异常时,定位具体原因 | |
| 33 | +curl -s -u 'saas:4hOaLaf41y2VuI8y' -X POST 'http://127.0.0.1:9200/_cluster/allocation/explain?pretty' \ | |
| 34 | + -H 'Content-Type: application/json' \ | |
| 35 | + -d '{"index":"search_products_tenant_163","shard":0,"primary":true}' | |
| 36 | +``` | |
| 37 | + | |
| 38 | +> 典型结论示例:`disk_threshold` — 磁盘超过高水位,新分片禁止分配。 | |
| 39 | + | |
| 40 | +### 5. 系统层检查 | |
| 41 | + | |
| 42 | +```bash | |
| 43 | +# 服务状态 | |
| 44 | +sudo systemctl status elasticsearch | |
| 45 | + | |
| 46 | +# 磁盘空间 | |
| 47 | +df -h | |
| 48 | + | |
| 49 | +# ES 数据目录占用 | |
| 50 | +du -sh /var/lib/elasticsearch/ | |
| 51 | +``` | |
| 52 | + | |
| 53 | +### 6. 配置与日志 | |
| 54 | + | |
| 55 | +```bash | |
| 56 | +# 配置文件 | |
| 57 | +cat /etc/elasticsearch/elasticsearch.yml | |
| 58 | + | |
| 59 | +# 实时日志 | |
| 60 | +journalctl -u elasticsearch -f | |
| 61 | +``` | |
| 62 | + | |
| 63 | +--- | |
| 64 | + | |
| 65 | +### 快速排查路径 | |
| 66 | + | |
| 67 | +``` | |
| 68 | +_cluster/health → 确认集群状态(green/yellow/red) | |
| 69 | + ↓ | |
| 70 | +_cat/indices → 检查索引体积与状态 | |
| 71 | + ↓ | |
| 72 | +_cat/shards → 查看分片分布 | |
| 73 | + ↓ | |
| 74 | +_cluster/allocation/explain → 定位分配问题(如需要) | |
| 75 | + ↓ | |
| 76 | +systemctl / df / 日志 → 系统层验证 | |
| 77 | +``` | |
| 78 | + | |
| 79 | +--- | |
| 80 | +以下为常用 Elasticsearch 查询的汇总,按用途分类整理: | |
| 81 | + | |
| 82 | +--- | |
| 83 | + | |
| 84 | +# Elasticsearch 查询集合 | |
| 85 | + | |
| 86 | +## 租户相关 | |
| 87 | + | |
| 88 | +> **说明**:索引已按租户拆分为 `search_products_tenant_{tenant_id}`,一般情况下不需要在查询中再按 `tenant_id` 过滤(可选保留用于排查)。 | |
| 89 | + | |
| 90 | +--- | |
| 12 | 91 | |
| 13 | 92 | ### 1. 根据 tenant_id / spu_id 查询 |
| 14 | -curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_search?pretty' -H 'Content-Type: application/json' -d '{ | |
| 93 | + | |
| 94 | +#### 查询指定 spu_id 的商品(返回 title) | |
| 95 | +```bash | |
| 96 | +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_search?pretty' -H 'Content-Type: application/json' -d '{ | |
| 15 | 97 | "size": 11, |
| 16 | 98 | "_source": ["title"], |
| 17 | 99 | "query": { |
| 18 | - "bool": { | |
| 19 | - "filter": [ | |
| 20 | - { "term": {"spu_id" : 206150} } | |
| 21 | - ] | |
| 22 | - } | |
| 100 | + "bool": { | |
| 101 | + "filter": [ | |
| 102 | + { "term": {"spu_id" : 206150} } | |
| 103 | + ] | |
| 104 | + } | |
| 23 | 105 | } |
| 24 | - }' | |
| 25 | - | |
| 26 | - | |
| 27 | -curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_search?pretty' -H 'Content-Type: application/json' -d '{ | |
| 28 | - "size": 100, | |
| 29 | - "_source": ["title"], | |
| 30 | - "query": { | |
| 31 | - "match_all": {} | |
| 32 | - } | |
| 33 | 106 | }' |
| 107 | +``` | |
| 34 | 108 | |
| 35 | - | |
| 36 | -curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_search?pretty' -H 'Content-Type: application/json' -d '{ | |
| 37 | - "size": 5, | |
| 38 | - "_source": ["title", "keywords", "tags"], | |
| 39 | - "query": { | |
| 40 | - "bool": { | |
| 41 | - "filter": [ | |
| 42 | - { "term": { "spu_id": "223167" } } | |
| 43 | - ] | |
| 109 | +#### 查询所有商品(返回 title) | |
| 110 | +```bash | |
| 111 | +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_search?pretty' -H 'Content-Type: application/json' -d '{ | |
| 112 | + "size": 100, | |
| 113 | + "_source": ["title"], | |
| 114 | + "query": { | |
| 115 | + "match_all": {} | |
| 44 | 116 | } |
| 45 | - } | |
| 46 | 117 | }' |
| 118 | +``` | |
| 47 | 119 | |
| 48 | - | |
| 49 | -curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_search?pretty' -H 'Content-Type: application/json' -d '{ | |
| 50 | - "size": 1, | |
| 51 | - "_source": ["title", "keywords", "tags"], | |
| 52 | - "query": { | |
| 53 | - "bool": { | |
| 54 | - "must": [ | |
| 55 | - { | |
| 56 | - "match": { | |
| 57 | - "title.en": { | |
| 58 | - "query": "Floerns Women Gothic Graphic Ribbed Strapless Tube Top Asymmetrical Ruched Bandeau Tops" | |
| 59 | - } | |
| 60 | - } | |
| 120 | +#### 查询指定 spu_id 的商品(返回 title、keywords、tags) | |
| 121 | +```bash | |
| 122 | +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_search?pretty' -H 'Content-Type: application/json' -d '{ | |
| 123 | + "size": 5, | |
| 124 | + "_source": ["title", "keywords", "tags"], | |
| 125 | + "query": { | |
| 126 | + "bool": { | |
| 127 | + "filter": [ | |
| 128 | + { "term": { "spu_id": "223167" } } | |
| 129 | + ] | |
| 61 | 130 | } |
| 62 | - ], | |
| 63 | - "filter": [ | |
| 64 | - { "terms": { "tags": ["女装", "派对"] } } | |
| 65 | - ] | |
| 66 | 131 | } |
| 67 | - } | |
| 68 | 132 | }' |
| 133 | +``` | |
| 69 | 134 | |
| 135 | +#### 组合查询:匹配标题 + 过滤标签 | |
| 136 | +```bash | |
| 137 | +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_search?pretty' -H 'Content-Type: application/json' -d '{ | |
| 138 | + "size": 1, | |
| 139 | + "_source": ["title", "keywords", "tags"], | |
| 140 | + "query": { | |
| 141 | + "bool": { | |
| 142 | + "must": [ | |
| 143 | + { | |
| 144 | + "match": { | |
| 145 | + "title.en": { | |
| 146 | + "query": "Floerns Women Gothic Graphic Ribbed Strapless Tube Top Asymmetrical Ruched Bandeau Tops" | |
| 147 | + } | |
| 148 | + } | |
| 149 | + } | |
| 150 | + ], | |
| 151 | + "filter": [ | |
| 152 | + { "terms": { "tags": ["女装", "派对"] } } | |
| 153 | + ] | |
| 154 | + } | |
| 155 | + } | |
| 156 | +}' | |
| 157 | +``` | |
| 70 | 158 | |
| 71 | -curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_search?pretty' -H 'Content-Type: application/json' -d '{ | |
| 159 | +#### 组合查询:匹配标题 + 过滤租户(冗余示例) | |
| 160 | +```bash | |
| 161 | +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_search?pretty' -H 'Content-Type: application/json' -d '{ | |
| 72 | 162 | "size": 1, |
| 73 | 163 | "_source": ["title"], |
| 74 | 164 | "query": { |
| 75 | - "bool": { | |
| 76 | - "must": [ | |
| 77 | - { | |
| 78 | - "match": { | |
| 79 | - "title.en": { | |
| 80 | - "query": "Floerns Women Gothic Graphic Ribbed Strapless Tube Top Asymmetrical Ruched Bandeau Tops" | |
| 81 | - } | |
| 82 | - } | |
| 83 | - } | |
| 84 | - ], | |
| 85 | - "filter": [ | |
| 86 | - { "term": { "tenant_id": "170" } } | |
| 87 | - ] | |
| 88 | - } | |
| 165 | + "bool": { | |
| 166 | + "must": [ | |
| 167 | + { | |
| 168 | + "match": { | |
| 169 | + "title.en": { | |
| 170 | + "query": "Floerns Women Gothic Graphic Ribbed Strapless Tube Top Asymmetrical Ruched Bandeau Tops" | |
| 171 | + } | |
| 172 | + } | |
| 173 | + } | |
| 174 | + ], | |
| 175 | + "filter": [ | |
| 176 | + { "term": { "tenant_id": "170" } } | |
| 177 | + ] | |
| 178 | + } | |
| 89 | 179 | } |
| 90 | 180 | }' |
| 181 | +``` | |
| 91 | 182 | |
| 92 | -Curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_analyze' -H 'Content-Type: application/json' -d '{ | |
| 93 | - "analyzer": "index_ik", | |
| 94 | - "text": "14寸第4代-眼珠实身冰雪公仔带手动大推车,搪胶雪宝宝" | |
| 95 | -}' | |
| 183 | +--- | |
| 184 | + | |
| 185 | +### 2. 分析器测试 | |
| 96 | 186 | |
| 97 | -curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_analyze' -H 'Content-Type: application/json' -d '{ | |
| 98 | - "analyzer": "query_ik", | |
| 99 | - "text": "14寸第4代-眼珠实身冰雪公仔带手动大推车,搪胶雪宝宝" | |
| 187 | +#### 测试 index_ik 分析器 | |
| 188 | +```bash | |
| 189 | +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_analyze' -H 'Content-Type: application/json' -d '{ | |
| 190 | + "analyzer": "index_ik", | |
| 191 | + "text": "14寸第4代-眼珠实身冰雪公仔带手动大推车,搪胶雪宝宝" | |
| 100 | 192 | }' |
| 193 | +``` | |
| 101 | 194 | |
| 102 | -curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_search?pretty' -H 'Content-Type: application/json' -d '{ | |
| 103 | - "size": 100, | |
| 104 | - "from": 0, | |
| 105 | - "query": { | |
| 106 | - "bool": { | |
| 107 | - "must": [ | |
| 108 | - { | |
| 109 | - "multi_match": { | |
| 110 | - "_name": "base_query", | |
| 111 | - "fields": [ | |
| 112 | - "title.zh^3.0", | |
| 113 | - "brief.zh^1.5", | |
| 114 | - "description.zh", | |
| 115 | - "vendor.zh^1.5", | |
| 116 | - "tags", | |
| 117 | - "category_path.zh^1.5", | |
| 118 | - "category_name_text.zh^1.5", | |
| 119 | - "option1_values^0.5" | |
| 120 | - ], | |
| 121 | - "minimum_should_match": "75%", | |
| 122 | - "operator": "AND", | |
| 123 | - "query": "裙", | |
| 124 | - "tie_breaker": 0.9 | |
| 125 | - } | |
| 126 | - } | |
| 127 | - ], | |
| 128 | - "filter": [ | |
| 129 | - { | |
| 130 | - "match_all": {} | |
| 131 | - } | |
| 132 | - ] | |
| 133 | - } | |
| 134 | - } | |
| 195 | +#### 测试 query_ik 分析器 | |
| 196 | +```bash | |
| 197 | +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_analyze' -H 'Content-Type: application/json' -d '{ | |
| 198 | + "analyzer": "query_ik", | |
| 199 | + "text": "14寸第4代-眼珠实身冰雪公仔带手动大推车,搪胶雪宝宝" | |
| 135 | 200 | }' |
| 201 | +``` | |
| 202 | + | |
| 203 | +--- | |
| 136 | 204 | |
| 137 | -curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_search?pretty' -H 'Content-Type: application/json' -d '{ | |
| 138 | - "size": 1, | |
| 139 | - "from": 0, | |
| 140 | - "query": { | |
| 141 | - "bool": { | |
| 142 | - "must": [ | |
| 143 | - { | |
| 144 | - "multi_match": { | |
| 145 | - "_name": "base_query", | |
| 146 | - "fields": [ | |
| 147 | - "title.zh^3.0", | |
| 148 | - "brief.zh^1.5", | |
| 149 | - "description.zh", | |
| 150 | - "vendor.zh^1.5", | |
| 151 | - "tags", | |
| 152 | - "category_path.zh^1.5", | |
| 153 | - "category_name_text.zh^1.5", | |
| 154 | - "option1_values^0.5" | |
| 205 | +### 3. 多字段搜索 + 聚合(综合分面示例) | |
| 206 | + | |
| 207 | +#### 多字段匹配 + 聚合(category1、color、size、material) | |
| 208 | +```bash | |
| 209 | +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_search?pretty' -H 'Content-Type: application/json' -d '{ | |
| 210 | + "size": 1, | |
| 211 | + "from": 0, | |
| 212 | + "query": { | |
| 213 | + "bool": { | |
| 214 | + "must": [ | |
| 215 | + { | |
| 216 | + "multi_match": { | |
| 217 | + "_name": "base_query", | |
| 218 | + "fields": [ | |
| 219 | + "title.zh^3.0", | |
| 220 | + "brief.zh^1.5", | |
| 221 | + "description.zh", | |
| 222 | + "vendor.zh^1.5", | |
| 223 | + "tags", | |
| 224 | + "category_path.zh^1.5", | |
| 225 | + "category_name_text.zh^1.5", | |
| 226 | + "option1_values^0.5" | |
| 227 | + ], | |
| 228 | + "minimum_should_match": "75%", | |
| 229 | + "operator": "AND", | |
| 230 | + "query": "裙", | |
| 231 | + "tie_breaker": 0.9 | |
| 232 | + } | |
| 233 | + } | |
| 155 | 234 | ], |
| 156 | - "minimum_should_match": "75%", | |
| 157 | - "operator": "AND", | |
| 158 | - "query": "裙", | |
| 159 | - "tie_breaker": 0.9 | |
| 160 | - } | |
| 161 | - } | |
| 162 | - ], | |
| 163 | - "filter": [ | |
| 164 | - { "match_all": {} } | |
| 165 | - ] | |
| 166 | - } | |
| 167 | - }, | |
| 168 | - "aggs": { | |
| 169 | - "category1_name_facet": { | |
| 170 | - "terms": { | |
| 171 | - "field": "category1_name", | |
| 172 | - "size": 15, | |
| 173 | - "order": { | |
| 174 | - "_count": "desc" | |
| 235 | + "filter": [ | |
| 236 | + { "match_all": {} } | |
| 237 | + ] | |
| 175 | 238 | } |
| 176 | - } | |
| 177 | 239 | }, |
| 178 | - "specifications_color_facet": { | |
| 179 | - "nested": { | |
| 180 | - "path": "specifications" | |
| 181 | - }, | |
| 182 | - "aggs": { | |
| 183 | - "filter_by_name": { | |
| 184 | - "filter": { | |
| 185 | - "term": { | |
| 186 | - "specifications.name": "color" | |
| 240 | + "aggs": { | |
| 241 | + "category1_name_facet": { | |
| 242 | + "terms": { | |
| 243 | + "field": "category1_name", | |
| 244 | + "size": 15, | |
| 245 | + "order": { "_count": "desc" } | |
| 187 | 246 | } |
| 188 | - }, | |
| 189 | - "aggs": { | |
| 190 | - "value_counts": { | |
| 191 | - "terms": { | |
| 192 | - "field": "specifications.value", | |
| 193 | - "size": 20, | |
| 194 | - "order": { | |
| 195 | - "_count": "desc" | |
| 247 | + }, | |
| 248 | + "specifications_color_facet": { | |
| 249 | + "nested": { "path": "specifications" }, | |
| 250 | + "aggs": { | |
| 251 | + "filter_by_name": { | |
| 252 | + "filter": { "term": { "specifications.name": "color" } }, | |
| 253 | + "aggs": { | |
| 254 | + "value_counts": { | |
| 255 | + "terms": { | |
| 256 | + "field": "specifications.value", | |
| 257 | + "size": 20, | |
| 258 | + "order": { "_count": "desc" } | |
| 259 | + } | |
| 260 | + } | |
| 261 | + } | |
| 196 | 262 | } |
| 197 | - } | |
| 198 | - } | |
| 199 | - } | |
| 200 | - } | |
| 201 | - } | |
| 202 | - }, | |
| 203 | - "specifications_size_facet": { | |
| 204 | - "nested": { | |
| 205 | - "path": "specifications" | |
| 206 | - }, | |
| 207 | - "aggs": { | |
| 208 | - "filter_by_name": { | |
| 209 | - "filter": { | |
| 210 | - "term": { | |
| 211 | - "specifications.name": "size" | |
| 212 | 263 | } |
| 213 | - }, | |
| 214 | - "aggs": { | |
| 215 | - "value_counts": { | |
| 216 | - "terms": { | |
| 217 | - "field": "specifications.value", | |
| 218 | - "size": 15, | |
| 219 | - "order": { | |
| 220 | - "_count": "desc" | |
| 264 | + }, | |
| 265 | + "specifications_size_facet": { | |
| 266 | + "nested": { "path": "specifications" }, | |
| 267 | + "aggs": { | |
| 268 | + "filter_by_name": { | |
| 269 | + "filter": { "term": { "specifications.name": "size" } }, | |
| 270 | + "aggs": { | |
| 271 | + "value_counts": { | |
| 272 | + "terms": { | |
| 273 | + "field": "specifications.value", | |
| 274 | + "size": 15, | |
| 275 | + "order": { "_count": "desc" } | |
| 276 | + } | |
| 277 | + } | |
| 278 | + } | |
| 221 | 279 | } |
| 222 | - } | |
| 223 | - } | |
| 224 | - } | |
| 225 | - } | |
| 226 | - } | |
| 227 | - }, | |
| 228 | - "specifications_material_facet": { | |
| 229 | - "nested": { | |
| 230 | - "path": "specifications" | |
| 231 | - }, | |
| 232 | - "aggs": { | |
| 233 | - "filter_by_name": { | |
| 234 | - "filter": { | |
| 235 | - "term": { | |
| 236 | - "specifications.name": "material" | |
| 237 | 280 | } |
| 238 | - }, | |
| 239 | - "aggs": { | |
| 240 | - "value_counts": { | |
| 241 | - "terms": { | |
| 242 | - "field": "specifications.value", | |
| 243 | - "size": 10, | |
| 244 | - "order": { | |
| 245 | - "_count": "desc" | |
| 281 | + }, | |
| 282 | + "specifications_material_facet": { | |
| 283 | + "nested": { "path": "specifications" }, | |
| 284 | + "aggs": { | |
| 285 | + "filter_by_name": { | |
| 286 | + "filter": { "term": { "specifications.name": "material" } }, | |
| 287 | + "aggs": { | |
| 288 | + "value_counts": { | |
| 289 | + "terms": { | |
| 290 | + "field": "specifications.value", | |
| 291 | + "size": 10, | |
| 292 | + "order": { "_count": "desc" } | |
| 293 | + } | |
| 294 | + } | |
| 295 | + } | |
| 246 | 296 | } |
| 247 | - } | |
| 248 | 297 | } |
| 249 | - } | |
| 250 | 298 | } |
| 251 | - } | |
| 252 | 299 | } |
| 253 | - } | |
| 254 | 300 | }' |
| 301 | +``` | |
| 302 | + | |
| 303 | +--- | |
| 255 | 304 | |
| 305 | +### 4. 通用查询(通用索引示例) | |
| 306 | + | |
| 307 | +#### 查询所有 | |
| 308 | +```text | |
| 256 | 309 | GET /search_products_tenant_2/_search |
| 257 | 310 | { |
| 258 | - "query": { | |
| 259 | - "match_all": {} | |
| 260 | - } | |
| 311 | + "query": { | |
| 312 | + "match_all": {} | |
| 313 | + } | |
| 261 | 314 | } |
| 315 | +``` | |
| 262 | 316 | |
| 263 | - | |
| 264 | -curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_search?pretty' -H 'Content-Type: application/json' -d '{ | |
| 317 | +#### 按 spu_id 查询(通用索引) | |
| 318 | +```bash | |
| 319 | +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_search?pretty' -H 'Content-Type: application/json' -d '{ | |
| 265 | 320 | "size": 5, |
| 266 | 321 | "query": { |
| 267 | - "bool": { | |
| 268 | - "filter": [ | |
| 269 | - { "term": { "spu_id": "74123" } } | |
| 270 | - ] | |
| 271 | - } | |
| 322 | + "bool": { | |
| 323 | + "filter": [ | |
| 324 | + { "term": { "spu_id": "74123" } } | |
| 325 | + ] | |
| 326 | + } | |
| 272 | 327 | } |
| 273 | - }' | |
| 328 | +}' | |
| 329 | +``` | |
| 274 | 330 | |
| 331 | +--- | |
| 275 | 332 | |
| 276 | -### 2. 统计租户的总文档数 | |
| 333 | +### 5. 统计租户总文档数 | |
| 334 | + | |
| 335 | +```bash | |
| 277 | 336 | curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_count?pretty' -H 'Content-Type: application/json' -d '{ |
| 278 | - "query": { | |
| 279 | - "match_all": {} | |
| 280 | - } | |
| 337 | + "query": { | |
| 338 | + "match_all": {} | |
| 339 | + } | |
| 281 | 340 | }' |
| 341 | +``` | |
| 282 | 342 | |
| 343 | +--- | |
| 283 | 344 | |
| 284 | -# ====================================== | |
| 285 | -# 分面数据诊断相关查询 | |
| 286 | -# ====================================== | |
| 345 | +## 分面数据诊断相关查询 | |
| 287 | 346 | |
| 288 | -## 1. 检查ES文档的分面字段数据 | |
| 347 | +### 1. 检查 ES 文档的分面字段数据 | |
| 289 | 348 | |
| 290 | -### 1.1 查询特定租户的商品,显示分面相关字段 | |
| 349 | +#### 1.1 查询特定租户的商品,显示分面相关字段 | |
| 350 | +```bash | |
| 291 | 351 | curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ |
| 292 | - "query": { | |
| 293 | - "term": { | |
| 294 | - "tenant_id": "162" | |
| 295 | - } | |
| 296 | - }, | |
| 297 | - "size": 1, | |
| 298 | - "_source": [ | |
| 299 | - "spu_id", | |
| 300 | - "title", | |
| 301 | - "category1_name", | |
| 302 | - "category2_name", | |
| 303 | - "category3_name", | |
| 304 | - "specifications", | |
| 305 | - "option1_name", | |
| 306 | - "option2_name", | |
| 307 | - "option3_name" | |
| 308 | - ] | |
| 352 | + "query": { | |
| 353 | + "term": { "tenant_id": "162" } | |
| 354 | + }, | |
| 355 | + "size": 1, | |
| 356 | + "_source": [ | |
| 357 | + "spu_id", "title", "category1_name", "category2_name", | |
| 358 | + "category3_name", "specifications", "option1_name", | |
| 359 | + "option2_name", "option3_name" | |
| 360 | + ] | |
| 309 | 361 | }' |
| 362 | +``` | |
| 310 | 363 | |
| 311 | -### 1.2 验证category1_name字段是否有数据 | |
| 364 | +#### 1.2 验证 category1_name 字段是否有数据 | |
| 365 | +```bash | |
| 312 | 366 | curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ |
| 313 | - "query": { | |
| 314 | - "bool": { | |
| 315 | - "filter": [ | |
| 316 | - { "term": { "tenant_id": "162" } }, | |
| 317 | - { "exists": { "field": "category1_name" } } | |
| 318 | - ] | |
| 319 | - } | |
| 320 | - }, | |
| 321 | - "size": 0 | |
| 367 | + "query": { | |
| 368 | + "bool": { | |
| 369 | + "filter": [ | |
| 370 | + { "term": { "tenant_id": "162" } }, | |
| 371 | + { "exists": { "field": "category1_name" } } | |
| 372 | + ] | |
| 373 | + } | |
| 374 | + }, | |
| 375 | + "size": 0 | |
| 322 | 376 | }' |
| 377 | +``` | |
| 323 | 378 | |
| 324 | -### 1.3 验证specifications字段是否有数据 | |
| 379 | +#### 1.3 验证 specifications 字段是否有数据 | |
| 380 | +```bash | |
| 325 | 381 | curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ |
| 326 | - "query": { | |
| 327 | - "bool": { | |
| 328 | - "filter": [ | |
| 329 | - { "term": { "tenant_id": "162" } }, | |
| 330 | - { "exists": { "field": "specifications" } } | |
| 331 | - ] | |
| 332 | - } | |
| 333 | - }, | |
| 334 | - "size": 0 | |
| 382 | + "query": { | |
| 383 | + "bool": { | |
| 384 | + "filter": [ | |
| 385 | + { "term": { "tenant_id": "162" } }, | |
| 386 | + { "exists": { "field": "specifications" } } | |
| 387 | + ] | |
| 388 | + } | |
| 389 | + }, | |
| 390 | + "size": 0 | |
| 335 | 391 | }' |
| 392 | +``` | |
| 336 | 393 | |
| 337 | -## 2. 分面聚合查询(Facet Aggregations) | |
| 394 | +--- | |
| 338 | 395 | |
| 339 | -### 2.1 category1_name 分面聚合 | |
| 396 | +### 2. 分面聚合查询(Facet Aggregations) | |
| 397 | + | |
| 398 | +#### 2.1 category1_name 分面聚合 | |
| 399 | +```bash | |
| 340 | 400 | curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ |
| 341 | - "query": { | |
| 342 | - "match_all": {} | |
| 343 | - }, | |
| 344 | - "size": 0, | |
| 345 | - "aggs": { | |
| 346 | - "category1_name_facet": { | |
| 347 | - "terms": { | |
| 348 | - "field": "category1_name", | |
| 349 | - "size": 50 | |
| 350 | - } | |
| 401 | + "query": { "match_all": {} }, | |
| 402 | + "size": 0, | |
| 403 | + "aggs": { | |
| 404 | + "category1_name_facet": { | |
| 405 | + "terms": { "field": "category1_name", "size": 50 } | |
| 406 | + } | |
| 351 | 407 | } |
| 352 | - } | |
| 353 | 408 | }' |
| 409 | +``` | |
| 354 | 410 | |
| 355 | -### 2.2 specifications.color 分面聚合 | |
| 411 | +#### 2.2 specifications.color 分面聚合 | |
| 412 | +```bash | |
| 356 | 413 | curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ |
| 357 | - "query": { | |
| 358 | - "match_all": {} | |
| 359 | - }, | |
| 360 | - "size": 0, | |
| 361 | - "aggs": { | |
| 362 | - "specifications_color_facet": { | |
| 363 | - "nested": { | |
| 364 | - "path": "specifications" | |
| 365 | - }, | |
| 366 | - "aggs": { | |
| 367 | - "filtered": { | |
| 368 | - "filter": { | |
| 369 | - "term": { | |
| 370 | - "specifications.name": "color" | |
| 371 | - } | |
| 372 | - }, | |
| 373 | - "aggs": { | |
| 374 | - "values": { | |
| 375 | - "terms": { | |
| 376 | - "field": "specifications.value", | |
| 377 | - "size": 50 | |
| 378 | - } | |
| 414 | + "query": { "match_all": {} }, | |
| 415 | + "size": 0, | |
| 416 | + "aggs": { | |
| 417 | + "specifications_color_facet": { | |
| 418 | + "nested": { "path": "specifications" }, | |
| 419 | + "aggs": { | |
| 420 | + "filtered": { | |
| 421 | + "filter": { "term": { "specifications.name": "color" } }, | |
| 422 | + "aggs": { | |
| 423 | + "values": { "terms": { "field": "specifications.value", "size": 50 } } | |
| 424 | + } | |
| 425 | + } | |
| 379 | 426 | } |
| 380 | - } | |
| 381 | 427 | } |
| 382 | - } | |
| 383 | 428 | } |
| 384 | - } | |
| 385 | 429 | }' |
| 430 | +``` | |
| 386 | 431 | |
| 387 | -### 2.3 specifications.size 分面聚合 | |
| 432 | +#### 2.3 specifications.size 分面聚合 | |
| 433 | +```bash | |
| 388 | 434 | curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ |
| 389 | - "query": { | |
| 390 | - "match_all": {} | |
| 391 | - }, | |
| 392 | - "size": 0, | |
| 393 | - "aggs": { | |
| 394 | - "specifications_size_facet": { | |
| 395 | - "nested": { | |
| 396 | - "path": "specifications" | |
| 397 | - }, | |
| 398 | - "aggs": { | |
| 399 | - "filtered": { | |
| 400 | - "filter": { | |
| 401 | - "term": { | |
| 402 | - "specifications.name": "size" | |
| 403 | - } | |
| 404 | - }, | |
| 405 | - "aggs": { | |
| 406 | - "values": { | |
| 407 | - "terms": { | |
| 408 | - "field": "specifications.value", | |
| 409 | - "size": 50 | |
| 410 | - } | |
| 435 | + "query": { "match_all": {} }, | |
| 436 | + "size": 0, | |
| 437 | + "aggs": { | |
| 438 | + "specifications_size_facet": { | |
| 439 | + "nested": { "path": "specifications" }, | |
| 440 | + "aggs": { | |
| 441 | + "filtered": { | |
| 442 | + "filter": { "term": { "specifications.name": "size" } }, | |
| 443 | + "aggs": { | |
| 444 | + "values": { "terms": { "field": "specifications.value", "size": 50 } } | |
| 445 | + } | |
| 446 | + } | |
| 411 | 447 | } |
| 412 | - } | |
| 413 | 448 | } |
| 414 | - } | |
| 415 | 449 | } |
| 416 | - } | |
| 417 | 450 | }' |
| 451 | +``` | |
| 418 | 452 | |
| 419 | -### 2.4 specifications.material 分面聚合 | |
| 453 | +#### 2.4 specifications.material 分面聚合 | |
| 454 | +```bash | |
| 420 | 455 | curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ |
| 421 | - "query": { | |
| 422 | - "match_all": {} | |
| 423 | - }, | |
| 424 | - "size": 0, | |
| 425 | - "aggs": { | |
| 426 | - "specifications_material_facet": { | |
| 427 | - "nested": { | |
| 428 | - "path": "specifications" | |
| 429 | - }, | |
| 430 | - "aggs": { | |
| 431 | - "filtered": { | |
| 432 | - "filter": { | |
| 433 | - "term": { | |
| 434 | - "specifications.name": "material" | |
| 435 | - } | |
| 436 | - }, | |
| 437 | - "aggs": { | |
| 438 | - "values": { | |
| 439 | - "terms": { | |
| 440 | - "field": "specifications.value", | |
| 441 | - "size": 50 | |
| 442 | - } | |
| 456 | + "query": { "match_all": {} }, | |
| 457 | + "size": 0, | |
| 458 | + "aggs": { | |
| 459 | + "specifications_material_facet": { | |
| 460 | + "nested": { "path": "specifications" }, | |
| 461 | + "aggs": { | |
| 462 | + "filtered": { | |
| 463 | + "filter": { "term": { "specifications.name": "material" } }, | |
| 464 | + "aggs": { | |
| 465 | + "values": { "terms": { "field": "specifications.value", "size": 50 } } | |
| 466 | + } | |
| 467 | + } | |
| 443 | 468 | } |
| 444 | - } | |
| 445 | 469 | } |
| 446 | - } | |
| 447 | 470 | } |
| 448 | - } | |
| 449 | 471 | }' |
| 472 | +``` | |
| 450 | 473 | |
| 451 | -### 2.5 综合分面聚合(category + color + size + material) | |
| 474 | +#### 2.5 综合分面聚合(category + color + size + material) | |
| 475 | +```bash | |
| 452 | 476 | curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ |
| 453 | - "query": { | |
| 454 | - "match_all": {} | |
| 455 | - }, | |
| 456 | - "size": 0, | |
| 457 | - "aggs": { | |
| 458 | - "category1_name_facet": { | |
| 459 | - "terms": { | |
| 460 | - "field": "category1_name", | |
| 461 | - "size": 50 | |
| 462 | - } | |
| 463 | - }, | |
| 464 | - "specifications_color_facet": { | |
| 465 | - "nested": { | |
| 466 | - "path": "specifications" | |
| 467 | - }, | |
| 468 | - "aggs": { | |
| 469 | - "filtered": { | |
| 470 | - "filter": { | |
| 471 | - "term": { | |
| 472 | - "specifications.name": "color" | |
| 473 | - } | |
| 474 | - }, | |
| 475 | - "aggs": { | |
| 476 | - "values": { | |
| 477 | - "terms": { | |
| 478 | - "field": "specifications.value", | |
| 479 | - "size": 50 | |
| 480 | - } | |
| 481 | - } | |
| 482 | - } | |
| 483 | - } | |
| 484 | - } | |
| 485 | - }, | |
| 486 | - "specifications_size_facet": { | |
| 487 | - "nested": { | |
| 488 | - "path": "specifications" | |
| 489 | - }, | |
| 490 | - "aggs": { | |
| 491 | - "filtered": { | |
| 492 | - "filter": { | |
| 493 | - "term": { | |
| 494 | - "specifications.name": "size" | |
| 495 | - } | |
| 496 | - }, | |
| 497 | - "aggs": { | |
| 498 | - "values": { | |
| 499 | - "terms": { | |
| 500 | - "field": "specifications.value", | |
| 501 | - "size": 50 | |
| 502 | - } | |
| 477 | + "query": { "match_all": {} }, | |
| 478 | + "size": 0, | |
| 479 | + "aggs": { | |
| 480 | + "category1_name_facet": { "terms": { "field": "category1_name", "size": 50 } }, | |
| 481 | + "specifications_color_facet": { | |
| 482 | + "nested": { "path": "specifications" }, | |
| 483 | + "aggs": { | |
| 484 | + "filtered": { | |
| 485 | + "filter": { "term": { "specifications.name": "color" } }, | |
| 486 | + "aggs": { "values": { "terms": { "field": "specifications.value", "size": 50 } } } | |
| 487 | + } | |
| 503 | 488 | } |
| 504 | - } | |
| 505 | - } | |
| 506 | - } | |
| 507 | - }, | |
| 508 | - "specifications_material_facet": { | |
| 509 | - "nested": { | |
| 510 | - "path": "specifications" | |
| 511 | - }, | |
| 512 | - "aggs": { | |
| 513 | - "filtered": { | |
| 514 | - "filter": { | |
| 515 | - "term": { | |
| 516 | - "specifications.name": "material" | |
| 489 | + }, | |
| 490 | + "specifications_size_facet": { | |
| 491 | + "nested": { "path": "specifications" }, | |
| 492 | + "aggs": { | |
| 493 | + "filtered": { | |
| 494 | + "filter": { "term": { "specifications.name": "size" } }, | |
| 495 | + "aggs": { "values": { "terms": { "field": "specifications.value", "size": 50 } } } | |
| 496 | + } | |
| 517 | 497 | } |
| 518 | - }, | |
| 519 | - "aggs": { | |
| 520 | - "values": { | |
| 521 | - "terms": { | |
| 522 | - "field": "specifications.value", | |
| 523 | - "size": 50 | |
| 524 | - } | |
| 498 | + }, | |
| 499 | + "specifications_material_facet": { | |
| 500 | + "nested": { "path": "specifications" }, | |
| 501 | + "aggs": { | |
| 502 | + "filtered": { | |
| 503 | + "filter": { "term": { "specifications.name": "material" } }, | |
| 504 | + "aggs": { "values": { "terms": { "field": "specifications.value", "size": 50 } } } | |
| 505 | + } | |
| 525 | 506 | } |
| 526 | - } | |
| 527 | 507 | } |
| 528 | - } | |
| 529 | 508 | } |
| 530 | - } | |
| 531 | 509 | }' |
| 510 | +``` | |
| 532 | 511 | |
| 533 | -## 3. 检查specifications嵌套字段的详细结构 | |
| 512 | +--- | |
| 534 | 513 | |
| 535 | -### 3.1 查看specifications的name字段有哪些值 | |
| 514 | +### 3. 检查 specifications 嵌套字段的详细结构 | |
| 515 | + | |
| 516 | +#### 3.1 查看 specifications 的 name 字段有哪些值 | |
| 517 | +```bash | |
| 536 | 518 | curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_search?pretty' -H 'Content-Type: application/json' -d '{ |
| 537 | - "query": { | |
| 538 | - "term": { | |
| 539 | - "tenant_id": "162" | |
| 540 | - } | |
| 541 | - }, | |
| 542 | - "size": 0, | |
| 543 | - "aggs": { | |
| 544 | - "specifications_names": { | |
| 545 | - "nested": { | |
| 546 | - "path": "specifications" | |
| 547 | - }, | |
| 548 | - "aggs": { | |
| 549 | - "name_values": { | |
| 550 | - "terms": { | |
| 551 | - "field": "specifications.name", | |
| 552 | - "size": 20 | |
| 553 | - } | |
| 519 | + "query": { "term": { "tenant_id": "162" } }, | |
| 520 | + "size": 0, | |
| 521 | + "aggs": { | |
| 522 | + "specifications_names": { | |
| 523 | + "nested": { "path": "specifications" }, | |
| 524 | + "aggs": { | |
| 525 | + "name_values": { "terms": { "field": "specifications.name", "size": 20 } } | |
| 526 | + } | |
| 554 | 527 | } |
| 555 | - } | |
| 556 | 528 | } |
| 557 | - } | |
| 558 | 529 | }' |
| 530 | +``` | |
| 559 | 531 | |
| 560 | -### 3.2 查看某个商品的完整specifications数据 | |
| 532 | +#### 3.2 查看某个商品的完整 specifications 数据 | |
| 533 | +```bash | |
| 561 | 534 | curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_search?pretty' -H 'Content-Type: application/json' -d '{ |
| 562 | - "query": { | |
| 563 | - "bool": { | |
| 564 | - "filter": [ | |
| 565 | - { "term": { "tenant_id": "162" } }, | |
| 566 | - { "exists": { "field": "specifications" } } | |
| 567 | - ] | |
| 568 | - } | |
| 569 | - }, | |
| 570 | - "size": 1, | |
| 571 | - "_source": ["spu_id", "title", "specifications"] | |
| 535 | + "query": { | |
| 536 | + "bool": { | |
| 537 | + "filter": [ | |
| 538 | + { "term": { "tenant_id": "162" } }, | |
| 539 | + { "exists": { "field": "specifications" } } | |
| 540 | + ] | |
| 541 | + } | |
| 542 | + }, | |
| 543 | + "size": 1, | |
| 544 | + "_source": ["spu_id", "title", "specifications"] | |
| 572 | 545 | }' |
| 546 | +``` | |
| 547 | + | |
| 548 | +--- | |
| 573 | 549 | |
| 574 | -## 4. 统计查询 | |
| 550 | +### 4. 统计查询 | |
| 575 | 551 | |
| 576 | -### 4.1 统计有category1_name的文档数量 | |
| 552 | +#### 4.1 统计有 category1_name 的文档数量 | |
| 553 | +```bash | |
| 577 | 554 | curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_count?pretty' -H 'Content-Type: application/json' -d '{ |
| 578 | - "query": { | |
| 579 | - "bool": { | |
| 580 | - "filter": [ | |
| 581 | - { "exists": { "field": "category1_name" } } | |
| 582 | - ] | |
| 555 | + "query": { | |
| 556 | + "bool": { | |
| 557 | + "filter": [ | |
| 558 | + { "exists": { "field": "category1_name" } } | |
| 559 | + ] | |
| 560 | + } | |
| 583 | 561 | } |
| 584 | - } | |
| 585 | 562 | }' |
| 563 | +``` | |
| 586 | 564 | |
| 587 | -### 4.2 统计有specifications的文档数量 | |
| 565 | +#### 4.2 统计有 specifications 的文档数量 | |
| 566 | +```bash | |
| 588 | 567 | curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_count?pretty' -H 'Content-Type: application/json' -d '{ |
| 589 | - "query": { | |
| 590 | - "bool": { | |
| 591 | - "filter": [ | |
| 592 | - { "exists": { "field": "specifications" } } | |
| 593 | - ] | |
| 568 | + "query": { | |
| 569 | + "bool": { | |
| 570 | + "filter": [ | |
| 571 | + { "exists": { "field": "specifications" } } | |
| 572 | + ] | |
| 573 | + } | |
| 594 | 574 | } |
| 595 | - } | |
| 596 | 575 | }' |
| 576 | +``` | |
| 597 | 577 | |
| 578 | +--- | |
| 598 | 579 | |
| 599 | -## 5. 诊断问题场景 | |
| 580 | +### 5. 诊断问题场景 | |
| 600 | 581 | |
| 601 | -### 5.1 查找没有category1_name但有category的文档(MySQL有数据但ES没有) | |
| 582 | +#### 5.1 查找没有 category1_name 但有 category 的文档(MySQL 有数据但 ES 没有) | |
| 583 | +```bash | |
| 602 | 584 | curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ |
| 603 | - "query": { | |
| 604 | - "bool": { | |
| 605 | - "filter": [ | |
| 606 | - { "term": { "tenant_id": "162" } } | |
| 607 | - ], | |
| 608 | - "must_not": [ | |
| 609 | - { "exists": { "field": "category1_name" } } | |
| 610 | - ] | |
| 611 | - } | |
| 612 | - }, | |
| 613 | - "size": 10, | |
| 614 | - "_source": ["spu_id", "title", "category_name_text", "category_path"] | |
| 585 | + "query": { | |
| 586 | + "bool": { | |
| 587 | + "filter": [ | |
| 588 | + { "term": { "tenant_id": "162" } } | |
| 589 | + ], | |
| 590 | + "must_not": [ | |
| 591 | + { "exists": { "field": "category1_name" } } | |
| 592 | + ] | |
| 593 | + } | |
| 594 | + }, | |
| 595 | + "size": 10, | |
| 596 | + "_source": ["spu_id", "title", "category_name_text", "category_path"] | |
| 615 | 597 | }' |
| 598 | +``` | |
| 616 | 599 | |
| 617 | -### 5.2 查找有option但没有specifications的文档(数据转换问题) | |
| 600 | +#### 5.2 查找有 option 但没有 specifications 的文档(数据转换问题) | |
| 601 | +```bash | |
| 618 | 602 | curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ |
| 619 | - "query": { | |
| 620 | - "bool": { | |
| 621 | - "filter": [ | |
| 622 | - { "term": { "tenant_id": "162" } }, | |
| 623 | - { "exists": { "field": "option1_name" } } | |
| 624 | - ], | |
| 625 | - "must_not": [ | |
| 626 | - { "exists": { "field": "specifications" } } | |
| 627 | - ] | |
| 628 | - } | |
| 629 | - }, | |
| 630 | - "size": 10, | |
| 631 | - "_source": ["spu_id", "title", "option1_name", "option2_name", "option3_name", "specifications"] | |
| 603 | + "query": { | |
| 604 | + "bool": { | |
| 605 | + "filter": [ | |
| 606 | + { "term": { "tenant_id": "162" } }, | |
| 607 | + { "exists": { "field": "option1_name" } } | |
| 608 | + ], | |
| 609 | + "must_not": [ | |
| 610 | + { "exists": { "field": "specifications" } } | |
| 611 | + ] | |
| 612 | + } | |
| 613 | + }, | |
| 614 | + "size": 10, | |
| 615 | + "_source": ["spu_id", "title", "option1_name", "option2_name", "option3_name", "specifications"] | |
| 632 | 616 | }' |
| 617 | +``` | |
| 618 | + | |
| 619 | +--- | |
| 633 | 620 | |
| 621 | +## 重排序示例 | |
| 634 | 622 | |
| 635 | -重排序: | |
| 623 | +```bash | |
| 636 | 624 | GET /search_products_tenant_170/_search |
| 637 | 625 | { |
| 638 | 626 | "query": { |
| 639 | - "match": { | |
| 627 | + "match": { | |
| 640 | 628 | "title.en": { |
| 641 | 629 | "query": "quick brown fox", |
| 642 | 630 | "minimum_should_match": "90%" |
| ... | ... | @@ -644,31 +632,52 @@ GET /search_products_tenant_170/_search |
| 644 | 632 | } |
| 645 | 633 | }, |
| 646 | 634 | "rescore": { |
| 647 | - "window_size": 50, | |
| 648 | - "query": { | |
| 635 | + "window_size": 50, | |
| 636 | + "query": { | |
| 649 | 637 | "rescore_query": { |
| 650 | 638 | "match_phrase": { |
| 651 | 639 | "title.en": { |
| 652 | 640 | "query": "quick brown fox", |
| 653 | - "slop": 50 | |
| 641 | + "slop": 50 | |
| 654 | 642 | } |
| 655 | 643 | } |
| 656 | 644 | } |
| 657 | 645 | } |
| 658 | 646 | } |
| 659 | 647 | } |
| 648 | +``` | |
| 649 | + | |
| 650 | +--- | |
| 660 | 651 | |
| 652 | +## 检查字段是否存在 | |
| 661 | 653 | |
| 662 | -检查某个字段是否存在 | |
| 654 | +```bash | |
| 663 | 655 | curl -u 'saas:4hOaLaf41y2VuI8y' -X POST \ |
| 664 | - 'http://localhost:9200/search_products_tenant_163/_count' \ | |
| 665 | - -H 'Content-Type: application/json' \ | |
| 666 | - -d '{ | |
| 656 | +'http://localhost:9200/search_products_tenant_163/_count' \ | |
| 657 | +-H 'Content-Type: application/json' \ | |
| 658 | +-d '{ | |
| 667 | 659 | "query": { |
| 668 | - "bool": { | |
| 669 | - "filter": [ | |
| 670 | - { "exists": { "field": "title_embedding" } } | |
| 671 | - ] | |
| 672 | - } | |
| 660 | + "bool": { | |
| 661 | + "filter": [ | |
| 662 | + { "exists": { "field": "image_embedding" } } | |
| 663 | + ] | |
| 664 | + } | |
| 673 | 665 | } |
| 674 | - }' | |
| 675 | 666 | \ No newline at end of file |
| 667 | +}' | |
| 668 | + | |
| 669 | +curl -u 'saas:4hOaLaf41y2VuI8y' -X POST \ | |
| 670 | +'http://localhost:9200/search_products_tenant_163/_count' \ | |
| 671 | +-H 'Content-Type: application/json' \ | |
| 672 | +-d '{ | |
| 673 | +"query": { | |
| 674 | +"nested": { | |
| 675 | + "path": "image_embedding", | |
| 676 | + "query": { | |
| 677 | + "exists": { | |
| 678 | + "field": "image_embedding.vector" | |
| 679 | + } | |
| 680 | + } | |
| 681 | +} | |
| 682 | +} | |
| 683 | +}' | |
| 684 | +``` | |
| 676 | 685 | \ No newline at end of file | ... | ... |
docs/搜索API对接指南-00-总览与快速开始.md
| ... | ... | @@ -90,7 +90,7 @@ curl -X POST "http://43.166.252.75:6002/search/" \ |
| 90 | 90 | | 查询文档 | POST | `/indexer/documents` | 查询SPU文档数据(不写入ES) | |
| 91 | 91 | | 构建ES文档(正式对接) | POST | `/indexer/build-docs` | 基于上游提供的 MySQL 行数据构建 ES doc,不写入 ES,供 Java 等调用后自行写入 | |
| 92 | 92 | | 构建ES文档(测试用) | POST | `/indexer/build-docs-from-db` | 仅在测试/调试时使用,根据 `tenant_id + spu_ids` 内部查库并构建 ES doc | |
| 93 | -| 内容理解字段生成 | POST | `/indexer/enrich-content` | 根据商品标题批量生成 qanchors、semantic_attributes、tags,供微服务组合方式使用 | | |
| 93 | +| 内容理解字段生成 | POST | `/indexer/enrich-content` | 根据商品标题批量生成 qanchors、enriched_attributes、enriched_tags,供微服务组合方式使用 | | |
| 94 | 94 | | 索引健康检查 | GET | `/indexer/health` | 检查索引服务状态 | |
| 95 | 95 | | 健康检查 | GET | `/admin/health` | 服务健康检查 | |
| 96 | 96 | | 获取配置 | GET | `/admin/config` | 获取租户配置 | | ... | ... |
docs/搜索API对接指南-05-索引接口(Indexer).md
| ... | ... | @@ -13,7 +13,7 @@ |
| 13 | 13 | | 查询文档 | POST | `/indexer/documents` | 按 SPU ID 列表查询 ES 文档,不写入 ES | |
| 14 | 14 | | 构建 ES 文档(正式) | POST | `/indexer/build-docs` | 由上游提供 MySQL 行数据,返回 ES-ready 文档,不写 ES | |
| 15 | 15 | | 构建 ES 文档(测试) | POST | `/indexer/build-docs-from-db` | 由本服务查库并构建文档,仅测试/调试用 | |
| 16 | -| 内容理解字段生成 | POST | `/indexer/enrich-content` | 根据商品标题批量生成 qanchors、semantic_attributes、tags(供微服务组合方式使用) | | |
| 16 | +| 内容理解字段生成 | POST | `/indexer/enrich-content` | 根据商品标题批量生成 qanchors、enriched_attributes、enriched_tags(供微服务组合方式使用) | | |
| 17 | 17 | | 索引健康检查 | GET | `/indexer/health` | 检查索引服务与数据库连接状态 | |
| 18 | 18 | |
| 19 | 19 | #### 5.0 支撑外部 indexer 的三种方式 |
| ... | ... | @@ -510,7 +510,6 @@ curl -X POST "http://localhost:6004/indexer/build-docs" \ |
| 510 | 510 | { |
| 511 | 511 | "spu": { |
| 512 | 512 | "id": 10001, |
| 513 | - "tenant_id": "162", | |
| 514 | 513 | "title": "测试T恤 纯棉短袖", |
| 515 | 514 | "brief": "舒适纯棉,多色可选", |
| 516 | 515 | "description": "这是一款适合日常穿着的纯棉T恤,透气吸汗。", |
| ... | ... | @@ -521,7 +520,7 @@ curl -X POST "http://localhost:6004/indexer/build-docs" \ |
| 521 | 520 | "category_path": "服装/上衣/T恤", |
| 522 | 521 | "fake_sales": 1280, |
| 523 | 522 | "image_src": "https://oss.essa.cn/98532128-cf8e-456c-9e30-6f2a5ea0c19f.jpg", |
| 524 | - "tags": "T恤,纯棉,短袖,夏季", | |
| 523 | + "enriched_tags": ["T恤", "纯棉"], | |
| 525 | 524 | "create_time": "2024-01-01T00:00:00Z", |
| 526 | 525 | "update_time": "2024-01-01T00:00:00Z" |
| 527 | 526 | }, |
| ... | ... | @@ -570,7 +569,7 @@ curl -X POST "http://localhost:6004/indexer/build-docs" \ |
| 570 | 569 | "tenant_id": "170", |
| 571 | 570 | "spu_id": "223167", |
| 572 | 571 | "title": { "en": "...", "zh": "..." }, |
| 573 | - "tags": ["Floerns", "Clothing", "Shoes & Jewelry"], | |
| 572 | + "enriched_tags": ["Floerns", "Clothing", "Shoes & Jewelry"], | |
| 574 | 573 | "skus": [ |
| 575 | 574 | { |
| 576 | 575 | "sku_id": "3988393", |
| ... | ... | @@ -649,7 +648,7 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ |
| 649 | 648 | ### 5.8 内容理解字段生成接口 |
| 650 | 649 | |
| 651 | 650 | - **端点**: `POST /indexer/enrich-content` |
| 652 | -- **描述**: 根据商品内容信息批量生成 **qanchors**(锚文本)、**semantic_attributes**(语义属性)、**tags**(细分标签),供外部 indexer 在「微服务组合」方式下自行拼装 doc 时使用。请求以 `items[]` 传入商品内容字段(必填/可选见下表)。内部逻辑与 `indexer.product_enrich` 一致,支持多语言与 Redis 缓存;单次请求在线程池中执行,避免阻塞其他接口。 | |
| 651 | +- **描述**: 根据商品内容信息批量生成 **qanchors**(锚文本)、**enriched_attributes**(语义属性)、**enriched_tags**(细分标签),供外部 indexer 在「微服务组合」方式下自行拼装 doc 时使用。请求以 `items[]` 传入商品内容字段(必填/可选见下表)。接口只暴露商品内容输入,语言选择、分析维度与最终字段结构统一由 `indexer.product_enrich` 内部决定;当前返回结果与 `search_products` mapping 保持一致。单次请求在线程池中执行,避免阻塞其他接口。 | |
| 653 | 652 | |
| 654 | 653 | #### 请求参数 |
| 655 | 654 | |
| ... | ... | @@ -669,8 +668,7 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ |
| 669 | 668 | "title": "12PCS Dolls with Bottles", |
| 670 | 669 | "image_url": "https://example.com/images/223168.jpg" |
| 671 | 670 | } |
| 672 | - ], | |
| 673 | - "languages": ["zh", "en"] | |
| 671 | + ] | |
| 674 | 672 | } |
| 675 | 673 | ``` |
| 676 | 674 | |
| ... | ... | @@ -678,7 +676,6 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ |
| 678 | 676 | |------|------|------|--------|------| |
| 679 | 677 | | `tenant_id` | string | Y | - | 租户 ID。目前仅用于记录日志,不产生实际作用| |
| 680 | 678 | | `items` | array | Y | - | 待分析列表;**单次最多 50 条** | |
| 681 | -| `languages` | array[string] | N | `["zh", "en"]` | 目标语言,需在支持范围内:`zh`、`en`、`de`、`ru`、`fr` | | |
| 682 | 679 | |
| 683 | 680 | `items[]` 字段说明: |
| 684 | 681 | |
| ... | ... | @@ -696,6 +693,12 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ |
| 696 | 693 | - `tenant_id`、`spu_id` 只用于请求归属与结果回填,不参与缓存键。 |
| 697 | 694 | - 因此,输入内容不变时可跨请求直接命中缓存;任一输入字段变化时,会自然落到新的缓存 key。 |
| 698 | 695 | |
| 696 | +语言说明: | |
| 697 | + | |
| 698 | +- 接口不接受语言控制参数。 | |
| 699 | +- 返回哪些语言、返回哪些语义维度,统一由 `indexer.product_enrich` 内部逻辑决定。 | |
| 700 | +- 当前为了与 `search_products` mapping 对齐,返回结果只包含核心索引语言 `zh`、`en`。 | |
| 701 | + | |
| 699 | 702 | 批量请求建议: |
| 699 | 702 | - **全量**:强烈建议尽可能将 **20 个 SPU/doc** 攒成一个批次后再请求一次。 |
| 701 | 704 | - **增量**:可按时效要求设置时间窗口(例如 **5 分钟**),在窗口内尽可能攒到 **20 个**;达到 20 或窗口到期就发送一次请求。 |
| ... | ... | @@ -711,21 +714,28 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ |
| 711 | 714 | { |
| 712 | 715 | "spu_id": "223167", |
| 713 | 716 | "qanchors": { |
| 714 | - "zh": "短袖T恤,纯棉,男装,夏季", | |
| 715 | - "en": "cotton t-shirt, short sleeve, men, summer" | |
| 717 | + "zh": ["短袖T恤", "纯棉", "男装", "夏季"], | |
| 718 | + "en": ["cotton t-shirt", "short sleeve", "men", "summer"] | |
| 716 | 719 | }, |
| 717 | - "semantic_attributes": [ | |
| 718 | - { "lang": "zh", "name": "tags", "value": "纯棉" }, | |
| 719 | - { "lang": "zh", "name": "usage_scene", "value": "日常" }, | |
| 720 | - { "lang": "en", "name": "tags", "value": "cotton" } | |
| 721 | - ], | |
| 722 | - "tags": ["纯棉", "短袖", "男装", "cotton", "short sleeve"] | |
| 720 | + "enriched_tags": { | |
| 721 | + "zh": ["纯棉", "短袖", "男装"], | |
| 722 | + "en": ["cotton", "short sleeve", "men"] | |
| 723 | + }, | |
| 724 | + "enriched_attributes": [ | |
| 725 | + { "name": "enriched_tags", "value": { "zh": "纯棉" } }, | |
| 726 | + { "name": "usage_scene", "value": { "zh": "日常" } }, | |
| 727 | + { "name": "enriched_tags", "value": { "en": "cotton" } } | |
| 728 | + ] | |
| 723 | 729 | }, |
| 724 | 730 | { |
| 725 | 731 | "spu_id": "223168", |
| 726 | - "qanchors": { "en": "dolls, toys, 12pcs" }, | |
| 727 | - "semantic_attributes": [], | |
| 728 | - "tags": ["dolls", "toys"] | |
| 732 | + "qanchors": { | |
| 733 | + "en": ["dolls", "toys", "12pcs"] | |
| 734 | + }, | |
| 735 | + "enriched_tags": { | |
| 736 | + "en": ["dolls", "toys"] | |
| 737 | + }, | |
| 738 | + "enriched_attributes": [] | |
| 729 | 739 | } |
| 730 | 740 | ] |
| 731 | 741 | } |
| ... | ... | @@ -733,10 +743,10 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ |
| 733 | 743 | |
| 734 | 744 | | 字段 | 类型 | 说明 | |
| 735 | 745 | |------|------|------| |
| 736 | -| `results` | array | 与请求 `items` 一一对应,每项含 `spu_id`、`qanchors`、`semantic_attributes`、`tags` | | |
| 737 | -| `results[].qanchors` | object | 按语言键的锚文本(逗号分隔短语),可写入 ES 文档的 `qanchors.{lang}` | | |
| 738 | -| `results[].semantic_attributes` | array | 语义属性列表,每项为 `{ "lang", "name", "value" }`,可写入 ES 的 `semantic_attributes` nested 字段 | | |
| 739 | -| `results[].tags` | array | 从语义属性中抽取的 `name=tags` 的 value 集合,可与业务原有 `tags` 合并后写入 ES 的 `tags` 字段 | | |
| 746 | +| `results` | array | 与请求 `items` 一一对应,每项含 `spu_id`、`qanchors`、`enriched_attributes`、`enriched_tags` | | |
| 747 | +| `results[].qanchors` | object | 与 ES `qanchors` 字段同结构,按语言键返回短语数组 | | |
| 748 | +| `results[].enriched_tags` | object | 与 ES `enriched_tags` 字段同结构,按语言键返回标签数组 | | |
| 749 | +| `results[].enriched_attributes` | array | 与 ES `enriched_attributes` nested 字段同结构,每项为 `{ "name", "value": { "zh"?: "...", "en"?: "..." } }` | | |
| 740 | 750 | | `results[].error` | string | 若该条处理失败(如 LLM 异常),会在此字段返回错误信息 | |
| 741 | 751 | |
| 742 | 752 | **错误响应**: |
| ... | ... | @@ -753,13 +763,12 @@ curl -X POST "http://localhost:6004/indexer/enrich-content" \ |
| 753 | 763 | "items": [ |
| 754 | 764 | { |
| 755 | 765 | "spu_id": "223167", |
| 756 | - "title": "纯棉短袖T恤 夏季男装", | |
| 766 | + "title": "纯棉短袖T恤 夏季男装", | |
| 757 | 767 | "brief": "夏季透气纯棉短袖,舒适亲肤", |
| 758 | 768 | "description": "100%棉,圆领版型,适合日常通勤与休闲穿搭。", |
| 759 | 769 | "image_url": "https://example.com/images/223167.jpg" |
| 760 | 770 | } |
| 761 | - ], | |
| 762 | - "languages": ["zh", "en"] | |
| 771 | + ] | |
| 763 | 772 | }' |
| 764 | 773 | ``` |
| 765 | 774 | ... | ... |
docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md
| ... | ... | @@ -444,7 +444,7 @@ curl "http://localhost:6006/health" |
| 444 | 444 | |
| 445 | 445 | - **Base URL**: Indexer 服务地址,如 `http://localhost:6004` |
| 446 | 446 | - **路径**: `POST /indexer/enrich-content` |
| 447 | -- **说明**: 根据商品标题批量生成 `qanchors`、`semantic_attributes`、`tags`,用于拼装 ES 文档。内部使用大模型(需配置 `DASHSCOPE_API_KEY`),支持多语言与 Redis 缓存;单次最多 50 条,建议批量调用以提升效率。 | |
| 447 | +- **说明**: 根据商品标题批量生成 `qanchors`、`enriched_attributes`、`enriched_tags`,用于拼装 ES 文档。内部使用大模型(需配置 `DASHSCOPE_API_KEY`),支持多语言与 Redis 缓存;单次最多 50 条,建议批量调用以提升效率。 | |
| 448 | 448 | |
| 449 | 449 | 请求/响应格式、示例及错误码见 [-05-索引接口(Indexer)](./搜索API对接指南-05-索引接口(Indexer).md#58-内容理解字段生成接口)。 |
| 450 | 450 | ... | ... |
docs/相关性检索优化说明.md
| ... | ... | @@ -260,6 +260,238 @@ python ./scripts/eval_search_quality.py |
| 260 | 260 | 4. 非 `zh/en` 语种字段动态拼接(如 `de/fr/es`) |
| 261 | 261 | |
| 262 | 262 | |
| 263 | +# 搜索pipeline | |
| 264 | +**整体图** | |
| 265 | +这个 pipeline 现在可以理解成一条“先广召回,再逐层收窄、逐层加贵信号”的漏斗: | |
| 266 | + | |
| 267 | +1. Query 解析 | |
| 268 | +2. ES 召回 | |
| 269 | +3. 粗排:只用 ES 内部文本/KNN 信号 | |
| 270 | +4. 款式 SKU 选择 + title suffix | |
| 271 | +5. 精排:轻量 reranker + 文本/KNN 融合 | |
| 272 | +6. 最终 rerank:重 reranker + fine score + 文本/KNN 融合 | |
| 273 | +7. 分页、补全字段、格式化返回 | |
| 274 | + | |
| 275 | +主控代码在 [searcher.py](/data/saas-search/search/searcher.py),打分与 rerank 细节在 [rerank_client.py](/data/saas-search/search/rerank_client.py),配置定义在 [schema.py](/data/saas-search/config/schema.py) 和 [config.yaml](/data/saas-search/config/config.yaml)。 | |
| 276 | + | |
| 277 | +**先看入口怎么决定走哪条路** | |
| 278 | +在 [searcher.py:348](/data/saas-search/search/searcher.py#L348) 开始,`search()` 先读租户语言、开关、窗口大小。 | |
| 279 | +关键判断在 [searcher.py:364](/data/saas-search/search/searcher.py#L364) 到 [searcher.py:372](/data/saas-search/search/searcher.py#L372): | |
| 280 | + | |
| 281 | +- `rerank_window` 现在是 80,见 [config.yaml:256](/data/saas-search/config/config.yaml#L256) | |
| 282 | +- `coarse_rank.input_window` 是 700,`output_window` 是 240,见 [config.yaml:231](/data/saas-search/config/config.yaml#L231) | |
| 283 | +- `fine_rank.input_window` 是 240,`output_window` 是 80,见 [config.yaml:245](/data/saas-search/config/config.yaml#L245) | |
| 284 | + | |
| 285 | +所以如果请求满足 `from_ + size <= rerank_window`,就进入完整漏斗: | |
| 286 | +- ES 实际取前 `700` | |
| 287 | +- 粗排后留 `240` | |
| 288 | +- 精排后留 `80` | |
| 289 | +- 最终 rerank 也只处理这 `80` | |
| 290 | +- 最后再做分页切片 | |
| 291 | + | |
| 292 | +如果请求页超出 80,就不走后面的多阶段漏斗,直接按 ES 原逻辑返回。 | |
| 293 | + | |
| 294 | +这点非常重要,因为它决定了“贵模型只服务头部结果”。 | |
| 295 | + | |
| 296 | +**Step 1:Query 解析阶段** | |
| 297 | +在 [searcher.py:432](/data/saas-search/search/searcher.py#L432) 到 [searcher.py:469](/data/saas-search/search/searcher.py#L469): | |
| 298 | +`query_parser.parse()` 做几件事: | |
| 299 | + | |
| 300 | +- 规范化 query | |
| 301 | +- 检测语言 | |
| 302 | +- 可能做 rewrite | |
| 303 | +- 生成文本向量 | |
| 304 | +- 如果有图搜,还会带图片向量 | |
| 305 | +- 生成翻译结果 | |
| 306 | +- 识别 style intent | |
| 307 | + | |
| 308 | +这一步的结果存在 `parsed_query` 里,后面 ES 查询、style SKU 选择、fine/final rerank 全都依赖它。 | |
| 309 | + | |
| 310 | +**Step 2:ES Query 构建** | |
| 311 | +ES DSL 在 [searcher.py:471](/data/saas-search/search/searcher.py#L471) 开始,通过 [es_query_builder.py:181](/data/saas-search/search/es_query_builder.py#L181) 的 `build_query()` 生成。 | |
| 312 | + | |
| 313 | +这里的核心结构是: | |
| 314 | +- 文本召回 clause | |
| 315 | +- 文本向量 KNN clause | |
| 316 | +- 图片向量 KNN clause | |
| 317 | +- 它们一起放进 `bool.should` | |
| 318 | +- 过滤条件放进 `filter` | |
| 319 | +- facet 的多选条件走 `post_filter` | |
| 320 | + | |
| 321 | +KNN 部分在 [es_query_builder.py:250](/data/saas-search/search/es_query_builder.py#L250) 之后: | |
| 322 | +- 文本向量 clause 名字固定叫 `knn_query` | |
| 323 | +- 图片向量 clause 名字固定叫 `image_knn_query` | |
| 324 | + | |
| 325 | +而文本召回那边,后续 fusion 代码约定会去读: | |
| 326 | +- 原始 query 的 named query:`base_query` | |
| 327 | +- 翻译 query 的 named query:`base_query_trans_*` | |
| 328 | + | |
| 329 | +也就是说,后面的粗排/精排/最终 rerank,并不是重新理解 ES score,而是从 `matched_queries` 里把这些命名子信号拆出来自己重算。 | |
| 330 | + | |
| 331 | +**Step 3:ES 召回** | |
| 332 | +在 [searcher.py:579](/data/saas-search/search/searcher.py#L579) 到 [searcher.py:627](/data/saas-search/search/searcher.py#L627)。 | |
| 333 | + | |
| 334 | +这里有个很关键的工程优化: | |
| 335 | +如果在 rerank window 内,第一次 ES 拉取时会把 `_source` 关掉,只取排序必需信号,见 [searcher.py:517](/data/saas-search/search/searcher.py#L517) 到 [searcher.py:523](/data/saas-search/search/searcher.py#L523)。 | |
| 336 | + | |
| 337 | +原因是: | |
| 338 | +- 粗排先只需要 `_score` 和 `matched_queries` | |
| 339 | +- 不需要一上来把 700 条完整商品详情都拉回来 | |
| 340 | +- 等粗排收窄后,再补 fine/final rerank 需要的字段 | |
| 341 | + | |
| 342 | +这是现在这条 pipeline 很核心的性能设计点。 | |
| 343 | + | |
| 344 | +**Step 4:粗排** | |
| 345 | +粗排入口在 [searcher.py:638](/data/saas-search/search/searcher.py#L638),真正的打分在 [rerank_client.py:348](/data/saas-search/search/rerank_client.py#L348) 的 `coarse_resort_hits()`。 | |
| 346 | + | |
| 347 | +粗排只看两类信号: | |
| 348 | +- `text_score` | |
| 349 | +- `knn_score` | |
| 350 | + | |
| 351 | +它们先都从统一 helper `_build_hit_signal_bundle()` 里拿,见 [rerank_client.py:246](/data/saas-search/search/rerank_client.py#L246)。 | |
| 352 | + | |
| 353 | +文本分怎么来,见 [rerank_client.py:200](/data/saas-search/search/rerank_client.py#L200): | |
| 354 | +- `source_score = matched_queries["base_query"]` | |
| 355 | +- `translation_score = max(base_query_trans_*)` | |
| 356 | +- `weighted_translation = 0.8 * translation_score` | |
| 357 | +- `primary_text = max(source, weighted_translation)` | |
| 358 | +- `support_text = 另一路` | |
| 359 | +- `text_score = primary_text + 0.25 * support_text` | |
| 360 | + | |
| 361 | +这就是一个 text dismax 思路: | |
| 362 | +原 query 是主路,翻译 query 是辅助路,但不是简单相加。 | |
| 363 | + | |
| 364 | +向量分怎么来,见 [rerank_client.py:156](/data/saas-search/search/rerank_client.py#L156): | |
| 365 | +- `text_knn_score` | |
| 366 | +- `image_knn_score` | |
| 367 | +- 分别乘自己的 weight | |
| 368 | +- 取强的一路做主路 | |
| 369 | +- 弱的一路按 `knn_tie_breaker` 做辅助 | |
| 370 | + | |
| 371 | +然后粗排融合公式在 [rerank_client.py:334](/data/saas-search/search/rerank_client.py#L334): | |
| 372 | +- `coarse_score = (text_score + text_bias)^text_exponent * (knn_score + knn_bias)^knn_exponent` | |
| 373 | + | |
| 374 | +配置定义在 [schema.py:124](/data/saas-search/config/schema.py#L124) 和 [config.yaml:231](/data/saas-search/config/config.yaml#L231)。 | |
| 375 | + | |
| 376 | +算完后: | |
| 377 | +- 写入 `hit["_coarse_score"]` | |
| 378 | +- 按 `_coarse_score` 排序 | |
| 379 | +- 留前 240,见 [searcher.py:645](/data/saas-search/search/searcher.py#L645) | |
| 380 | + | |
| 381 | +**Step 5:粗排后补字段 + SKU 选择** | |
| 382 | +粗排完以后,`searcher` 会按 doc template 反推 fine/final rerank 需要哪些 `_source` 字段,然后只补这些字段,见 [searcher.py:669](/data/saas-search/search/searcher.py#L669)。 | |
| 383 | + | |
| 384 | +之后才做 style SKU 选择,见 [searcher.py:696](/data/saas-search/search/searcher.py#L696)。 | |
| 385 | + | |
| 386 | +为什么放这里? | |
| 387 | +因为现在 fine rank 也是 reranker,它也要吃 title suffix。 | |
| 388 | +而 suffix 是 SKU 选择之后写到 hit 上的 `_style_rerank_suffix`。 | |
| 389 | +真正把 suffix 拼进 doc 文本的地方在 [rerank_client.py:65](/data/saas-search/search/rerank_client.py#L65) 到 [rerank_client.py:74](/data/saas-search/search/rerank_client.py#L74)。 | |
| 390 | + | |
| 391 | +所以顺序必须是: | |
| 392 | +- 先粗排 | |
| 393 | +- 再选 SKU | |
| 394 | +- 再用带 suffix 的 title 去跑 fine/final rerank | |
| 395 | + | |
| 396 | +**Step 6:精排** | |
| 397 | +入口在 [searcher.py:711](/data/saas-search/search/searcher.py#L711),实现是 [rerank_client.py:603](/data/saas-search/search/rerank_client.py#L603) 的 `run_lightweight_rerank()`。 | |
| 398 | + | |
| 399 | +它会做三件事: | |
| 400 | + | |
| 401 | +1. 用 `build_docs_from_hits()` 把每条商品变成 reranker 输入文本 | |
| 402 | +2. 用 `service_profile="fine"` 调轻量服务 | |
| 403 | +3. 不再只按 `fine_score` 排,而是按融合后的 `_fine_fused_score` 排 | |
| 404 | + | |
| 405 | +精排融合公式现在是: | |
| 406 | +- `fine_stage_score = fine_factor * text_factor * knn_factor * style_boost` | |
| 407 | + | |
| 408 | +具体公共计算在 [rerank_client.py:286](/data/saas-search/search/rerank_client.py#L286) 的 `_compute_multiplicative_fusion()`: | |
| 409 | +- `fine_factor = (fine_score + fine_bias)^fine_exponent` | |
| 410 | +- `text_factor = (text_score + text_bias)^text_exponent` | |
| 411 | +- `knn_factor = (knn_score + knn_bias)^knn_exponent` | |
| 412 | +- 如果命中了 selected SKU,再乘 style boost | |
| 413 | + | |
| 414 | +写回 hit 的字段见 [rerank_client.py:655](/data/saas-search/search/rerank_client.py#L655): | |
| 415 | +- `_fine_score` | |
| 416 | +- `_fine_fused_score` | |
| 417 | +- `_text_score` | |
| 418 | +- `_knn_score` | |
| 419 | + | |
| 420 | +排序逻辑在 [rerank_client.py:683](/data/saas-search/search/rerank_client.py#L683): | |
| 421 | +按 `_fine_fused_score` 降序排,然后留前 80,见 [searcher.py:727](/data/saas-search/search/searcher.py#L727)。 | |
| 422 | + | |
| 423 | +这是本次改动的关键点:现在 fine rank 已经不是“模型裸分排序”,而是“模型分 + ES 文本/KNN 信号融合后排序”。 | |
| 424 | + | |
| 425 | +**Step 7:最终 rerank** | |
| 426 | +入口在 [searcher.py:767](/data/saas-search/search/searcher.py#L767),实现是 [rerank_client.py:538](/data/saas-search/search/rerank_client.py#L538) 的 `run_rerank()`。 | |
| 427 | + | |
| 428 | +它和 fine rank 很像,但多了一个更重的模型分 `rerank_score`。 | |
| 429 | +最终公式是: | |
| 430 | + | |
| 431 | +- `final_score = rerank_factor * fine_factor * text_factor * knn_factor * style_boost` | |
| 432 | + | |
| 433 | +也就是: | |
| 434 | +- fine rank 产生的 `fine_score` 不会丢 | |
| 435 | +- 到最终 rerank 时,它会继续作为一个乘法项参与最终融合 | |
| 436 | + | |
| 437 | +这个逻辑在 [rerank_client.py:468](/data/saas-search/search/rerank_client.py#L468) 到 [rerank_client.py:476](/data/saas-search/search/rerank_client.py#L476)。 | |
| 438 | + | |
| 439 | +算完后写入: | |
| 440 | +- `_rerank_score` | |
| 441 | +- `_fused_score` | |
| 442 | + | |
| 443 | +然后按 `_fused_score` 排序,见 [rerank_client.py:531](/data/saas-search/search/rerank_client.py#L531)。 | |
| 444 | + | |
| 445 | +这里你可以把它理解成: | |
| 446 | +- fine rank 负责“轻量快速筛一遍,把 240 缩成 80” | |
| 447 | +- 最终 rerank 负责“用更贵模型做最终拍板” | |
| 448 | +- 但最终拍板时,不会忽略 fine rank 结果,而是把 fine score 当成一个先验信号保留进去 | |
| 449 | + | |
| 450 | +**Step 8:分页与字段补全** | |
| 451 | +多阶段排序只在头部窗口内完成。 | |
| 452 | +真正返回给用户前,在 [searcher.py:828](/data/saas-search/search/searcher.py#L828) 之后还会做两件事: | |
| 453 | + | |
| 454 | +- 先按 `from_:from_+size` 对最终 80 条切片 | |
| 455 | +- 再按用户原始 `_source` 需求补回页面真正要显示的字段,见 [searcher.py:859](/data/saas-search/search/searcher.py#L859) | |
| 456 | + | |
| 457 | +所以这条链路是“三次不同目的的数据访问”: | |
| 458 | + | |
| 459 | +- 第一次 ES:只要排序信号 | |
| 460 | +- 第二次按 id 回填:只要 fine/final rerank 需要字段 | |
| 461 | +- 第三次按页面 ids 回填:只要最终页面显示字段 | |
| 462 | + | |
| 463 | +这也是为什么它性能上比“一次全量拉 700 条完整文档”更合理。 | |
| 464 | + | |
| 465 | +**Step 9:结果格式化与 debug funnel** | |
| 466 | +最后在 [searcher.py:906](/data/saas-search/search/searcher.py#L906) 进入结果处理。 | |
| 467 | +这里会把每个商品的阶段信息组装成 `ranking_funnel`,见 [searcher.py:1068](/data/saas-search/search/searcher.py#L1068): | |
| 468 | + | |
| 469 | +- `es_recall` | |
| 470 | +- `coarse_rank` | |
| 471 | +- `fine_rank` | |
| 472 | +- `rerank` | |
| 473 | +- `final_page` | |
| 474 | + | |
| 475 | +其中: | |
| 476 | +- coarse stage 主要保留 text/translation/knn 的拆分信号 | |
| 477 | +- fine/rerank stage 现在都保留 `fusion_inputs`、`fusion_factors`、`fusion_summary` | |
| 478 | +- `fusion_summary` 来自真实计算过程本身,见 [rerank_client.py:265](/data/saas-search/search/rerank_client.py#L265) | |
| 479 | + | |
| 480 | +这点很重要,因为现在“实际排序逻辑”和“debug 展示逻辑”是同源的,不是两套各写一份。 | |
| 481 | + | |
| 482 | +**一句话总结这条 pipeline** | |
| 483 | +这条 pipeline 的本质是: | |
| 484 | + | |
| 485 | +- ES 负责便宜的大范围召回 | |
| 486 | +- 粗排负责只靠 ES 内置信号先做一次结构化筛选 | |
| 487 | +- style SKU 选择负责把商品文本改造成更适合 reranker 理解的输入 | |
| 488 | +- fine rank 负责用轻模型把候选进一步压缩 | |
| 489 | +- final rerank 负责用重模型做最终判定 | |
| 490 | +- 每一层都尽量复用前一层信号,而不是推翻重来 | |
| 491 | + | |
| 492 | +后续可补充:按“一个具体 query 的真实流转样例”展开说明,比如假设用户搜 `black dress`,把它从 `parsed_query`、ES named queries 到 coarse/fine/final 各阶段分数的计算过程完整手推一遍。 | |
| 493 | + | |
| 494 | + | |
| 263 | 495 | |
| 264 | 496 | ## reranker方面: |
| 265 | 497 | BAAI/bge-reranker-v2-m3的一个严重badcase: | ... | ... |
frontend/static/js/app.js
| ... | ... | @@ -546,22 +546,25 @@ function buildProductDebugHtml({ debug, result, spuId, tenantId }) { |
| 546 | 546 | ${buildStageCard('Fine Rank', 'Lightweight reranker output', [ |
| 547 | 547 | { label: 'rank', value: fineStage.rank ?? 'N/A' }, |
| 548 | 548 | { label: 'rank_change', value: fineStage.rank_change ?? 'N/A' }, |
| 549 | - { label: 'fine_score', value: formatDebugNumber(fineStage.score ?? debug.fine_score) }, | |
| 550 | - ], renderJsonDetails('Fine Input', fineStage.rerank_input ?? debug.rerank_input, false))} | |
| 549 | + { label: 'stage_score', value: formatDebugNumber(fineStage.score ?? debug.score) }, | |
| 550 | + { label: 'fine_score', value: formatDebugNumber(fineStage.fine_score ?? debug.fine_score) }, | |
| 551 | + { label: 'text_score', value: formatDebugNumber(fineStage.text_score ?? debug.text_score) }, | |
| 552 | + { label: 'knn_score', value: formatDebugNumber(fineStage.knn_score ?? debug.knn_score) }, | |
| 553 | + ], `${renderJsonDetails('Fine Fusion', fineStage.fusion_summary || debug.fusion_summary || fineStage.fusion_factors, false)}${renderJsonDetails('Fine Input', fineStage.rerank_input ?? debug.rerank_input, false)}`)} | |
| 551 | 554 | ${buildStageCard('Final Rerank', 'Heavy reranker + final fusion', [ |
| 552 | 555 | { label: 'rank', value: rerankStage.rank ?? finalPageStage.rank ?? debug.final_rank ?? 'N/A' }, |
| 553 | 556 | { label: 'rank_change', value: rerankStage.rank_change ?? finalPageStage.rank_change ?? 'N/A' }, |
| 557 | + { label: 'stage_score', value: formatDebugNumber(rerankStage.score ?? rerankStage.fused_score ?? debug.score) }, | |
| 554 | 558 | { label: 'rerank_score', value: formatDebugNumber(rerankStage.rerank_score ?? debug.rerank_score) }, |
| 559 | + { label: 'fine_score', value: formatDebugNumber(rerankStage.fine_score ?? debug.fine_score) }, | |
| 555 | 560 | { label: 'text_score', value: formatDebugNumber(rerankStage.text_score ?? debug.text_score) }, |
| 556 | 561 | { label: 'knn_score', value: formatDebugNumber(rerankStage.knn_score ?? debug.knn_score) }, |
| 557 | - { label: 'text_source', value: formatDebugNumber(rerankStage.signals?.text_source_score ?? debug.text_source_score) }, | |
| 558 | - { label: 'text_translation', value: formatDebugNumber(rerankStage.signals?.text_translation_score ?? debug.text_translation_score) }, | |
| 559 | 562 | { label: 'fine_factor', value: formatDebugNumber(rerankStage.fine_factor ?? debug.fine_factor) }, |
| 560 | 563 | { label: 'rerank_factor', value: formatDebugNumber(rerankStage.rerank_factor ?? debug.rerank_factor) }, |
| 561 | 564 | { label: 'text_factor', value: formatDebugNumber(rerankStage.text_factor ?? debug.text_factor) }, |
| 562 | 565 | { label: 'knn_factor', value: formatDebugNumber(rerankStage.knn_factor ?? debug.knn_factor) }, |
| 563 | 566 | { label: 'fused_score', value: formatDebugNumber(rerankStage.fused_score ?? debug.fused_score) }, |
| 564 | - ], renderJsonDetails('Rerank Signals', rerankStage.signals, false))} | |
| 567 | + ], `${renderJsonDetails('Final Fusion', rerankStage.fusion_summary || debug.fusion_summary || rerankStage.fusion_factors, false)}${renderJsonDetails('Rerank Signals', rerankStage.signals, false)}`)} | |
| 565 | 568 | </div> |
| 566 | 569 | `; |
| 567 | 570 | ... | ... |
indexer/ANCHORS_AND_SEMANTIC_ATTRIBUTES.md
| 1 | -## qanchors 与 semantic_attributes 设计与索引逻辑说明 | |
| 1 | +## qanchors 与 enriched_attributes 设计与索引逻辑说明 | |
| 2 | 2 | |
| 3 | 3 | 本文档详细说明: |
| 4 | 4 | |
| 5 | 5 | - **锚文本字段 `qanchors.{lang}` 的作用与来源** |
| 6 | -- **语义属性字段 `semantic_attributes` 的结构、用途与写入流程** | |
| 6 | +- **语义属性字段 `enriched_attributes` 的结构、用途与写入流程** | |
| 7 | 7 | - **多语言支持策略(zh / en / de / ru / fr)** |
| 8 | 8 | - **索引阶段与 LLM 调用的集成方式** |
| 9 | 9 | |
| ... | ... | @@ -43,13 +43,13 @@ |
| 43 | 43 | - 主搜索:作为额外的全文字段参与 BM25 召回与打分(可在 `search/query_config.py` 中给一定权重); |
| 44 | 44 | - Suggestion:`suggestion/builder.py` 会从 `qanchors.{lang}` 中拆分词条作为候选(`source="qanchor"`,权重大于 `title`)。 |
| 45 | 45 | |
| 46 | -#### 1.2 `semantic_attributes`:面向过滤/分面的通用语义属性 | |
| 46 | +#### 1.2 `enriched_attributes`:面向过滤/分面的通用语义属性 | |
| 47 | 47 | |
| 48 | 48 | - **Mapping 位置**:`mappings/search_products.json`,追加的 nested 字段。 |
| 49 | 49 | - **结构**: |
| 50 | 50 | |
| 51 | 51 | ```1392:1410:/home/tw/saas-search/mappings/search_products.json |
| 52 | -"semantic_attributes": { | |
| 52 | +"enriched_attributes": { | |
| 53 | 53 | "type": "nested", |
| 54 | 54 | "properties": { |
| 55 | 55 | "lang": { "type": "keyword" }, // 语言:zh / en / de / ru / fr |
| ... | ... | @@ -199,7 +199,7 @@ def create_prompt(products: List[Dict[str, str]], target_lang: str = "zh") -> st |
| 199 | 199 | self._fill_llm_attributes(doc, spu_row) |
| 200 | 200 | ``` |
| 201 | 201 | |
| 202 | -也就是说,**每个 SPU 文档默认会尝试补充 qanchors 与 semantic_attributes**。 | |
| 202 | +也就是说,**每个 SPU 文档默认会尝试补充 qanchors 与 enriched_attributes**。 | |
| 203 | 203 | |
| 204 | 204 | #### 3.2 语言选择策略 |
| 205 | 205 | |
| ... | ... | @@ -237,7 +237,7 @@ def create_prompt(products: List[Dict[str, str]], target_lang: str = "zh") -> st |
| 237 | 237 | if not spu_id or not title: |
| 238 | 238 | return |
| 239 | 239 | |
| 240 | - semantic_list = doc.get("semantic_attributes") or [] | |
| 240 | + semantic_list = doc.get("enriched_attributes") or [] | |
| 241 | 241 | qanchors_obj = doc.get("qanchors") or {} |
| 242 | 242 | |
| 243 | 243 | dim_keys = [ |
| ... | ... | @@ -291,7 +291,7 @@ def create_prompt(products: List[Dict[str, str]], target_lang: str = "zh") -> st |
| 291 | 291 | if qanchors_obj: |
| 292 | 292 | doc["qanchors"] = qanchors_obj |
| 293 | 293 | if semantic_list: |
| 294 | - doc["semantic_attributes"] = semantic_list | |
| 294 | + doc["enriched_attributes"] = semantic_list | |
| 295 | 295 | ``` |
| 296 | 296 | |
| 297 | 297 | 要点: |
| ... | ... | @@ -307,7 +307,7 @@ def create_prompt(products: List[Dict[str, str]], target_lang: str = "zh") -> st |
| 307 | 307 | - 没有 `title`; |
| 308 | 308 | - 或者 `tenant_config.index_languages` 与 `SUPPORTED_LANGS` 没有交集; |
| 309 | 309 | - 或 `DASHSCOPE_API_KEY` 未配置 / LLM 请求报错; |
| 310 | -- 则 `_fill_llm_attributes` 会在日志中输出 `warning`,**不会抛异常**,索引流程继续,只是该 SPU 在这一轮不会得到 `qanchors` / `semantic_attributes`。 | |
| 310 | +- 则 `_fill_llm_attributes` 会在日志中输出 `warning`,**不会抛异常**,索引流程继续,只是该 SPU 在这一轮不会得到 `qanchors` / `enriched_attributes`。 | |
| 311 | 311 | |
| 312 | 312 | 这保证了整个索引服务在 LLM 不可用时表现为一个普通的“传统索引”,而不会中断。 |
| 313 | 313 | |
| ... | ... | @@ -344,13 +344,13 @@ def create_prompt(products: List[Dict[str, str]], target_lang: str = "zh") -> st |
| 344 | 344 | ```json |
| 345 | 345 | { |
| 346 | 346 | "nested": { |
| 347 | - "path": "semantic_attributes", | |
| 347 | + "path": "enriched_attributes", | |
| 348 | 348 | "query": { |
| 349 | 349 | "bool": { |
| 350 | 350 | "must": [ |
| 351 | - { "term": { "semantic_attributes.lang": "zh" } }, | |
| 352 | - { "term": { "semantic_attributes.name": "usage_scene" } }, | |
| 353 | - { "term": { "semantic_attributes.value": "通勤" } } | |
| 351 | + { "term": { "enriched_attributes.lang": "zh" } }, | |
| 352 | + { "term": { "enriched_attributes.name": "usage_scene" } }, | |
| 353 | + { "term": { "enriched_attributes.value": "通勤" } } | |
| 354 | 354 | ] |
| 355 | 355 | } |
| 356 | 356 | } |
| ... | ... | @@ -400,7 +400,7 @@ def create_prompt(products: List[Dict[str, str]], target_lang: str = "zh") -> st |
| 400 | 400 | |
| 401 | 401 | 1. **功能定位**: |
| 402 | 402 | - `qanchors.{lang}`:更好地贴近用户真实查询词,用于召回与 suggestion; |
| 403 | - - `semantic_attributes`:以结构化形式承载 LLM 抽取的语义维度,用于 filter / facet。 | |
| 403 | + - `enriched_attributes`:以结构化形式承载 LLM 抽取的语义维度,用于 filter / facet。 | |
| 404 | 404 | 2. **多语言对齐**: |
| 405 | 405 | - 完全复用租户级 `index_languages` 配置; |
| 406 | 406 | - 对每种语言单独生成锚文本与语义属性,不互相混用。 |
| ... | ... | @@ -409,7 +409,7 @@ def create_prompt(products: List[Dict[str, str]], target_lang: str = "zh") -> st |
| 409 | 409 | - 当 LLM/配置异常时,只是“缺少增强特征”,不影响基础搜索能力。 |
| 410 | 410 | 4. **未来扩展**: |
| 411 | 411 | - 可以在 `dim_keys` 中新增维度名(如 `style`, `benefit` 等),只要在 prompt 与解析逻辑中增加对应列即可; |
| 412 | - - 可以为 `semantic_attributes` 增加额外字段(如 `confidence`、`source`),用于更精细的控制(当前 mapping 为简单版)。 | |
| 412 | + - 可以为 `enriched_attributes` 增加额外字段(如 `confidence`、`source`),用于更精细的控制(当前 mapping 为简单版)。 | |
| 413 | 413 | |
| 414 | -如需在查询层面增加基于 `semantic_attributes` 的统一 DSL(类似 `specifications` 的过滤/分面规则),推荐在 `docs/搜索API对接指南.md` 中新增一节,并在 `search/es_query_builder.py` 里封装构造逻辑,避免前端直接拼 nested 查询。 | |
| 414 | +如需在查询层面增加基于 `enriched_attributes` 的统一 DSL(类似 `specifications` 的过滤/分面规则),推荐在 `docs/搜索API对接指南.md` 中新增一节,并在 `search/es_query_builder.py` 里封装构造逻辑,避免前端直接拼 nested 查询。 | |
| 415 | 415 | ... | ... |
indexer/document_transformer.py
| ... | ... | @@ -11,9 +11,8 @@ SPU文档转换器 - 公共转换逻辑。 |
| 11 | 11 | import pandas as pd |
| 12 | 12 | import numpy as np |
| 13 | 13 | import logging |
| 14 | -import re | |
| 15 | 14 | from typing import Dict, Any, Optional, List |
| 16 | -from indexer.product_enrich import analyze_products, split_multi_value_field | |
| 15 | +from indexer.product_enrich import build_index_content_fields | |
| 17 | 16 | |
| 18 | 17 | logger = logging.getLogger(__name__) |
| 19 | 18 | |
| ... | ... | @@ -75,6 +74,39 @@ class SPUDocumentTransformer: |
| 75 | 74 | ) |
| 76 | 75 | return translations |
| 77 | 76 | |
| 77 | + def _build_core_language_text_object( | |
| 78 | + self, | |
| 79 | + text: Optional[str], | |
| 80 | + source_lang: str, | |
| 81 | + scene: str = "general", | |
| 82 | + ) -> Dict[str, str]: | |
| 83 | + """ | |
| 84 | + 构建与 mapping 中 core_language_text(_with_keyword) 对齐的对象。 | |
| 85 | + 当前核心语言固定为 zh/en。 | |
| 86 | + """ | |
| 87 | + if not text or not str(text).strip(): | |
| 88 | + return {} | |
| 89 | + | |
| 90 | + source_text = str(text).strip() | |
| 91 | + obj: Dict[str, str] = {} | |
| 92 | + | |
| 93 | + if source_lang in CORE_INDEX_LANGUAGES: | |
| 94 | + obj[source_lang] = source_text | |
| 95 | + | |
| 96 | + if self.translator: | |
| 97 | + translations = self._translate_index_languages( | |
| 98 | + text=source_text, | |
| 99 | + source_lang=source_lang, | |
| 100 | + index_languages=CORE_INDEX_LANGUAGES, | |
| 101 | + scene=scene, | |
| 102 | + ) | |
| 103 | + for lang in CORE_INDEX_LANGUAGES: | |
| 104 | + val = translations.get(lang) | |
| 105 | + if val and str(val).strip(): | |
| 106 | + obj[lang] = str(val).strip() | |
| 107 | + | |
| 108 | + return obj | |
| 109 | + | |
| 78 | 110 | def transform_spu_to_doc( |
| 79 | 111 | self, |
| 80 | 112 | tenant_id: str, |
| ... | ... | @@ -118,10 +150,16 @@ class SPUDocumentTransformer: |
| 118 | 150 | if self.enable_title_embedding and self.encoder: |
| 119 | 151 | self._fill_title_embedding(doc) |
| 120 | 152 | |
| 121 | - # Tags | |
| 153 | + # Tags:统一转成与 mapping 一致的 core-language object | |
| 122 | 154 | if pd.notna(spu_row.get('tags')): |
| 123 | 155 | tags_str = str(spu_row['tags']) |
| 124 | - doc['tags'] = split_multi_value_field(tags_str) | |
| 156 | + tags_obj = self._build_core_language_text_object( | |
| 157 | + tags_str, | |
| 158 | + source_lang=primary_lang, | |
| 159 | + scene="general", | |
| 160 | + ) | |
| 161 | + if tags_obj: | |
| 162 | + doc['tags'] = tags_obj | |
| 125 | 163 | |
| 126 | 164 | # Category相关字段 |
| 127 | 165 | self._fill_category_fields(doc, spu_row) |
| ... | ... | @@ -202,7 +240,8 @@ class SPUDocumentTransformer: |
| 202 | 240 | """ |
| 203 | 241 | 批量调用 LLM,为一批 doc 填充: |
| 204 | 242 | - qanchors.{lang} |
| 205 | - - semantic_attributes (lang/name/value) | |
| 243 | + - tags.{lang} | |
| 244 | + - enriched_attributes[].value.{lang} | |
| 206 | 245 | |
| 207 | 246 | 设计目标: |
| 208 | 247 | - 尽可能攒批调用 LLM; |
| ... | ... | @@ -211,16 +250,8 @@ class SPUDocumentTransformer: |
| 211 | 250 | if not docs or not spu_rows or len(docs) != len(spu_rows): |
| 212 | 251 | return |
| 213 | 252 | |
| 214 | - try: | |
| 215 | - index_langs = self.tenant_config.get("index_languages") or ["en", "zh"] | |
| 216 | - except Exception: | |
| 217 | - index_langs = ["en", "zh"] | |
| 218 | - # 不再限制为固定 SUPPORTED_LANGS,直接按照租户配置的 index_languages 调用 | |
| 219 | - llm_langs = list(dict.fromkeys(index_langs)) # 去重并保持顺序 | |
| 220 | - | |
| 221 | - # 只对有 title 的 SPU 参与 LLM;其余跳过 | |
| 222 | 253 | id_to_idx: Dict[str, int] = {} |
| 223 | - products: List[Dict[str, str]] = [] | |
| 254 | + items: List[Dict[str, str]] = [] | |
| 224 | 255 | for i, row in enumerate(spu_rows): |
| 225 | 256 | raw_id = row.get("id") |
| 226 | 257 | spu_id = "" if raw_id is None else str(raw_id).strip() |
| ... | ... | @@ -228,69 +259,45 @@ class SPUDocumentTransformer: |
| 228 | 259 | if not spu_id or not title: |
| 229 | 260 | continue |
| 230 | 261 | id_to_idx[spu_id] = i |
| 231 | - products.append({"id": spu_id, "title": title}) | |
| 232 | - if not products: | |
| 262 | + items.append( | |
| 263 | + { | |
| 264 | + "id": spu_id, | |
| 265 | + "title": title, | |
| 266 | + "brief": str(row.get("brief") or "").strip(), | |
| 267 | + "description": str(row.get("description") or "").strip(), | |
| 268 | + "image_url": str(row.get("image_src") or "").strip(), | |
| 269 | + } | |
| 270 | + ) | |
| 271 | + if not items: | |
| 233 | 272 | return |
| 234 | 273 | |
| 235 | 274 | tenant_id = str(docs[0].get("tenant_id") or "").strip() or None |
| 275 | + try: | |
| 276 | + results = build_index_content_fields(items=items, tenant_id=tenant_id) | |
| 277 | + except Exception as e: | |
| 278 | + logger.warning("LLM batch attribute fill failed: %s", e) | |
| 279 | + return | |
| 236 | 280 | |
| 237 | - dim_keys = [ | |
| 238 | - "tags", | |
| 239 | - "target_audience", | |
| 240 | - "usage_scene", | |
| 241 | - "season", | |
| 242 | - "key_attributes", | |
| 243 | - "material", | |
| 244 | - "features", | |
| 245 | - ] | |
| 246 | - | |
| 247 | - for lang in llm_langs: | |
| 248 | - try: | |
| 249 | - rows = analyze_products( | |
| 250 | - products=products, | |
| 251 | - target_lang=lang, | |
| 252 | - batch_size=20, | |
| 253 | - tenant_id=tenant_id, | |
| 254 | - ) | |
| 255 | - except Exception as e: | |
| 256 | - logger.warning("LLM batch attribute fill failed (lang=%s): %s", lang, e) | |
| 281 | + for result in results: | |
| 282 | + spu_id = str(result.get("id") or "").strip() | |
| 283 | + if not spu_id: | |
| 257 | 284 | continue |
| 285 | + idx = id_to_idx.get(spu_id) | |
| 286 | + if idx is None: | |
| 287 | + continue | |
| 288 | + self._apply_content_enrichment(docs[idx], result) | |
| 258 | 289 | |
| 259 | - for row in rows or []: | |
| 260 | - spu_id = str(row.get("id") or "").strip() | |
| 261 | - if not spu_id: | |
| 262 | - continue | |
| 263 | - idx = id_to_idx.get(spu_id) | |
| 264 | - if idx is None: | |
| 265 | - continue | |
| 266 | - self._apply_llm_row(docs[idx], row=row, lang=lang, dim_keys=dim_keys) | |
| 267 | - | |
| 268 | - def _apply_llm_row(self, doc: Dict[str, Any], row: Dict[str, Any], lang: str, dim_keys: List[str]) -> None: | |
| 269 | - """将单条 LLM 输出 row 按既定结构写入 doc(不抛异常)。""" | |
| 290 | + def _apply_content_enrichment(self, doc: Dict[str, Any], enrichment: Dict[str, Any]) -> None: | |
| 291 | + """将 product_enrich 产出的 ES-ready 内容字段写入 doc。""" | |
| 270 | 292 | try: |
| 271 | - if row.get("error"): | |
| 272 | - return | |
| 273 | - | |
| 274 | - semantic_list = doc.get("semantic_attributes") or [] | |
| 275 | - qanchors_obj = doc.get("qanchors") or {} | |
| 276 | - | |
| 277 | - anchor_text = str(row.get("anchor_text") or "").strip() | |
| 278 | - if anchor_text: | |
| 279 | - qanchors_obj[lang] = anchor_text | |
| 280 | - | |
| 281 | - for name in dim_keys: | |
| 282 | - raw = row.get(name) | |
| 283 | - if not raw: | |
| 284 | - continue | |
| 285 | - for value in split_multi_value_field(str(raw)): | |
| 286 | - semantic_list.append({"lang": lang, "name": name, "value": value}) | |
| 287 | - | |
| 288 | - if qanchors_obj: | |
| 289 | - doc["qanchors"] = qanchors_obj | |
| 290 | - if semantic_list: | |
| 291 | - doc["semantic_attributes"] = semantic_list | |
| 293 | + if enrichment.get("qanchors"): | |
| 294 | + doc["qanchors"] = enrichment["qanchors"] | |
| 295 | + if enrichment.get("tags"): | |
| 296 | + doc["tags"] = enrichment["tags"] | |
| 297 | + if enrichment.get("enriched_attributes"): | |
| 298 | + doc["enriched_attributes"] = enrichment["enriched_attributes"] | |
| 292 | 299 | except Exception as e: |
| 293 | - logger.warning("Failed to apply LLM row to doc (spu_id=%s, lang=%s): %s", doc.get("spu_id"), lang, e) | |
| 300 | + logger.warning("Failed to apply enrichment to doc (spu_id=%s): %s", doc.get("spu_id"), e) | |
| 294 | 301 | |
| 295 | 302 | def _fill_text_fields( |
| 296 | 303 | self, |
| ... | ... | @@ -544,6 +551,23 @@ class SPUDocumentTransformer: |
| 544 | 551 | if pd.notna(position) and pd.notna(name): |
| 545 | 552 | option_name_map[int(position)] = str(name) |
| 546 | 553 | |
| 554 | + primary_lang = self.tenant_config.get('primary_language', 'en') | |
| 555 | + | |
| 556 | + def _build_specification(name: str, raw_value: Any, sku_id: str) -> Optional[Dict[str, Any]]: | |
| 557 | + value = "" if raw_value is None else str(raw_value).strip() | |
| 558 | + if not value: | |
| 559 | + return None | |
| 560 | + return { | |
| 561 | + 'sku_id': sku_id, | |
| 562 | + 'name': name, | |
| 563 | + 'value_keyword': value, | |
| 564 | + 'value_text': self._build_core_language_text_object( | |
| 565 | + value, | |
| 566 | + source_lang=primary_lang, | |
| 567 | + scene="general", | |
| 568 | + ) or normalize_core_text_field_value(value, primary_lang), | |
| 569 | + } | |
| 570 | + | |
| 547 | 571 | for _, sku_row in skus.iterrows(): |
| 548 | 572 | sku_data = self._transform_sku_row(sku_row, option_name_map) |
| 549 | 573 | if sku_data: |
| ... | ... | @@ -584,23 +608,17 @@ class SPUDocumentTransformer: |
| 584 | 608 | # 构建specifications(从SKU的option值和option表的name) |
| 585 | 609 | sku_id = str(sku_row['id']) |
| 586 | 610 | if pd.notna(sku_row.get('option1')) and 1 in option_name_map: |
| 587 | - specifications.append({ | |
| 588 | - 'sku_id': sku_id, | |
| 589 | - 'name': option_name_map[1], | |
| 590 | - 'value': str(sku_row['option1']) | |
| 591 | - }) | |
| 611 | + spec = _build_specification(option_name_map[1], sku_row['option1'], sku_id) | |
| 612 | + if spec: | |
| 613 | + specifications.append(spec) | |
| 592 | 614 | if pd.notna(sku_row.get('option2')) and 2 in option_name_map: |
| 593 | - specifications.append({ | |
| 594 | - 'sku_id': sku_id, | |
| 595 | - 'name': option_name_map[2], | |
| 596 | - 'value': str(sku_row['option2']) | |
| 597 | - }) | |
| 615 | + spec = _build_specification(option_name_map[2], sku_row['option2'], sku_id) | |
| 616 | + if spec: | |
| 617 | + specifications.append(spec) | |
| 598 | 618 | if pd.notna(sku_row.get('option3')) and 3 in option_name_map: |
| 599 | - specifications.append({ | |
| 600 | - 'sku_id': sku_id, | |
| 601 | - 'name': option_name_map[3], | |
| 602 | - 'value': str(sku_row['option3']) | |
| 603 | - }) | |
| 619 | + spec = _build_specification(option_name_map[3], sku_row['option3'], sku_id) | |
| 620 | + if spec: | |
| 621 | + specifications.append(spec) | |
| 604 | 622 | |
| 605 | 623 | return skus_list, prices, compare_prices, sku_prices, sku_weights, sku_weight_units, total_inventory, specifications |
| 606 | 624 | |
| ... | ... | @@ -636,82 +654,36 @@ class SPUDocumentTransformer: |
| 636 | 654 | |
| 637 | 655 | def _fill_llm_attributes(self, doc: Dict[str, Any], spu_row: pd.Series) -> None: |
| 638 | 656 | """ |
| 639 | - 调用 indexer.product_enrich.analyze_products,为当前 SPU 填充: | |
| 657 | + 调用 indexer.product_enrich 的高层内容理解入口,为当前 SPU 填充: | |
| 640 | 658 | - qanchors.{lang} |
| 641 | - - semantic_attributes (lang/name/value) | |
| 659 | + - tags.{lang} | |
| 660 | + - enriched_attributes[].value.{lang} | |
| 642 | 661 | """ |
| 643 | - try: | |
| 644 | - index_langs = self.tenant_config.get("index_languages") or ["en", "zh"] | |
| 645 | - except Exception: | |
| 646 | - index_langs = ["en", "zh"] | |
| 647 | - | |
| 648 | - # 不再限制为固定 SUPPORTED_LANGS,直接按照租户配置的 index_languages 调用 | |
| 649 | - llm_langs = list(dict.fromkeys(index_langs)) # 去重并保持顺序 | |
| 650 | - | |
| 651 | 662 | spu_id = str(spu_row.get("id") or "").strip() |
| 652 | 663 | title = str(spu_row.get("title") or "").strip() |
| 653 | 664 | if not spu_id or not title: |
| 654 | 665 | return |
| 655 | 666 | |
| 656 | - semantic_list = doc.get("semantic_attributes") or [] | |
| 657 | - qanchors_obj = doc.get("qanchors") or {} | |
| 658 | - | |
| 659 | - dim_keys = [ | |
| 660 | - "tags", | |
| 661 | - "target_audience", | |
| 662 | - "usage_scene", | |
| 663 | - "season", | |
| 664 | - "key_attributes", | |
| 665 | - "material", | |
| 666 | - "features", | |
| 667 | - ] | |
| 668 | - | |
| 669 | 667 | tenant_id = doc.get("tenant_id") |
| 668 | + try: | |
| 669 | + results = build_index_content_fields( | |
| 670 | + items=[ | |
| 671 | + { | |
| 672 | + "id": spu_id, | |
| 673 | + "title": title, | |
| 674 | + "brief": str(spu_row.get("brief") or "").strip(), | |
| 675 | + "description": str(spu_row.get("description") or "").strip(), | |
| 676 | + "image_url": str(spu_row.get("image_src") or "").strip(), | |
| 677 | + } | |
| 678 | + ], | |
| 679 | + tenant_id=str(tenant_id), | |
| 680 | + ) | |
| 681 | + except Exception as e: | |
| 682 | + logger.warning("LLM attribute fill failed for SPU %s: %s", spu_id, e) | |
| 683 | + return | |
| 670 | 684 | |
| 671 | - for lang in llm_langs: | |
| 672 | - try: | |
| 673 | - rows = analyze_products( | |
| 674 | - products=[{"id": spu_id, "title": title}], | |
| 675 | - target_lang=lang, | |
| 676 | - batch_size=1, | |
| 677 | - tenant_id=str(tenant_id), | |
| 678 | - ) | |
| 679 | - except Exception as e: | |
| 680 | - logger.warning( | |
| 681 | - "LLM attribute fill failed for SPU %s, lang=%s: %s", | |
| 682 | - spu_id, | |
| 683 | - lang, | |
| 684 | - e, | |
| 685 | - ) | |
| 686 | - continue | |
| 687 | - | |
| 688 | - if not rows: | |
| 689 | - continue | |
| 690 | - row = rows[0] or {} | |
| 691 | - | |
| 692 | - # qanchors.{lang} | |
| 693 | - anchor_text = str(row.get("anchor_text") or "").strip() | |
| 694 | - if anchor_text: | |
| 695 | - qanchors_obj[lang] = anchor_text | |
| 696 | - | |
| 697 | - # 语义属性:按各维度拆分为短语 | |
| 698 | - for name in dim_keys: | |
| 699 | - raw = row.get(name) | |
| 700 | - if not raw: | |
| 701 | - continue | |
| 702 | - for value in split_multi_value_field(str(raw)): | |
| 703 | - semantic_list.append( | |
| 704 | - { | |
| 705 | - "lang": lang, | |
| 706 | - "name": name, | |
| 707 | - "value": value, | |
| 708 | - } | |
| 709 | - ) | |
| 710 | - | |
| 711 | - if qanchors_obj: | |
| 712 | - doc["qanchors"] = qanchors_obj | |
| 713 | - if semantic_list: | |
| 714 | - doc["semantic_attributes"] = semantic_list | |
| 685 | + if results: | |
| 686 | + self._apply_content_enrichment(doc, results[0]) | |
| 715 | 687 | |
| 716 | 688 | def _transform_sku_row(self, sku_row: pd.Series, option_name_map: Dict[int, str] = None) -> Optional[Dict[str, Any]]: |
| 717 | 689 | """ | ... | ... |
indexer/product_enrich.py
| ... | ... | @@ -146,6 +146,16 @@ if _missing_prompt_langs: |
| 146 | 146 | |
| 147 | 147 | # 多值字段分隔:英文逗号、中文逗号、顿号,及历史约定的 ; | / 与空白 |
| 148 | 148 | _MULTI_VALUE_FIELD_SPLIT_RE = re.compile(r"[,、,;|/\n\t]+") |
| 149 | +_CORE_INDEX_LANGUAGES = ("zh", "en") | |
| 150 | +_ENRICHED_ATTRIBUTE_DIMENSIONS = ( | |
| 151 | + "enriched_tags", | |
| 152 | + "target_audience", | |
| 153 | + "usage_scene", | |
| 154 | + "season", | |
| 155 | + "key_attributes", | |
| 156 | + "material", | |
| 157 | + "features", | |
| 158 | +) | |
| 149 | 159 | |
| 150 | 160 | |
| 151 | 161 | def split_multi_value_field(text: Optional[str]) -> List[str]: |
| ... | ... | @@ -158,6 +168,124 @@ def split_multi_value_field(text: Optional[str]) -> List[str]: |
| 158 | 168 | return [p.strip() for p in _MULTI_VALUE_FIELD_SPLIT_RE.split(s) if p.strip()] |
| 159 | 169 | |
| 160 | 170 | |
| 171 | +def _append_lang_phrase_map(target: Dict[str, List[str]], lang: str, raw_value: Any) -> None: | |
| 172 | + parts = split_multi_value_field(raw_value) | |
| 173 | + if not parts: | |
| 174 | + return | |
| 175 | + existing = target.get(lang) or [] | |
| 176 | + merged = list(dict.fromkeys([str(x).strip() for x in existing if str(x).strip()] + parts)) | |
| 177 | + if merged: | |
| 178 | + target[lang] = merged | |
| 179 | + | |
| 180 | + | |
| 181 | +def _append_enriched_attribute( | |
| 182 | + target: List[Dict[str, Any]], | |
| 183 | + name: str, | |
| 184 | + lang: str, | |
| 185 | + raw_value: Any, | |
| 186 | +) -> None: | |
| 187 | + for value in split_multi_value_field(raw_value): | |
| 188 | + if any( | |
| 189 | + item.get("name") == name | |
| 190 | + and isinstance(item.get("value"), dict) | |
| 191 | + and item["value"].get(lang) == value | |
| 192 | + for item in target | |
| 193 | + ): | |
| 194 | + continue | |
| 195 | + target.append({"name": name, "value": {lang: value}}) | |
| 196 | + | |
| 197 | + | |
| 198 | +def _apply_index_content_row(result: Dict[str, Any], row: Dict[str, Any], lang: str) -> None: | |
| 199 | + if not row or row.get("error"): | |
| 200 | + return | |
| 201 | + | |
| 202 | + anchor_text = str(row.get("anchor_text") or "").strip() | |
| 203 | + if anchor_text: | |
| 204 | + _append_lang_phrase_map(result["qanchors"], lang=lang, raw_value=anchor_text) | |
| 205 | + | |
| 206 | + for name in _ENRICHED_ATTRIBUTE_DIMENSIONS: | |
| 207 | + raw = row.get(name) | |
| 208 | + if not raw: | |
| 209 | + continue | |
| 210 | + _append_enriched_attribute(result["enriched_attributes"], name=name, lang=lang, raw_value=raw) | |
| 211 | + if name == "enriched_tags": | |
| 212 | + _append_lang_phrase_map(result["enriched_tags"], lang=lang, raw_value=raw) | |
| 213 | + | |
| 214 | + | |
| 215 | +def _normalize_index_content_item(item: Dict[str, Any]) -> Dict[str, str]: | |
| 216 | + item_id = str(item.get("id") or item.get("spu_id") or "").strip() | |
| 217 | + return { | |
| 218 | + "id": item_id, | |
| 219 | + "title": str(item.get("title") or "").strip(), | |
| 220 | + "brief": str(item.get("brief") or "").strip(), | |
| 221 | + "description": str(item.get("description") or "").strip(), | |
| 222 | + "image_url": str(item.get("image_url") or "").strip(), | |
| 223 | + } | |
| 224 | + | |
| 225 | + | |
| 226 | +def build_index_content_fields( | |
| 227 | + items: List[Dict[str, Any]], | |
| 228 | + tenant_id: Optional[str] = None, | |
| 229 | +) -> List[Dict[str, Any]]: | |
| 230 | + """ | |
| 231 | + 高层入口:生成与 ES mapping 对齐的内容理解字段。 | |
| 232 | + | |
| 233 | + 输入项需包含: | |
| 234 | + - `id` 或 `spu_id` | |
| 235 | + - `title` | |
| 236 | + - 可选 `brief` / `description` / `image_url` | |
| 237 | + | |
| 238 | + 返回项结构: | |
| 239 | + - `id` | |
| 240 | + - `qanchors` | |
| 241 | + - `enriched_tags` | |
| 242 | + - `enriched_attributes` | |
| 243 | + - 可选 `error` | |
| 244 | + | |
| 245 | + 其中: | |
| 246 | + - `qanchors.{lang}` 为短语数组 | |
| 247 | + - `enriched_tags.{lang}` 为标签数组 | |
| 248 | + """ | |
| 249 | + normalized_items = [_normalize_index_content_item(item) for item in items] | |
| 250 | + if not normalized_items: | |
| 251 | + return [] | |
| 252 | + | |
| 253 | + results_by_id: Dict[str, Dict[str, Any]] = { | |
| 254 | + item["id"]: { | |
| 255 | + "id": item["id"], | |
| 256 | + "qanchors": {}, | |
| 257 | + "enriched_tags": {}, | |
| 258 | + "enriched_attributes": [], | |
| 259 | + } | |
| 260 | + for item in normalized_items | |
| 261 | + } | |
| 262 | + | |
| 263 | + for lang in _CORE_INDEX_LANGUAGES: | |
| 264 | + try: | |
| 265 | + rows = analyze_products( | |
| 266 | + products=normalized_items, | |
| 267 | + target_lang=lang, | |
| 268 | + batch_size=BATCH_SIZE, | |
| 269 | + tenant_id=tenant_id, | |
| 270 | + ) | |
| 271 | + except Exception as e: | |
| 272 | + logger.warning("build_index_content_fields failed for lang=%s: %s", lang, e) | |
| 273 | + for item in normalized_items: | |
| 274 | + results_by_id[item["id"]].setdefault("error", str(e)) | |
| 275 | + continue | |
| 276 | + | |
| 277 | + for row in rows or []: | |
| 278 | + item_id = str(row.get("id") or "").strip() | |
| 279 | + if not item_id or item_id not in results_by_id: | |
| 280 | + continue | |
| 281 | + if row.get("error"): | |
| 282 | + results_by_id[item_id].setdefault("error", row["error"]) | |
| 283 | + continue | |
| 284 | + _apply_index_content_row(results_by_id[item_id], row=row, lang=lang) | |
| 285 | + | |
| 286 | + return [results_by_id[item["id"]] for item in normalized_items] | |
| 287 | + | |
| 288 | + | |
| 161 | 289 | def _normalize_space(text: str) -> str: |
| 162 | 290 | return re.sub(r"\s+", " ", (text or "").strip()) |
| 163 | 291 | |
| ... | ... | @@ -526,7 +654,7 @@ def parse_markdown_table(markdown_content: str) -> List[Dict[str, str]]: |
| 526 | 654 | "seq_no": parts[0], |
| 527 | 655 | "title": parts[1], # 商品标题(按目标语言) |
| 528 | 656 | "category_path": parts[2] if len(parts) > 2 else "", # 品类路径 |
| 529 | - "tags": parts[3] if len(parts) > 3 else "", # 细分标签 | |
| 657 | + "enriched_tags": parts[3] if len(parts) > 3 else "", # 细分标签 | |
| 530 | 658 | "target_audience": parts[4] if len(parts) > 4 else "", # 适用人群 |
| 531 | 659 | "usage_scene": parts[5] if len(parts) > 5 else "", # 使用场景 |
| 532 | 660 | "season": parts[6] if len(parts) > 6 else "", # 适用季节 |
| ... | ... | @@ -603,7 +731,7 @@ def process_batch( |
| 603 | 731 | "title_input": item.get("title", ""), |
| 604 | 732 | "title": "", |
| 605 | 733 | "category_path": "", |
| 606 | - "tags": "", | |
| 734 | + "enriched_tags": "", | |
| 607 | 735 | "target_audience": "", |
| 608 | 736 | "usage_scene": "", |
| 609 | 737 | "season": "", |
| ... | ... | @@ -643,7 +771,7 @@ def process_batch( |
| 643 | 771 | "title_input": batch_data[i]["title"], # 原始输入标题 |
| 644 | 772 | "title": parsed_item.get("title", ""), # 模型生成的标题 |
| 645 | 773 | "category_path": parsed_item.get("category_path", ""), # 品类路径 |
| 646 | - "tags": parsed_item.get("tags", ""), # 细分标签 | |
| 774 | + "enriched_tags": parsed_item.get("enriched_tags", ""), # 细分标签 | |
| 647 | 775 | "target_audience": parsed_item.get("target_audience", ""), # 适用人群 |
| 648 | 776 | "usage_scene": parsed_item.get("usage_scene", ""), # 使用场景 |
| 649 | 777 | "season": parsed_item.get("season", ""), # 适用季节 |
| ... | ... | @@ -686,7 +814,7 @@ def process_batch( |
| 686 | 814 | "title_input": item["title"], |
| 687 | 815 | "title": "", |
| 688 | 816 | "category_path": "", |
| 689 | - "tags": "", | |
| 817 | + "enriched_tags": "", | |
| 690 | 818 | "target_audience": "", |
| 691 | 819 | "usage_scene": "", |
| 692 | 820 | "season": "", | ... | ... |
mappings/README.md
| ... | ... | @@ -2,32 +2,285 @@ |
| 2 | 2 | |
| 3 | 3 | ## 概述 |
| 4 | 4 | |
| 5 | -所有租户共享同一个ES mapping结构,直接使用手写的JSON文件,无需通过config.yaml生成。 | |
| 5 | +所有租户共享同一个 Elasticsearch mapping 结构。 | |
| 6 | 6 | |
| 7 | -## Mapping文件 | |
| 7 | +当前目录采用“声明式 Python 规格 + 字段模板 + 最终 JSON 产物”的方式维护 `search_products` 的索引定义: | |
| 8 | 8 | |
| 9 | -- `search_products.json`: 完整的ES索引配置,包括settings和mappings | |
| 9 | +- `generate_search_products_mapping.py`: 唯一的生成源,包含字段模板、语言列表、分析器配置和递归生成逻辑 | |
| 10 | +- `search_products.json`: 由脚本生成的完整 ES 索引配置,包括 `settings` 和 `mappings` | |
| 11 | +- `search_suggestions.json`: 搜索建议索引配置 | |
| 10 | 12 | |
| 11 | -## 使用方式 | |
| 13 | +默认应修改生成脚本中的规格定义,而不是手工编辑 `search_products.json`。 | |
| 12 | 14 | |
| 13 | -### 创建索引 | |
| 15 | +## 字段抽象 | |
| 16 | + | |
| 17 | +脚本从业务语义上抽象出 4 类文本模板: | |
| 18 | + | |
| 19 | +- `all_language_text`: 全语言字段,不带 `keyword` | |
| 20 | +- `all_language_text_with_keyword`: 全语言字段,所有受支持语言都带 `keyword` | |
| 21 | +- `core_language_text`: 核心索引语言字段,不带 `keyword` | |
| 22 | +- `core_language_text_with_keyword`: 核心索引语言字段,核心语言都带 `keyword` | |
| 23 | + | |
| 24 | +之所以称为“核心索引语言”,并不是因为系统只支持这两种语言,而是因为所有店铺、所有商品都必须至少产出这两种语言的索引内容。目前核心索引语言固定为: | |
| 25 | + | |
| 26 | +- `zh` | |
| 27 | +- `en` | |
| 28 | + | |
| 29 | +“全语言”表示 mapping 为原始商品语言预留了更多语言槽位。商品实际灌入时,不要求每个字段把所有语言都填满,只要求: | |
| 30 | + | |
| 31 | +- 核心索引语言字段必须填充 `zh` 和 `en` | |
| 32 | +- 全语言字段必须填充 `zh` 和 `en` | |
| 33 | +- 如果商品原始语言属于受支持语言,还应额外填充对应的原始语言字段,例如 `ru` | |
| 34 | + | |
| 35 | +当前字段大致分为几类: | |
| 36 | + | |
| 37 | +- 全语言字段:`title`、`keywords`、`brief`、`description`、`vendor`、`category_path`、`category_name_text` | |
| 38 | +- 核心索引语言字段:`qanchors`、`enriched_tags`、`option1_values`、`option2_values`、`option3_values`、`enriched_attributes.value`、`specifications.value_text` | |
| 39 | +- 复合嵌套字段:`image_embedding`、`specifications`、`enriched_attributes`、`skus` | |
| 40 | +- 其他标量字段:`tenant_id`、`spu_id`、价格、库存、类目等 | |
| 41 | + | |
| 42 | +生成规则里的几个基础约束: | |
| 43 | + | |
| 44 | +- 中文字段使用 `index_ik`,并额外设置 `search_analyzer: query_ik` | |
| 45 | +- 非中文语言使用各自的 Elasticsearch 内置 analyzer | |
| 46 | +- 带 `with_keyword` 的模板会为对应语言增加 `.keyword` | |
| 47 | +- `settings.analysis`、`normalizer`、`similarity` 也属于生成结果的一部分,不能只维护 `mappings.properties` | |
| 48 | + | |
| 49 | +## 索引灌入指引 | |
| 50 | + | |
| 51 | +### 基本原则 | |
| 52 | + | |
| 53 | +1. 所有商品都必须生成核心索引语言版本,也就是 `zh` 和 `en`。 | |
| 54 | +2. 全语言字段除了必须有 `zh` 和 `en`,还应尽量保留商品原始语言版本。 | |
| 55 | +3. 如果商品原始语言本身就是 `zh` 或 `en`,则原文直接写入对应字段,另一种核心语言通过翻译补齐。 | |
| 56 | +4. 如果商品原始语言是 `ru` 这类受支持的非核心语言,则应同时写入原始语言字段和 `zh/en` 翻译结果。 | |
| 57 | +5. 如果某个值为空,不应写入伪造内容;应在上游清洗后决定是否跳过该字段。 | |
| 58 | + | |
| 59 | +### 核心索引语言字段 | |
| 60 | + | |
| 61 | +这类字段的目标是保证所有商品都至少能被中文和英文检索到。无论商品原始语言是什么,都应通过翻译或标准化得到 `zh` 和 `en` 两份结果。 | |
| 62 | + | |
| 63 | +典型字段: | |
| 64 | + | |
| 65 | +- `qanchors` | |
| 66 | +- `enriched_tags` | |
| 67 | +- `option1_values` | |
| 68 | +- `option2_values` | |
| 69 | +- `option3_values` | |
| 70 | +- `enriched_attributes.value` | |
| 71 | +- `specifications.value_text` | |
| 72 | + | |
| 73 | +以 `category_path` 和 `option*_values` 为例,核心语言灌入结果应至少包含: | |
| 74 | + | |
| 75 | +- `category_path.zh` | |
| 76 | +- `category_path.en` | |
| 77 | +- `option1_values.zh` | |
| 78 | +- `option1_values.en` | |
| 79 | +- `option2_values.zh` | |
| 80 | +- `option2_values.en` | |
| 81 | +- `option3_values.zh` | |
| 82 | +- `option3_values.en` | |
| 83 | + | |
| 84 | +示例:原始商品语言为俄语,原始 `option1_values` 为 `красный, синий` | |
| 85 | + | |
| 86 | +```json | |
| 87 | +{ | |
| 88 | + "option1_values": { | |
| 89 | + "zh": "红色, 蓝色", | |
| 90 | + "en": "red, blue" | |
| 91 | + } | |
| 92 | +} | |
| 93 | +``` | |
| 94 | + | |
| 95 | +示例:原始商品语言为俄语,类目路径为 `Одежда > Женская одежда > Куртки` | |
| 96 | + | |
| 97 | +```json | |
| 98 | +{ | |
| 99 | + "category_path": { | |
| 100 | + "zh": "服饰 > 女装 > 夹克", | |
| 101 | + "en": "Apparel > Women's Clothing > Jackets", | |
| 102 | + "ru": "Одежда > Женская одежда > Куртки" | |
| 103 | + } | |
| 104 | +} | |
| 105 | +``` | |
| 106 | + | |
| 107 | +注意:`category_path` 在 mapping 上属于全语言字段,但在灌入规范上依然要求 `zh/en` 必填。 | |
| 108 | + | |
| 109 | +### 全语言字段 | |
| 110 | + | |
| 111 | +这类字段既要保证 `zh/en` 两个核心索引语言可用,也要尽量保留商品原始语言,以便原语种召回和更自然的检索。 | |
| 112 | + | |
| 113 | +典型字段: | |
| 114 | + | |
| 115 | +- `title` | |
| 116 | +- `keywords` | |
| 117 | +- `brief` | |
| 118 | +- `description` | |
| 119 | +- `vendor` | |
| 120 | +- `category_path` | |
| 121 | +- `category_name_text` | |
| 122 | + | |
| 123 | +灌入规则: | |
| 124 | + | |
| 125 | +1. 找到商品原始语言,例如 `ru` | |
| 126 | +2. 原文写入对应语言字段,例如 `title.ru` | |
| 127 | +3. 将原文翻译成 `zh` 和 `en` | |
| 128 | +4. 分别写入 `title.zh` 和 `title.en` | |
| 129 | + | |
| 130 | +示例:原始商品语言为俄语,标题为 `Женская зимняя куртка` | |
| 131 | + | |
| 132 | +```json | |
| 133 | +{ | |
| 134 | + "title": { | |
| 135 | + "zh": "女士冬季夹克", | |
| 136 | + "en": "Women's winter jacket", | |
| 137 | + "ru": "Женская зимняя куртка" | |
| 138 | + } | |
| 139 | +} | |
| 140 | +``` | |
| 141 | + | |
| 142 | +示例:原始商品语言为俄语,类目名称为 `Женские куртки` | |
| 143 | + | |
| 144 | +```json | |
| 145 | +{ | |
| 146 | + "category_name_text": { | |
| 147 | + "zh": "女式夹克", | |
| 148 | + "en": "Women's jackets", | |
| 149 | + "ru": "Женские куртки" | |
| 150 | + } | |
| 151 | +} | |
| 152 | +``` | |
| 153 | + | |
| 154 | +示例:规格值 `specifications.value_text` / `specifications.value_keyword`(注意:`value_text` 属于核心索引语言字段,只写 `zh/en`,不额外写入原始语言字段) | |
| 155 | + | |
| 156 | +```json | |
| 157 | +{ | |
| 158 | + "specifications": [ | |
| 159 | + { | |
| 160 | + "sku_id": "sku-red-s", | |
| 161 | + "name": "color", | |
| 162 | + "value_keyword": "красный", | |
| 163 | + "value_text": { | |
| 164 | + "zh": "红色", | |
| 165 | + "en": "red" | |
| 166 | + } | |
| 167 | + } | |
| 168 | + ] | |
| 169 | +} | |
| 170 | +``` | |
| 171 | + | |
| 172 | +其中: | |
| 173 | + | |
| 174 | +- `specifications.value_keyword` 保存原始规格值,用于精确过滤 / 分面 | |
| 175 | +- `specifications.value_text` 保存 `zh/en` 两个核心索引语言版本,用于检索召回 | |
| 176 | + | |
| 177 | +### 原始语言为中文或英文时 | |
| 178 | + | |
| 179 | +如果原始语言就是核心索引语言之一,不需要额外再写第三份语言字段。 | |
| 180 | + | |
| 181 | +示例:原始语言为中文 | |
| 182 | + | |
| 183 | +```json | |
| 184 | +{ | |
| 185 | + "title": { | |
| 186 | + "zh": "女士冬季夹克", | |
| 187 | + "en": "Women's winter jacket" | |
| 188 | + }, | |
| 189 | + "option1_values": { | |
| 190 | + "zh": "红色, 蓝色", | |
| 191 | + "en": "red, blue" | |
| 192 | + } | |
| 193 | +} | |
| 194 | +``` | |
| 195 | + | |
| 196 | +示例:原始语言为英文 | |
| 197 | + | |
| 198 | +```json | |
| 199 | +{ | |
| 200 | + "title": { | |
| 201 | + "zh": "女士冬季夹克", | |
| 202 | + "en": "Women's winter jacket" | |
| 203 | + }, | |
| 204 | + "vendor": { | |
| 205 | + "zh": "北境服饰", | |
| 206 | + "en": "Northern Apparel" | |
| 207 | + } | |
| 208 | +} | |
| 209 | +``` | |
| 210 | + | |
| 211 | +### 不同字段的灌入方式 | |
| 212 | + | |
| 213 | +可以按下面的方式理解和实现: | |
| 214 | + | |
| 215 | +- 标量字段:直接写固定值,例如 `tenant_id`、`spu_id`、`min_price` | |
| 216 | +- 核心索引语言字段:只生成 `zh/en` | |
| 217 | +- 全语言字段:生成 `zh/en`;原始语言不是 `zh/en` 时,再按原始语言补一个对应语种字段 | |
| 218 | +- 嵌套字段:对每个元素内部重复应用同样规则,例如 `specifications[].value_text`、`enriched_attributes[].value` | |
| 219 | + | |
| 220 | +### 推荐灌入流程 | |
| 221 | + | |
| 222 | +1. 识别商品原始语言 | |
| 223 | +2. 提取原文标题、描述、类目、规格、属性、选项值等字段 | |
| 224 | +3. 生成 `zh` 和 `en` 两份核心索引语言内容 | |
| 225 | +4. 对全语言字段,如果原始语言不是 `zh/en` 且在 mapping 支持的语言列表中,则额外写入原始语言字段 | |
| 226 | +5. 组装最终 ES 文档并写入索引 | |
| 227 | + | |
| 228 | +## 生成 Mapping | |
| 229 | + | |
| 230 | +在仓库根目录执行: | |
| 231 | + | |
| 232 | +```bash | |
| 233 | +source activate.sh | |
| 234 | +python mappings/generate_search_products_mapping.py > mappings/search_products.json | |
| 235 | +``` | |
| 236 | + | |
| 237 | +如果只想查看输出而不覆盖文件: | |
| 238 | + | |
| 239 | +```bash | |
| 240 | +source activate.sh | |
| 241 | +python mappings/generate_search_products_mapping.py | |
| 242 | +``` | |
| 243 | + | |
| 244 | +如果想先生成到临时文件: | |
| 245 | + | |
| 246 | +```bash | |
| 247 | +source activate.sh | |
| 248 | +python mappings/generate_search_products_mapping.py > mappings/search_products.generated.json | |
| 249 | +``` | |
| 250 | + | |
| 251 | +## 校验 Mapping | |
| 252 | + | |
| 253 | +确认当前 `search_products.json` 是否与生成规则完全一致(按文本逐字符比较;不一致时脚本以退出码 1 结束): | |
| 254 | + | |
| 255 | +```bash | |
| 256 | +source activate.sh | |
| 257 | +python mappings/generate_search_products_mapping.py --check mappings/search_products.json | |
| 258 | +``` | |
| 259 | + | |
| 260 | +## 创建索引 | |
| 14 | 261 | |
| 15 | 262 | ```python |
| 16 | 263 | from indexer.mapping_generator import load_mapping, create_index_if_not_exists |
| 17 | 264 | from utils.es_client import ESClient |
| 18 | 265 | |
| 19 | 266 | es_client = ESClient(hosts=["http://localhost:9200"]) |
| 20 | -mapping = load_mapping() # 从mappings/search_products.json加载 | |
| 267 | +mapping = load_mapping() | |
| 21 | 268 | create_index_if_not_exists(es_client, "search_products", mapping) |
| 22 | 269 | ``` |
| 23 | 270 | |
| 24 | -### 修改Mapping | |
| 271 | +## 修改 Mapping | |
| 272 | + | |
| 273 | +推荐流程: | |
| 274 | + | |
| 275 | +1. 修改 `mappings/generate_search_products_mapping.py` | |
| 276 | +2. 重新生成 `mappings/search_products.json` | |
| 277 | +3. 用 `--check` 或 diff 确认变更符合预期 | |
| 278 | +4. 重新创建索引并导入数据 | |
| 25 | 279 | |
| 26 | -直接编辑 `mappings/search_products.json` 文件,然后重新创建索引。 | |
| 280 | +注意:Elasticsearch 不支持直接修改已有字段的 mapping 类型,只能新增字段。如需修改字段类型,需要: | |
| 27 | 281 | |
| 28 | -注意:ES不支持修改已有字段的mapping类型,只能添加新字段。如需修改字段类型,需要: | |
| 29 | 282 | 1. 删除旧索引 |
| 30 | -2. 使用新mapping创建索引 | |
| 283 | +2. 使用新 mapping 创建索引 | |
| 31 | 284 | 3. 重新导入数据 |
| 32 | 285 | |
| 33 | 286 | ## 字段说明 | ... | ... |
| ... | ... | @@ -0,0 +1,355 @@ |
| 1 | +#!/usr/bin/env python3 | |
| 2 | +from __future__ import annotations | |
| 3 | + | |
| 4 | +import argparse | |
| 5 | +import json | |
| 6 | +from pathlib import Path | |
| 7 | +from typing import Any | |
| 8 | + | |
| 9 | +ALL_LANGUAGE_CODES = [ | |
| 10 | + "zh", | |
| 11 | + "en", | |
| 12 | + "ar", | |
| 13 | + "hy", | |
| 14 | + "eu", | |
| 15 | + "pt_br", | |
| 16 | + "bg", | |
| 17 | + "ca", | |
| 18 | + "cjk", | |
| 19 | + "cs", | |
| 20 | + "da", | |
| 21 | + "nl", | |
| 22 | + "fi", | |
| 23 | + "fr", | |
| 24 | + "gl", | |
| 25 | + "de", | |
| 26 | + "el", | |
| 27 | + "hi", | |
| 28 | + "hu", | |
| 29 | + "id", | |
| 30 | + "it", | |
| 31 | + "no", | |
| 32 | + "fa", | |
| 33 | + "pt", | |
| 34 | + "ro", | |
| 35 | + "ru", | |
| 36 | + "es", | |
| 37 | + "sv", | |
| 38 | + "tr", | |
| 39 | + "th", | |
| 40 | +] | |
| 41 | + | |
| 42 | +CORE_INDEX_LANGUAGES = ["zh", "en"] | |
| 43 | + | |
| 44 | +LANGUAGE_GROUPS = { | |
| 45 | + "all": ALL_LANGUAGE_CODES, | |
| 46 | + "core": CORE_INDEX_LANGUAGES, | |
| 47 | +} | |
| 48 | + | |
| 49 | +ANALYZERS = { | |
| 50 | + "zh": "index_ik", | |
| 51 | + "en": "english", | |
| 52 | + "ar": "arabic", | |
| 53 | + "hy": "armenian", | |
| 54 | + "eu": "basque", | |
| 55 | + "pt_br": "brazilian", | |
| 56 | + "bg": "bulgarian", | |
| 57 | + "ca": "catalan", | |
| 58 | + "cjk": "cjk", | |
| 59 | + "cs": "czech", | |
| 60 | + "da": "danish", | |
| 61 | + "nl": "dutch", | |
| 62 | + "fi": "finnish", | |
| 63 | + "fr": "french", | |
| 64 | + "gl": "galician", | |
| 65 | + "de": "german", | |
| 66 | + "el": "greek", | |
| 67 | + "hi": "hindi", | |
| 68 | + "hu": "hungarian", | |
| 69 | + "id": "indonesian", | |
| 70 | + "it": "italian", | |
| 71 | + "no": "norwegian", | |
| 72 | + "fa": "persian", | |
| 73 | + "pt": "portuguese", | |
| 74 | + "ro": "romanian", | |
| 75 | + "ru": "russian", | |
| 76 | + "es": "spanish", | |
| 77 | + "sv": "swedish", | |
| 78 | + "tr": "turkish", | |
| 79 | + "th": "thai", | |
| 80 | +} | |
| 81 | + | |
| 82 | +SETTINGS = { | |
| 83 | + "number_of_shards": 1, | |
| 84 | + "number_of_replicas": 0, | |
| 85 | + "refresh_interval": "30s", | |
| 86 | + "analysis": { | |
| 87 | + "analyzer": { | |
| 88 | + "index_ik": { | |
| 89 | + "type": "custom", | |
| 90 | + "tokenizer": "ik_max_word", | |
| 91 | + "filter": ["lowercase", "asciifolding"], | |
| 92 | + }, | |
| 93 | + "query_ik": { | |
| 94 | + "type": "custom", | |
| 95 | + "tokenizer": "ik_smart", | |
| 96 | + "filter": ["lowercase", "asciifolding"], | |
| 97 | + }, | |
| 98 | + }, | |
| 99 | + "normalizer": { | |
| 100 | + "lowercase": { | |
| 101 | + "type": "custom", | |
| 102 | + "filter": ["lowercase"], | |
| 103 | + } | |
| 104 | + }, | |
| 105 | + }, | |
| 106 | + "similarity": { | |
| 107 | + "default": { | |
| 108 | + "type": "BM25", | |
| 109 | + "b": 0.0, | |
| 110 | + "k1": 0.0, | |
| 111 | + } | |
| 112 | + }, | |
| 113 | +} | |
| 114 | + | |
| 115 | +TEXT_FIELD_TEMPLATES = { | |
| 116 | + "all_language_text": { | |
| 117 | + "language_group": "all", | |
| 118 | + "with_keyword": False, | |
| 119 | + }, | |
| 120 | + "all_language_text_with_keyword": { | |
| 121 | + "language_group": "all", | |
| 122 | + "with_keyword": True, | |
| 123 | + }, | |
| 124 | + "core_language_text": { | |
| 125 | + "language_group": "core", | |
| 126 | + "with_keyword": False, | |
| 127 | + }, | |
| 128 | + "core_language_text_with_keyword": { | |
| 129 | + "language_group": "core", | |
| 130 | + "with_keyword": True, | |
| 131 | + }, | |
| 132 | +} | |
| 133 | + | |
| 134 | + | |
| 135 | +def scalar_field(name: str, field_type: str, **extra: Any) -> dict[str, Any]: | |
| 136 | + spec = { | |
| 137 | + "name": name, | |
| 138 | + "kind": "scalar", | |
| 139 | + "type": field_type, | |
| 140 | + } | |
| 141 | + if extra: | |
| 142 | + spec["extra"] = extra | |
| 143 | + return spec | |
| 144 | + | |
| 145 | + | |
| 146 | +def text_field(name: str, template: str) -> dict[str, Any]: | |
| 147 | + return { | |
| 148 | + "name": name, | |
| 149 | + "kind": "text", | |
| 150 | + "template": template, | |
| 151 | + } | |
| 152 | + | |
| 153 | + | |
| 154 | +def nested_field(name: str, *fields: dict[str, Any]) -> dict[str, Any]: | |
| 155 | + return { | |
| 156 | + "name": name, | |
| 157 | + "kind": "nested", | |
| 158 | + "fields": list(fields), | |
| 159 | + } | |
| 160 | + | |
| 161 | +TEXT_EMBEDDING_SIZE = 1024 | |
| 162 | +IMAGE_EMBEDDING_SIZE = 768 | |
| 163 | + | |
| 164 | +FIELD_SPECS = [ | |
| 165 | + scalar_field("spu_id", "keyword"), | |
| 166 | + scalar_field("create_time", "date"), | |
| 167 | + scalar_field("update_time", "date"), | |
| 168 | + text_field("title", "all_language_text"), | |
| 169 | + text_field("keywords", "all_language_text_with_keyword"), | |
| 170 | + text_field("brief", "all_language_text"), | |
| 171 | + text_field("description", "all_language_text"), | |
| 172 | + text_field("vendor", "all_language_text_with_keyword"), | |
| 173 | + scalar_field("image_url", "keyword", index=False), | |
| 174 | + scalar_field( | |
| 175 | + "title_embedding", | |
| 176 | + "dense_vector", | |
| 177 | + dims=TEXT_EMBEDDING_SIZE, | |
| 178 | + index=True, | |
| 179 | + similarity="dot_product", | |
| 180 | + element_type="bfloat16", | |
| 181 | + ), | |
| 182 | + nested_field( | |
| 183 | + "image_embedding", | |
| 184 | + scalar_field( | |
| 185 | + "vector", | |
| 186 | + "dense_vector", | |
| 187 | + dims=IMAGE_EMBEDDING_SIZE, | |
| 188 | + index=True, | |
| 189 | + similarity="dot_product", | |
| 190 | + element_type="bfloat16", | |
| 191 | + ), | |
| 192 | + scalar_field("url", "text"), | |
| 193 | + ), | |
| 194 | + text_field("category_path", "all_language_text_with_keyword"), | |
| 195 | + text_field("category_name_text", "all_language_text_with_keyword"), | |
| 196 | + text_field("tags", "all_language_text_with_keyword"), | |
| 197 | + scalar_field("category_id", "keyword"), | |
| 198 | + scalar_field("category_name", "keyword"), | |
| 199 | + scalar_field("category_level", "integer"), | |
| 200 | + scalar_field("category1_name", "keyword"), | |
| 201 | + scalar_field("category2_name", "keyword"), | |
| 202 | + scalar_field("category3_name", "keyword"), | |
| 203 | + nested_field( | |
| 204 | + "specifications", | |
| 205 | + scalar_field("sku_id", "keyword"), | |
| 206 | + scalar_field("name", "keyword"), | |
| 207 | + scalar_field("value_keyword", "keyword"), | |
| 208 | + text_field("value_text", "core_language_text_with_keyword"), | |
| 209 | + ), | |
| 210 | + text_field("qanchors", "core_language_text"), | |
| 211 | + text_field("enriched_tags", "core_language_text_with_keyword"), | |
| 212 | + nested_field( | |
| 213 | + "enriched_attributes", | |
| 214 | + scalar_field("name", "keyword"), | |
| 215 | + text_field("value", "core_language_text_with_keyword"), | |
| 216 | + ), | |
| 217 | + scalar_field("option1_name", "keyword"), | |
| 218 | + scalar_field("option2_name", "keyword"), | |
| 219 | + scalar_field("option3_name", "keyword"), | |
| 220 | + text_field("option1_values", "core_language_text_with_keyword"), | |
| 221 | + text_field("option2_values", "core_language_text_with_keyword"), | |
| 222 | + text_field("option3_values", "core_language_text_with_keyword"), | |
| 223 | + scalar_field("min_price", "float"), | |
| 224 | + scalar_field("max_price", "float"), | |
| 225 | + scalar_field("compare_at_price", "float"), | |
| 226 | + scalar_field("sku_prices", "float"), | |
| 227 | + scalar_field("sku_weights", "long"), | |
| 228 | + scalar_field("sku_weight_units", "keyword"), | |
| 229 | + scalar_field("total_inventory", "long"), | |
| 230 | + scalar_field("sales", "long"), | |
| 231 | + nested_field( | |
| 232 | + "skus", | |
| 233 | + scalar_field("sku_id", "keyword"), | |
| 234 | + scalar_field("price", "float"), | |
| 235 | + scalar_field("compare_at_price", "float"), | |
| 236 | + scalar_field("sku_code", "keyword"), | |
| 237 | + scalar_field("stock", "long"), | |
| 238 | + scalar_field("weight", "float"), | |
| 239 | + scalar_field("weight_unit", "keyword"), | |
| 240 | + scalar_field("option1_value", "keyword"), | |
| 241 | + scalar_field("option2_value", "keyword"), | |
| 242 | + scalar_field("option3_value", "keyword"), | |
| 243 | + scalar_field("image_src", "keyword", index=False), | |
| 244 | + ), | |
| 245 | +] | |
| 246 | + | |
| 247 | + | |
| 248 | +def build_keyword_fields() -> dict[str, Any]: | |
| 249 | + return { | |
| 250 | + "keyword": { | |
| 251 | + "type": "keyword", | |
| 252 | + "normalizer": "lowercase", | |
| 253 | + } | |
| 254 | + } | |
| 255 | + | |
| 256 | + | |
| 257 | +def build_text_field(language: str, *, add_keyword: bool) -> dict[str, Any]: | |
| 258 | + field = { | |
| 259 | + "type": "text", | |
| 260 | + "analyzer": ANALYZERS[language], | |
| 261 | + } | |
| 262 | + if language == "zh": | |
| 263 | + field["search_analyzer"] = "query_ik" | |
| 264 | + if add_keyword: | |
| 265 | + field["fields"] = build_keyword_fields() | |
| 266 | + return field | |
| 267 | + | |
| 268 | + | |
| 269 | +def render_field(spec: dict[str, Any]) -> dict[str, Any]: | |
| 270 | + kind = spec["kind"] | |
| 271 | + | |
| 272 | + if kind == "scalar": | |
| 273 | + rendered = {"type": spec["type"]} | |
| 274 | + rendered.update(spec.get("extra", {})) | |
| 275 | + return rendered | |
| 276 | + | |
| 277 | + if kind == "text": | |
| 278 | + template = TEXT_FIELD_TEMPLATES[spec["template"]] | |
| 279 | + languages = LANGUAGE_GROUPS[template["language_group"]] | |
| 280 | + properties = {} | |
| 281 | + for language in languages: | |
| 282 | + properties[language] = build_text_field( | |
| 283 | + language, | |
| 284 | + add_keyword=template["with_keyword"], | |
| 285 | + ) | |
| 286 | + return { | |
| 287 | + "type": "object", | |
| 288 | + "properties": properties, | |
| 289 | + } | |
| 290 | + | |
| 291 | + if kind == "nested": | |
| 292 | + properties = {} | |
| 293 | + for child in spec["fields"]: | |
| 294 | + properties[child["name"]] = render_field(child) | |
| 295 | + return { | |
| 296 | + "type": "nested", | |
| 297 | + "properties": properties, | |
| 298 | + } | |
| 299 | + | |
| 300 | + raise ValueError(f"Unknown field kind: {kind}") | |
| 301 | + | |
| 302 | + | |
| 303 | +def build_mapping() -> dict[str, Any]: | |
| 304 | + properties = {} | |
| 305 | + for spec in FIELD_SPECS: | |
| 306 | + properties[spec["name"]] = render_field(spec) | |
| 307 | + | |
| 308 | + return { | |
| 309 | + "settings": SETTINGS, | |
| 310 | + "mappings": { | |
| 311 | + "properties": properties, | |
| 312 | + }, | |
| 313 | + } | |
| 314 | + | |
| 315 | + | |
| 316 | +def render_mapping() -> str: | |
| 317 | + return json.dumps(build_mapping(), indent=2, ensure_ascii=False) | |
| 318 | + | |
| 319 | + | |
| 320 | +def main() -> int: | |
| 321 | + parser = argparse.ArgumentParser( | |
| 322 | + description="Generate mappings/search_products.json from a compact Python spec.", | |
| 323 | + ) | |
| 324 | + parser.add_argument( | |
| 325 | + "-o", | |
| 326 | + "--output", | |
| 327 | + type=Path, | |
| 328 | + help="Write the generated mapping to this file. Defaults to stdout.", | |
| 329 | + ) | |
| 330 | + parser.add_argument( | |
| 331 | + "--check", | |
| 332 | + type=Path, | |
| 333 | + help="Fail if the generated output does not exactly match this file.", | |
| 334 | + ) | |
| 335 | + args = parser.parse_args() | |
| 336 | + | |
| 337 | + rendered = render_mapping() | |
| 338 | + | |
| 339 | + if args.check is not None: | |
| 340 | + existing = args.check.read_text(encoding="utf-8") | |
| 341 | + if existing != rendered: | |
| 342 | + print(f"Generated mapping does not match {args.check}") | |
| 343 | + return 1 | |
| 344 | + print(f"Generated mapping matches {args.check}") | |
| 345 | + | |
| 346 | + if args.output is not None: | |
| 347 | + args.output.write_text(rendered, encoding="utf-8") | |
| 348 | + elif args.check is None: | |
| 349 | + print(rendered, end="") | |
| 350 | + | |
| 351 | + return 0 | |
| 352 | + | |
| 353 | + | |
| 354 | +if __name__ == "__main__": | |
| 355 | + raise SystemExit(main()) | ... | ... |
mappings/search_products.json
| ... | ... | @@ -41,9 +41,6 @@ |
| 41 | 41 | }, |
| 42 | 42 | "mappings": { |
| 43 | 43 | "properties": { |
| 44 | - "tenant_id": { | |
| 45 | - "type": "keyword" | |
| 46 | - }, | |
| 47 | 44 | "spu_id": { |
| 48 | 45 | "type": "keyword" |
| 49 | 46 | }, |
| ... | ... | @@ -179,133 +176,313 @@ |
| 179 | 176 | } |
| 180 | 177 | } |
| 181 | 178 | }, |
| 182 | - "qanchors": { | |
| 179 | + "keywords": { | |
| 183 | 180 | "type": "object", |
| 184 | 181 | "properties": { |
| 185 | 182 | "zh": { |
| 186 | 183 | "type": "text", |
| 187 | 184 | "analyzer": "index_ik", |
| 188 | - "search_analyzer": "query_ik" | |
| 185 | + "search_analyzer": "query_ik", | |
| 186 | + "fields": { | |
| 187 | + "keyword": { | |
| 188 | + "type": "keyword", | |
| 189 | + "normalizer": "lowercase" | |
| 190 | + } | |
| 191 | + } | |
| 189 | 192 | }, |
| 190 | 193 | "en": { |
| 191 | 194 | "type": "text", |
| 192 | - "analyzer": "english" | |
| 195 | + "analyzer": "english", | |
| 196 | + "fields": { | |
| 197 | + "keyword": { | |
| 198 | + "type": "keyword", | |
| 199 | + "normalizer": "lowercase" | |
| 200 | + } | |
| 201 | + } | |
| 193 | 202 | }, |
| 194 | 203 | "ar": { |
| 195 | 204 | "type": "text", |
| 196 | - "analyzer": "arabic" | |
| 205 | + "analyzer": "arabic", | |
| 206 | + "fields": { | |
| 207 | + "keyword": { | |
| 208 | + "type": "keyword", | |
| 209 | + "normalizer": "lowercase" | |
| 210 | + } | |
| 211 | + } | |
| 197 | 212 | }, |
| 198 | 213 | "hy": { |
| 199 | 214 | "type": "text", |
| 200 | - "analyzer": "armenian" | |
| 215 | + "analyzer": "armenian", | |
| 216 | + "fields": { | |
| 217 | + "keyword": { | |
| 218 | + "type": "keyword", | |
| 219 | + "normalizer": "lowercase" | |
| 220 | + } | |
| 221 | + } | |
| 201 | 222 | }, |
| 202 | 223 | "eu": { |
| 203 | 224 | "type": "text", |
| 204 | - "analyzer": "basque" | |
| 225 | + "analyzer": "basque", | |
| 226 | + "fields": { | |
| 227 | + "keyword": { | |
| 228 | + "type": "keyword", | |
| 229 | + "normalizer": "lowercase" | |
| 230 | + } | |
| 231 | + } | |
| 205 | 232 | }, |
| 206 | 233 | "pt_br": { |
| 207 | 234 | "type": "text", |
| 208 | - "analyzer": "brazilian" | |
| 235 | + "analyzer": "brazilian", | |
| 236 | + "fields": { | |
| 237 | + "keyword": { | |
| 238 | + "type": "keyword", | |
| 239 | + "normalizer": "lowercase" | |
| 240 | + } | |
| 241 | + } | |
| 209 | 242 | }, |
| 210 | 243 | "bg": { |
| 211 | 244 | "type": "text", |
| 212 | - "analyzer": "bulgarian" | |
| 245 | + "analyzer": "bulgarian", | |
| 246 | + "fields": { | |
| 247 | + "keyword": { | |
| 248 | + "type": "keyword", | |
| 249 | + "normalizer": "lowercase" | |
| 250 | + } | |
| 251 | + } | |
| 213 | 252 | }, |
| 214 | 253 | "ca": { |
| 215 | 254 | "type": "text", |
| 216 | - "analyzer": "catalan" | |
| 255 | + "analyzer": "catalan", | |
| 256 | + "fields": { | |
| 257 | + "keyword": { | |
| 258 | + "type": "keyword", | |
| 259 | + "normalizer": "lowercase" | |
| 260 | + } | |
| 261 | + } | |
| 217 | 262 | }, |
| 218 | 263 | "cjk": { |
| 219 | 264 | "type": "text", |
| 220 | - "analyzer": "cjk" | |
| 265 | + "analyzer": "cjk", | |
| 266 | + "fields": { | |
| 267 | + "keyword": { | |
| 268 | + "type": "keyword", | |
| 269 | + "normalizer": "lowercase" | |
| 270 | + } | |
| 271 | + } | |
| 221 | 272 | }, |
| 222 | 273 | "cs": { |
| 223 | 274 | "type": "text", |
| 224 | - "analyzer": "czech" | |
| 275 | + "analyzer": "czech", | |
| 276 | + "fields": { | |
| 277 | + "keyword": { | |
| 278 | + "type": "keyword", | |
| 279 | + "normalizer": "lowercase" | |
| 280 | + } | |
| 281 | + } | |
| 225 | 282 | }, |
| 226 | 283 | "da": { |
| 227 | 284 | "type": "text", |
| 228 | - "analyzer": "danish" | |
| 285 | + "analyzer": "danish", | |
| 286 | + "fields": { | |
| 287 | + "keyword": { | |
| 288 | + "type": "keyword", | |
| 289 | + "normalizer": "lowercase" | |
| 290 | + } | |
| 291 | + } | |
| 229 | 292 | }, |
| 230 | 293 | "nl": { |
| 231 | 294 | "type": "text", |
| 232 | - "analyzer": "dutch" | |
| 295 | + "analyzer": "dutch", | |
| 296 | + "fields": { | |
| 297 | + "keyword": { | |
| 298 | + "type": "keyword", | |
| 299 | + "normalizer": "lowercase" | |
| 300 | + } | |
| 301 | + } | |
| 233 | 302 | }, |
| 234 | 303 | "fi": { |
| 235 | 304 | "type": "text", |
| 236 | - "analyzer": "finnish" | |
| 305 | + "analyzer": "finnish", | |
| 306 | + "fields": { | |
| 307 | + "keyword": { | |
| 308 | + "type": "keyword", | |
| 309 | + "normalizer": "lowercase" | |
| 310 | + } | |
| 311 | + } | |
| 237 | 312 | }, |
| 238 | 313 | "fr": { |
| 239 | 314 | "type": "text", |
| 240 | - "analyzer": "french" | |
| 315 | + "analyzer": "french", | |
| 316 | + "fields": { | |
| 317 | + "keyword": { | |
| 318 | + "type": "keyword", | |
| 319 | + "normalizer": "lowercase" | |
| 320 | + } | |
| 321 | + } | |
| 241 | 322 | }, |
| 242 | 323 | "gl": { |
| 243 | 324 | "type": "text", |
| 244 | - "analyzer": "galician" | |
| 325 | + "analyzer": "galician", | |
| 326 | + "fields": { | |
| 327 | + "keyword": { | |
| 328 | + "type": "keyword", | |
| 329 | + "normalizer": "lowercase" | |
| 330 | + } | |
| 331 | + } | |
| 245 | 332 | }, |
| 246 | 333 | "de": { |
| 247 | 334 | "type": "text", |
| 248 | - "analyzer": "german" | |
| 335 | + "analyzer": "german", | |
| 336 | + "fields": { | |
| 337 | + "keyword": { | |
| 338 | + "type": "keyword", | |
| 339 | + "normalizer": "lowercase" | |
| 340 | + } | |
| 341 | + } | |
| 249 | 342 | }, |
| 250 | 343 | "el": { |
| 251 | 344 | "type": "text", |
| 252 | - "analyzer": "greek" | |
| 345 | + "analyzer": "greek", | |
| 346 | + "fields": { | |
| 347 | + "keyword": { | |
| 348 | + "type": "keyword", | |
| 349 | + "normalizer": "lowercase" | |
| 350 | + } | |
| 351 | + } | |
| 253 | 352 | }, |
| 254 | 353 | "hi": { |
| 255 | 354 | "type": "text", |
| 256 | - "analyzer": "hindi" | |
| 355 | + "analyzer": "hindi", | |
| 356 | + "fields": { | |
| 357 | + "keyword": { | |
| 358 | + "type": "keyword", | |
| 359 | + "normalizer": "lowercase" | |
| 360 | + } | |
| 361 | + } | |
| 257 | 362 | }, |
| 258 | 363 | "hu": { |
| 259 | 364 | "type": "text", |
| 260 | - "analyzer": "hungarian" | |
| 365 | + "analyzer": "hungarian", | |
| 366 | + "fields": { | |
| 367 | + "keyword": { | |
| 368 | + "type": "keyword", | |
| 369 | + "normalizer": "lowercase" | |
| 370 | + } | |
| 371 | + } | |
| 261 | 372 | }, |
| 262 | 373 | "id": { |
| 263 | 374 | "type": "text", |
| 264 | - "analyzer": "indonesian" | |
| 375 | + "analyzer": "indonesian", | |
| 376 | + "fields": { | |
| 377 | + "keyword": { | |
| 378 | + "type": "keyword", | |
| 379 | + "normalizer": "lowercase" | |
| 380 | + } | |
| 381 | + } | |
| 265 | 382 | }, |
| 266 | 383 | "it": { |
| 267 | 384 | "type": "text", |
| 268 | - "analyzer": "italian" | |
| 385 | + "analyzer": "italian", | |
| 386 | + "fields": { | |
| 387 | + "keyword": { | |
| 388 | + "type": "keyword", | |
| 389 | + "normalizer": "lowercase" | |
| 390 | + } | |
| 391 | + } | |
| 269 | 392 | }, |
| 270 | 393 | "no": { |
| 271 | 394 | "type": "text", |
| 272 | - "analyzer": "norwegian" | |
| 395 | + "analyzer": "norwegian", | |
| 396 | + "fields": { | |
| 397 | + "keyword": { | |
| 398 | + "type": "keyword", | |
| 399 | + "normalizer": "lowercase" | |
| 400 | + } | |
| 401 | + } | |
| 273 | 402 | }, |
| 274 | 403 | "fa": { |
| 275 | 404 | "type": "text", |
| 276 | - "analyzer": "persian" | |
| 405 | + "analyzer": "persian", | |
| 406 | + "fields": { | |
| 407 | + "keyword": { | |
| 408 | + "type": "keyword", | |
| 409 | + "normalizer": "lowercase" | |
| 410 | + } | |
| 411 | + } | |
| 277 | 412 | }, |
| 278 | 413 | "pt": { |
| 279 | 414 | "type": "text", |
| 280 | - "analyzer": "portuguese" | |
| 415 | + "analyzer": "portuguese", | |
| 416 | + "fields": { | |
| 417 | + "keyword": { | |
| 418 | + "type": "keyword", | |
| 419 | + "normalizer": "lowercase" | |
| 420 | + } | |
| 421 | + } | |
| 281 | 422 | }, |
| 282 | 423 | "ro": { |
| 283 | 424 | "type": "text", |
| 284 | - "analyzer": "romanian" | |
| 425 | + "analyzer": "romanian", | |
| 426 | + "fields": { | |
| 427 | + "keyword": { | |
| 428 | + "type": "keyword", | |
| 429 | + "normalizer": "lowercase" | |
| 430 | + } | |
| 431 | + } | |
| 285 | 432 | }, |
| 286 | 433 | "ru": { |
| 287 | 434 | "type": "text", |
| 288 | - "analyzer": "russian" | |
| 435 | + "analyzer": "russian", | |
| 436 | + "fields": { | |
| 437 | + "keyword": { | |
| 438 | + "type": "keyword", | |
| 439 | + "normalizer": "lowercase" | |
| 440 | + } | |
| 441 | + } | |
| 289 | 442 | }, |
| 290 | 443 | "es": { |
| 291 | 444 | "type": "text", |
| 292 | - "analyzer": "spanish" | |
| 445 | + "analyzer": "spanish", | |
| 446 | + "fields": { | |
| 447 | + "keyword": { | |
| 448 | + "type": "keyword", | |
| 449 | + "normalizer": "lowercase" | |
| 450 | + } | |
| 451 | + } | |
| 293 | 452 | }, |
| 294 | 453 | "sv": { |
| 295 | 454 | "type": "text", |
| 296 | - "analyzer": "swedish" | |
| 455 | + "analyzer": "swedish", | |
| 456 | + "fields": { | |
| 457 | + "keyword": { | |
| 458 | + "type": "keyword", | |
| 459 | + "normalizer": "lowercase" | |
| 460 | + } | |
| 461 | + } | |
| 297 | 462 | }, |
| 298 | 463 | "tr": { |
| 299 | 464 | "type": "text", |
| 300 | - "analyzer": "turkish" | |
| 465 | + "analyzer": "turkish", | |
| 466 | + "fields": { | |
| 467 | + "keyword": { | |
| 468 | + "type": "keyword", | |
| 469 | + "normalizer": "lowercase" | |
| 470 | + } | |
| 471 | + } | |
| 301 | 472 | }, |
| 302 | 473 | "th": { |
| 303 | 474 | "type": "text", |
| 304 | - "analyzer": "thai" | |
| 475 | + "analyzer": "thai", | |
| 476 | + "fields": { | |
| 477 | + "keyword": { | |
| 478 | + "type": "keyword", | |
| 479 | + "normalizer": "lowercase" | |
| 480 | + } | |
| 481 | + } | |
| 305 | 482 | } |
| 306 | 483 | } |
| 307 | 484 | }, |
| 308 | - "keywords": { | |
| 485 | + "brief": { | |
| 309 | 486 | "type": "object", |
| 310 | 487 | "properties": { |
| 311 | 488 | "zh": { |
| ... | ... | @@ -431,7 +608,7 @@ |
| 431 | 608 | } |
| 432 | 609 | } |
| 433 | 610 | }, |
| 434 | - "brief": { | |
| 611 | + "description": { | |
| 435 | 612 | "type": "object", |
| 436 | 613 | "properties": { |
| 437 | 614 | "zh": { |
| ... | ... | @@ -557,139 +734,19 @@ |
| 557 | 734 | } |
| 558 | 735 | } |
| 559 | 736 | }, |
| 560 | - "description": { | |
| 737 | + "vendor": { | |
| 561 | 738 | "type": "object", |
| 562 | 739 | "properties": { |
| 563 | 740 | "zh": { |
| 564 | 741 | "type": "text", |
| 565 | 742 | "analyzer": "index_ik", |
| 566 | - "search_analyzer": "query_ik" | |
| 567 | - }, | |
| 568 | - "en": { | |
| 569 | - "type": "text", | |
| 570 | - "analyzer": "english" | |
| 571 | - }, | |
| 572 | - "ar": { | |
| 573 | - "type": "text", | |
| 574 | - "analyzer": "arabic" | |
| 575 | - }, | |
| 576 | - "hy": { | |
| 577 | - "type": "text", | |
| 578 | - "analyzer": "armenian" | |
| 579 | - }, | |
| 580 | - "eu": { | |
| 581 | - "type": "text", | |
| 582 | - "analyzer": "basque" | |
| 583 | - }, | |
| 584 | - "pt_br": { | |
| 585 | - "type": "text", | |
| 586 | - "analyzer": "brazilian" | |
| 587 | - }, | |
| 588 | - "bg": { | |
| 589 | - "type": "text", | |
| 590 | - "analyzer": "bulgarian" | |
| 591 | - }, | |
| 592 | - "ca": { | |
| 593 | - "type": "text", | |
| 594 | - "analyzer": "catalan" | |
| 595 | - }, | |
| 596 | - "cjk": { | |
| 597 | - "type": "text", | |
| 598 | - "analyzer": "cjk" | |
| 599 | - }, | |
| 600 | - "cs": { | |
| 601 | - "type": "text", | |
| 602 | - "analyzer": "czech" | |
| 603 | - }, | |
| 604 | - "da": { | |
| 605 | - "type": "text", | |
| 606 | - "analyzer": "danish" | |
| 607 | - }, | |
| 608 | - "nl": { | |
| 609 | - "type": "text", | |
| 610 | - "analyzer": "dutch" | |
| 611 | - }, | |
| 612 | - "fi": { | |
| 613 | - "type": "text", | |
| 614 | - "analyzer": "finnish" | |
| 615 | - }, | |
| 616 | - "fr": { | |
| 617 | - "type": "text", | |
| 618 | - "analyzer": "french" | |
| 619 | - }, | |
| 620 | - "gl": { | |
| 621 | - "type": "text", | |
| 622 | - "analyzer": "galician" | |
| 623 | - }, | |
| 624 | - "de": { | |
| 625 | - "type": "text", | |
| 626 | - "analyzer": "german" | |
| 627 | - }, | |
| 628 | - "el": { | |
| 629 | - "type": "text", | |
| 630 | - "analyzer": "greek" | |
| 631 | - }, | |
| 632 | - "hi": { | |
| 633 | - "type": "text", | |
| 634 | - "analyzer": "hindi" | |
| 635 | - }, | |
| 636 | - "hu": { | |
| 637 | - "type": "text", | |
| 638 | - "analyzer": "hungarian" | |
| 639 | - }, | |
| 640 | - "id": { | |
| 641 | - "type": "text", | |
| 642 | - "analyzer": "indonesian" | |
| 643 | - }, | |
| 644 | - "it": { | |
| 645 | - "type": "text", | |
| 646 | - "analyzer": "italian" | |
| 647 | - }, | |
| 648 | - "no": { | |
| 649 | - "type": "text", | |
| 650 | - "analyzer": "norwegian" | |
| 651 | - }, | |
| 652 | - "fa": { | |
| 653 | - "type": "text", | |
| 654 | - "analyzer": "persian" | |
| 655 | - }, | |
| 656 | - "pt": { | |
| 657 | - "type": "text", | |
| 658 | - "analyzer": "portuguese" | |
| 659 | - }, | |
| 660 | - "ro": { | |
| 661 | - "type": "text", | |
| 662 | - "analyzer": "romanian" | |
| 663 | - }, | |
| 664 | - "ru": { | |
| 665 | - "type": "text", | |
| 666 | - "analyzer": "russian" | |
| 667 | - }, | |
| 668 | - "es": { | |
| 669 | - "type": "text", | |
| 670 | - "analyzer": "spanish" | |
| 671 | - }, | |
| 672 | - "sv": { | |
| 673 | - "type": "text", | |
| 674 | - "analyzer": "swedish" | |
| 675 | - }, | |
| 676 | - "tr": { | |
| 677 | - "type": "text", | |
| 678 | - "analyzer": "turkish" | |
| 679 | - }, | |
| 680 | - "th": { | |
| 681 | - "type": "text", | |
| 682 | - "analyzer": "thai" | |
| 683 | - } | |
| 684 | - } | |
| 685 | - }, | |
| 686 | - "vendor": { | |
| 687 | - "type": "object", | |
| 688 | - "properties": { | |
| 689 | - "zh": { | |
| 690 | - "type": "text", | |
| 691 | - "analyzer": "index_ik", | |
| 692 | - "search_analyzer": "query_ik" | |
| 743 | + "search_analyzer": "query_ik", | |
| 744 | + "fields": { | |
| 745 | + "keyword": { | |
| 746 | + "type": "keyword", | |
| 747 | + "normalizer": "lowercase" | |
| 748 | + } | |
| 749 | + } | |
| 693 | 750 | }, |
| 694 | 751 | "en": { |
| 695 | 752 | "type": "text", |
| ... | ... | @@ -983,9 +1040,6 @@ |
| 983 | 1040 | } |
| 984 | 1041 | } |
| 985 | 1042 | }, |
| 986 | - "tags": { | |
| 987 | - "type": "keyword" | |
| 988 | - }, | |
| 989 | 1043 | "image_url": { |
| 990 | 1044 | "type": "keyword", |
| 991 | 1045 | "index": false |
| ... | ... | @@ -1012,144 +1066,309 @@ |
| 1012 | 1066 | } |
| 1013 | 1067 | } |
| 1014 | 1068 | }, |
| 1015 | - "image_embedding_512": { | |
| 1016 | - "type": "nested", | |
| 1017 | - "properties": { | |
| 1018 | - "vector": { | |
| 1019 | - "type": "dense_vector", | |
| 1020 | - "dims": 512, | |
| 1021 | - "index": true, | |
| 1022 | - "similarity": "dot_product", | |
| 1023 | - "element_type": "bfloat16" | |
| 1024 | - }, | |
| 1025 | - "url": { | |
| 1026 | - "type": "text" | |
| 1027 | - } | |
| 1028 | - } | |
| 1029 | - }, | |
| 1030 | 1069 | "category_path": { |
| 1031 | 1070 | "type": "object", |
| 1032 | 1071 | "properties": { |
| 1033 | 1072 | "zh": { |
| 1034 | 1073 | "type": "text", |
| 1035 | 1074 | "analyzer": "index_ik", |
| 1036 | - "search_analyzer": "query_ik" | |
| 1075 | + "search_analyzer": "query_ik", | |
| 1076 | + "fields": { | |
| 1077 | + "keyword": { | |
| 1078 | + "type": "keyword", | |
| 1079 | + "normalizer": "lowercase" | |
| 1080 | + } | |
| 1081 | + } | |
| 1037 | 1082 | }, |
| 1038 | 1083 | "en": { |
| 1039 | 1084 | "type": "text", |
| 1040 | - "analyzer": "english" | |
| 1085 | + "analyzer": "english", | |
| 1086 | + "fields": { | |
| 1087 | + "keyword": { | |
| 1088 | + "type": "keyword", | |
| 1089 | + "normalizer": "lowercase" | |
| 1090 | + } | |
| 1091 | + } | |
| 1041 | 1092 | }, |
| 1042 | 1093 | "ar": { |
| 1043 | 1094 | "type": "text", |
| 1044 | - "analyzer": "arabic" | |
| 1095 | + "analyzer": "arabic", | |
| 1096 | + "fields": { | |
| 1097 | + "keyword": { | |
| 1098 | + "type": "keyword", | |
| 1099 | + "normalizer": "lowercase" | |
| 1100 | + } | |
| 1101 | + } | |
| 1045 | 1102 | }, |
| 1046 | 1103 | "hy": { |
| 1047 | 1104 | "type": "text", |
| 1048 | - "analyzer": "armenian" | |
| 1105 | + "analyzer": "armenian", | |
| 1106 | + "fields": { | |
| 1107 | + "keyword": { | |
| 1108 | + "type": "keyword", | |
| 1109 | + "normalizer": "lowercase" | |
| 1110 | + } | |
| 1111 | + } | |
| 1049 | 1112 | }, |
| 1050 | 1113 | "eu": { |
| 1051 | 1114 | "type": "text", |
| 1052 | - "analyzer": "basque" | |
| 1115 | + "analyzer": "basque", | |
| 1116 | + "fields": { | |
| 1117 | + "keyword": { | |
| 1118 | + "type": "keyword", | |
| 1119 | + "normalizer": "lowercase" | |
| 1120 | + } | |
| 1121 | + } | |
| 1053 | 1122 | }, |
| 1054 | 1123 | "pt_br": { |
| 1055 | 1124 | "type": "text", |
| 1056 | - "analyzer": "brazilian" | |
| 1125 | + "analyzer": "brazilian", | |
| 1126 | + "fields": { | |
| 1127 | + "keyword": { | |
| 1128 | + "type": "keyword", | |
| 1129 | + "normalizer": "lowercase" | |
| 1130 | + } | |
| 1131 | + } | |
| 1057 | 1132 | }, |
| 1058 | 1133 | "bg": { |
| 1059 | 1134 | "type": "text", |
| 1060 | - "analyzer": "bulgarian" | |
| 1135 | + "analyzer": "bulgarian", | |
| 1136 | + "fields": { | |
| 1137 | + "keyword": { | |
| 1138 | + "type": "keyword", | |
| 1139 | + "normalizer": "lowercase" | |
| 1140 | + } | |
| 1141 | + } | |
| 1061 | 1142 | }, |
| 1062 | 1143 | "ca": { |
| 1063 | 1144 | "type": "text", |
| 1064 | - "analyzer": "catalan" | |
| 1145 | + "analyzer": "catalan", | |
| 1146 | + "fields": { | |
| 1147 | + "keyword": { | |
| 1148 | + "type": "keyword", | |
| 1149 | + "normalizer": "lowercase" | |
| 1150 | + } | |
| 1151 | + } | |
| 1065 | 1152 | }, |
| 1066 | 1153 | "cjk": { |
| 1067 | 1154 | "type": "text", |
| 1068 | - "analyzer": "cjk" | |
| 1155 | + "analyzer": "cjk", | |
| 1156 | + "fields": { | |
| 1157 | + "keyword": { | |
| 1158 | + "type": "keyword", | |
| 1159 | + "normalizer": "lowercase" | |
| 1160 | + } | |
| 1161 | + } | |
| 1069 | 1162 | }, |
| 1070 | 1163 | "cs": { |
| 1071 | 1164 | "type": "text", |
| 1072 | - "analyzer": "czech" | |
| 1165 | + "analyzer": "czech", | |
| 1166 | + "fields": { | |
| 1167 | + "keyword": { | |
| 1168 | + "type": "keyword", | |
| 1169 | + "normalizer": "lowercase" | |
| 1170 | + } | |
| 1171 | + } | |
| 1073 | 1172 | }, |
| 1074 | 1173 | "da": { |
| 1075 | 1174 | "type": "text", |
| 1076 | - "analyzer": "danish" | |
| 1175 | + "analyzer": "danish", | |
| 1176 | + "fields": { | |
| 1177 | + "keyword": { | |
| 1178 | + "type": "keyword", | |
| 1179 | + "normalizer": "lowercase" | |
| 1180 | + } | |
| 1181 | + } | |
| 1077 | 1182 | }, |
| 1078 | 1183 | "nl": { |
| 1079 | 1184 | "type": "text", |
| 1080 | - "analyzer": "dutch" | |
| 1185 | + "analyzer": "dutch", | |
| 1186 | + "fields": { | |
| 1187 | + "keyword": { | |
| 1188 | + "type": "keyword", | |
| 1189 | + "normalizer": "lowercase" | |
| 1190 | + } | |
| 1191 | + } | |
| 1081 | 1192 | }, |
| 1082 | 1193 | "fi": { |
| 1083 | 1194 | "type": "text", |
| 1084 | - "analyzer": "finnish" | |
| 1195 | + "analyzer": "finnish", | |
| 1196 | + "fields": { | |
| 1197 | + "keyword": { | |
| 1198 | + "type": "keyword", | |
| 1199 | + "normalizer": "lowercase" | |
| 1200 | + } | |
| 1201 | + } | |
| 1085 | 1202 | }, |
| 1086 | 1203 | "fr": { |
| 1087 | 1204 | "type": "text", |
| 1088 | - "analyzer": "french" | |
| 1205 | + "analyzer": "french", | |
| 1206 | + "fields": { | |
| 1207 | + "keyword": { | |
| 1208 | + "type": "keyword", | |
| 1209 | + "normalizer": "lowercase" | |
| 1210 | + } | |
| 1211 | + } | |
| 1089 | 1212 | }, |
| 1090 | 1213 | "gl": { |
| 1091 | 1214 | "type": "text", |
| 1092 | - "analyzer": "galician" | |
| 1093 | - }, | |
| 1094 | - "de": { | |
| 1215 | + "analyzer": "galician", | |
| 1216 | + "fields": { | |
| 1217 | + "keyword": { | |
| 1218 | + "type": "keyword", | |
| 1219 | + "normalizer": "lowercase" | |
| 1220 | + } | |
| 1221 | + } | |
| 1222 | + }, | |
| 1223 | + "de": { | |
| 1095 | 1224 | "type": "text", |
| 1096 | - "analyzer": "german" | |
| 1225 | + "analyzer": "german", | |
| 1226 | + "fields": { | |
| 1227 | + "keyword": { | |
| 1228 | + "type": "keyword", | |
| 1229 | + "normalizer": "lowercase" | |
| 1230 | + } | |
| 1231 | + } | |
| 1097 | 1232 | }, |
| 1098 | 1233 | "el": { |
| 1099 | 1234 | "type": "text", |
| 1100 | - "analyzer": "greek" | |
| 1235 | + "analyzer": "greek", | |
| 1236 | + "fields": { | |
| 1237 | + "keyword": { | |
| 1238 | + "type": "keyword", | |
| 1239 | + "normalizer": "lowercase" | |
| 1240 | + } | |
| 1241 | + } | |
| 1101 | 1242 | }, |
| 1102 | 1243 | "hi": { |
| 1103 | 1244 | "type": "text", |
| 1104 | - "analyzer": "hindi" | |
| 1245 | + "analyzer": "hindi", | |
| 1246 | + "fields": { | |
| 1247 | + "keyword": { | |
| 1248 | + "type": "keyword", | |
| 1249 | + "normalizer": "lowercase" | |
| 1250 | + } | |
| 1251 | + } | |
| 1105 | 1252 | }, |
| 1106 | 1253 | "hu": { |
| 1107 | 1254 | "type": "text", |
| 1108 | - "analyzer": "hungarian" | |
| 1255 | + "analyzer": "hungarian", | |
| 1256 | + "fields": { | |
| 1257 | + "keyword": { | |
| 1258 | + "type": "keyword", | |
| 1259 | + "normalizer": "lowercase" | |
| 1260 | + } | |
| 1261 | + } | |
| 1109 | 1262 | }, |
| 1110 | 1263 | "id": { |
| 1111 | 1264 | "type": "text", |
| 1112 | - "analyzer": "indonesian" | |
| 1265 | + "analyzer": "indonesian", | |
| 1266 | + "fields": { | |
| 1267 | + "keyword": { | |
| 1268 | + "type": "keyword", | |
| 1269 | + "normalizer": "lowercase" | |
| 1270 | + } | |
| 1271 | + } | |
| 1113 | 1272 | }, |
| 1114 | 1273 | "it": { |
| 1115 | 1274 | "type": "text", |
| 1116 | - "analyzer": "italian" | |
| 1275 | + "analyzer": "italian", | |
| 1276 | + "fields": { | |
| 1277 | + "keyword": { | |
| 1278 | + "type": "keyword", | |
| 1279 | + "normalizer": "lowercase" | |
| 1280 | + } | |
| 1281 | + } | |
| 1117 | 1282 | }, |
| 1118 | 1283 | "no": { |
| 1119 | 1284 | "type": "text", |
| 1120 | - "analyzer": "norwegian" | |
| 1285 | + "analyzer": "norwegian", | |
| 1286 | + "fields": { | |
| 1287 | + "keyword": { | |
| 1288 | + "type": "keyword", | |
| 1289 | + "normalizer": "lowercase" | |
| 1290 | + } | |
| 1291 | + } | |
| 1121 | 1292 | }, |
| 1122 | 1293 | "fa": { |
| 1123 | 1294 | "type": "text", |
| 1124 | - "analyzer": "persian" | |
| 1295 | + "analyzer": "persian", | |
| 1296 | + "fields": { | |
| 1297 | + "keyword": { | |
| 1298 | + "type": "keyword", | |
| 1299 | + "normalizer": "lowercase" | |
| 1300 | + } | |
| 1301 | + } | |
| 1125 | 1302 | }, |
| 1126 | 1303 | "pt": { |
| 1127 | 1304 | "type": "text", |
| 1128 | - "analyzer": "portuguese" | |
| 1305 | + "analyzer": "portuguese", | |
| 1306 | + "fields": { | |
| 1307 | + "keyword": { | |
| 1308 | + "type": "keyword", | |
| 1309 | + "normalizer": "lowercase" | |
| 1310 | + } | |
| 1311 | + } | |
| 1129 | 1312 | }, |
| 1130 | 1313 | "ro": { |
| 1131 | 1314 | "type": "text", |
| 1132 | - "analyzer": "romanian" | |
| 1315 | + "analyzer": "romanian", | |
| 1316 | + "fields": { | |
| 1317 | + "keyword": { | |
| 1318 | + "type": "keyword", | |
| 1319 | + "normalizer": "lowercase" | |
| 1320 | + } | |
| 1321 | + } | |
| 1133 | 1322 | }, |
| 1134 | 1323 | "ru": { |
| 1135 | 1324 | "type": "text", |
| 1136 | - "analyzer": "russian" | |
| 1325 | + "analyzer": "russian", | |
| 1326 | + "fields": { | |
| 1327 | + "keyword": { | |
| 1328 | + "type": "keyword", | |
| 1329 | + "normalizer": "lowercase" | |
| 1330 | + } | |
| 1331 | + } | |
| 1137 | 1332 | }, |
| 1138 | 1333 | "es": { |
| 1139 | 1334 | "type": "text", |
| 1140 | - "analyzer": "spanish" | |
| 1335 | + "analyzer": "spanish", | |
| 1336 | + "fields": { | |
| 1337 | + "keyword": { | |
| 1338 | + "type": "keyword", | |
| 1339 | + "normalizer": "lowercase" | |
| 1340 | + } | |
| 1341 | + } | |
| 1141 | 1342 | }, |
| 1142 | 1343 | "sv": { |
| 1143 | 1344 | "type": "text", |
| 1144 | - "analyzer": "swedish" | |
| 1345 | + "analyzer": "swedish", | |
| 1346 | + "fields": { | |
| 1347 | + "keyword": { | |
| 1348 | + "type": "keyword", | |
| 1349 | + "normalizer": "lowercase" | |
| 1350 | + } | |
| 1351 | + } | |
| 1145 | 1352 | }, |
| 1146 | 1353 | "tr": { |
| 1147 | 1354 | "type": "text", |
| 1148 | - "analyzer": "turkish" | |
| 1355 | + "analyzer": "turkish", | |
| 1356 | + "fields": { | |
| 1357 | + "keyword": { | |
| 1358 | + "type": "keyword", | |
| 1359 | + "normalizer": "lowercase" | |
| 1360 | + } | |
| 1361 | + } | |
| 1149 | 1362 | }, |
| 1150 | 1363 | "th": { |
| 1151 | 1364 | "type": "text", |
| 1152 | - "analyzer": "thai" | |
| 1365 | + "analyzer": "thai", | |
| 1366 | + "fields": { | |
| 1367 | + "keyword": { | |
| 1368 | + "type": "keyword", | |
| 1369 | + "normalizer": "lowercase" | |
| 1370 | + } | |
| 1371 | + } | |
| 1153 | 1372 | } |
| 1154 | 1373 | } |
| 1155 | 1374 | }, |
| ... | ... | @@ -1159,123 +1378,609 @@ |
| 1159 | 1378 | "zh": { |
| 1160 | 1379 | "type": "text", |
| 1161 | 1380 | "analyzer": "index_ik", |
| 1162 | - "search_analyzer": "query_ik" | |
| 1381 | + "search_analyzer": "query_ik", | |
| 1382 | + "fields": { | |
| 1383 | + "keyword": { | |
| 1384 | + "type": "keyword", | |
| 1385 | + "normalizer": "lowercase" | |
| 1386 | + } | |
| 1387 | + } | |
| 1163 | 1388 | }, |
| 1164 | 1389 | "en": { |
| 1165 | 1390 | "type": "text", |
| 1166 | - "analyzer": "english" | |
| 1391 | + "analyzer": "english", | |
| 1392 | + "fields": { | |
| 1393 | + "keyword": { | |
| 1394 | + "type": "keyword", | |
| 1395 | + "normalizer": "lowercase" | |
| 1396 | + } | |
| 1397 | + } | |
| 1167 | 1398 | }, |
| 1168 | 1399 | "ar": { |
| 1169 | 1400 | "type": "text", |
| 1170 | - "analyzer": "arabic" | |
| 1401 | + "analyzer": "arabic", | |
| 1402 | + "fields": { | |
| 1403 | + "keyword": { | |
| 1404 | + "type": "keyword", | |
| 1405 | + "normalizer": "lowercase" | |
| 1406 | + } | |
| 1407 | + } | |
| 1171 | 1408 | }, |
| 1172 | 1409 | "hy": { |
| 1173 | 1410 | "type": "text", |
| 1174 | - "analyzer": "armenian" | |
| 1411 | + "analyzer": "armenian", | |
| 1412 | + "fields": { | |
| 1413 | + "keyword": { | |
| 1414 | + "type": "keyword", | |
| 1415 | + "normalizer": "lowercase" | |
| 1416 | + } | |
| 1417 | + } | |
| 1175 | 1418 | }, |
| 1176 | 1419 | "eu": { |
| 1177 | 1420 | "type": "text", |
| 1178 | - "analyzer": "basque" | |
| 1421 | + "analyzer": "basque", | |
| 1422 | + "fields": { | |
| 1423 | + "keyword": { | |
| 1424 | + "type": "keyword", | |
| 1425 | + "normalizer": "lowercase" | |
| 1426 | + } | |
| 1427 | + } | |
| 1179 | 1428 | }, |
| 1180 | 1429 | "pt_br": { |
| 1181 | 1430 | "type": "text", |
| 1182 | - "analyzer": "brazilian" | |
| 1431 | + "analyzer": "brazilian", | |
| 1432 | + "fields": { | |
| 1433 | + "keyword": { | |
| 1434 | + "type": "keyword", | |
| 1435 | + "normalizer": "lowercase" | |
| 1436 | + } | |
| 1437 | + } | |
| 1183 | 1438 | }, |
| 1184 | 1439 | "bg": { |
| 1185 | 1440 | "type": "text", |
| 1186 | - "analyzer": "bulgarian" | |
| 1441 | + "analyzer": "bulgarian", | |
| 1442 | + "fields": { | |
| 1443 | + "keyword": { | |
| 1444 | + "type": "keyword", | |
| 1445 | + "normalizer": "lowercase" | |
| 1446 | + } | |
| 1447 | + } | |
| 1187 | 1448 | }, |
| 1188 | 1449 | "ca": { |
| 1189 | 1450 | "type": "text", |
| 1190 | - "analyzer": "catalan" | |
| 1451 | + "analyzer": "catalan", | |
| 1452 | + "fields": { | |
| 1453 | + "keyword": { | |
| 1454 | + "type": "keyword", | |
| 1455 | + "normalizer": "lowercase" | |
| 1456 | + } | |
| 1457 | + } | |
| 1191 | 1458 | }, |
| 1192 | 1459 | "cjk": { |
| 1193 | 1460 | "type": "text", |
| 1194 | - "analyzer": "cjk" | |
| 1461 | + "analyzer": "cjk", | |
| 1462 | + "fields": { | |
| 1463 | + "keyword": { | |
| 1464 | + "type": "keyword", | |
| 1465 | + "normalizer": "lowercase" | |
| 1466 | + } | |
| 1467 | + } | |
| 1195 | 1468 | }, |
| 1196 | 1469 | "cs": { |
| 1197 | 1470 | "type": "text", |
| 1198 | - "analyzer": "czech" | |
| 1471 | + "analyzer": "czech", | |
| 1472 | + "fields": { | |
| 1473 | + "keyword": { | |
| 1474 | + "type": "keyword", | |
| 1475 | + "normalizer": "lowercase" | |
| 1476 | + } | |
| 1477 | + } | |
| 1199 | 1478 | }, |
| 1200 | 1479 | "da": { |
| 1201 | 1480 | "type": "text", |
| 1202 | - "analyzer": "danish" | |
| 1481 | + "analyzer": "danish", | |
| 1482 | + "fields": { | |
| 1483 | + "keyword": { | |
| 1484 | + "type": "keyword", | |
| 1485 | + "normalizer": "lowercase" | |
| 1486 | + } | |
| 1487 | + } | |
| 1203 | 1488 | }, |
| 1204 | 1489 | "nl": { |
| 1205 | 1490 | "type": "text", |
| 1206 | - "analyzer": "dutch" | |
| 1491 | + "analyzer": "dutch", | |
| 1492 | + "fields": { | |
| 1493 | + "keyword": { | |
| 1494 | + "type": "keyword", | |
| 1495 | + "normalizer": "lowercase" | |
| 1496 | + } | |
| 1497 | + } | |
| 1207 | 1498 | }, |
| 1208 | 1499 | "fi": { |
| 1209 | 1500 | "type": "text", |
| 1210 | - "analyzer": "finnish" | |
| 1501 | + "analyzer": "finnish", | |
| 1502 | + "fields": { | |
| 1503 | + "keyword": { | |
| 1504 | + "type": "keyword", | |
| 1505 | + "normalizer": "lowercase" | |
| 1506 | + } | |
| 1507 | + } | |
| 1211 | 1508 | }, |
| 1212 | 1509 | "fr": { |
| 1213 | 1510 | "type": "text", |
| 1214 | - "analyzer": "french" | |
| 1511 | + "analyzer": "french", | |
| 1512 | + "fields": { | |
| 1513 | + "keyword": { | |
| 1514 | + "type": "keyword", | |
| 1515 | + "normalizer": "lowercase" | |
| 1516 | + } | |
| 1517 | + } | |
| 1215 | 1518 | }, |
| 1216 | 1519 | "gl": { |
| 1217 | 1520 | "type": "text", |
| 1218 | - "analyzer": "galician" | |
| 1521 | + "analyzer": "galician", | |
| 1522 | + "fields": { | |
| 1523 | + "keyword": { | |
| 1524 | + "type": "keyword", | |
| 1525 | + "normalizer": "lowercase" | |
| 1526 | + } | |
| 1527 | + } | |
| 1219 | 1528 | }, |
| 1220 | 1529 | "de": { |
| 1221 | 1530 | "type": "text", |
| 1222 | - "analyzer": "german" | |
| 1531 | + "analyzer": "german", | |
| 1532 | + "fields": { | |
| 1533 | + "keyword": { | |
| 1534 | + "type": "keyword", | |
| 1535 | + "normalizer": "lowercase" | |
| 1536 | + } | |
| 1537 | + } | |
| 1538 | + }, | |
| 1539 | + "el": { | |
| 1540 | + "type": "text", | |
| 1541 | + "analyzer": "greek", | |
| 1542 | + "fields": { | |
| 1543 | + "keyword": { | |
| 1544 | + "type": "keyword", | |
| 1545 | + "normalizer": "lowercase" | |
| 1546 | + } | |
| 1547 | + } | |
| 1548 | + }, | |
| 1549 | + "hi": { | |
| 1550 | + "type": "text", | |
| 1551 | + "analyzer": "hindi", | |
| 1552 | + "fields": { | |
| 1553 | + "keyword": { | |
| 1554 | + "type": "keyword", | |
| 1555 | + "normalizer": "lowercase" | |
| 1556 | + } | |
| 1557 | + } | |
| 1558 | + }, | |
| 1559 | + "hu": { | |
| 1560 | + "type": "text", | |
| 1561 | + "analyzer": "hungarian", | |
| 1562 | + "fields": { | |
| 1563 | + "keyword": { | |
| 1564 | + "type": "keyword", | |
| 1565 | + "normalizer": "lowercase" | |
| 1566 | + } | |
| 1567 | + } | |
| 1568 | + }, | |
| 1569 | + "id": { | |
| 1570 | + "type": "text", | |
| 1571 | + "analyzer": "indonesian", | |
| 1572 | + "fields": { | |
| 1573 | + "keyword": { | |
| 1574 | + "type": "keyword", | |
| 1575 | + "normalizer": "lowercase" | |
| 1576 | + } | |
| 1577 | + } | |
| 1578 | + }, | |
| 1579 | + "it": { | |
| 1580 | + "type": "text", | |
| 1581 | + "analyzer": "italian", | |
| 1582 | + "fields": { | |
| 1583 | + "keyword": { | |
| 1584 | + "type": "keyword", | |
| 1585 | + "normalizer": "lowercase" | |
| 1586 | + } | |
| 1587 | + } | |
| 1588 | + }, | |
| 1589 | + "no": { | |
| 1590 | + "type": "text", | |
| 1591 | + "analyzer": "norwegian", | |
| 1592 | + "fields": { | |
| 1593 | + "keyword": { | |
| 1594 | + "type": "keyword", | |
| 1595 | + "normalizer": "lowercase" | |
| 1596 | + } | |
| 1597 | + } | |
| 1598 | + }, | |
| 1599 | + "fa": { | |
| 1600 | + "type": "text", | |
| 1601 | + "analyzer": "persian", | |
| 1602 | + "fields": { | |
| 1603 | + "keyword": { | |
| 1604 | + "type": "keyword", | |
| 1605 | + "normalizer": "lowercase" | |
| 1606 | + } | |
| 1607 | + } | |
| 1608 | + }, | |
| 1609 | + "pt": { | |
| 1610 | + "type": "text", | |
| 1611 | + "analyzer": "portuguese", | |
| 1612 | + "fields": { | |
| 1613 | + "keyword": { | |
| 1614 | + "type": "keyword", | |
| 1615 | + "normalizer": "lowercase" | |
| 1616 | + } | |
| 1617 | + } | |
| 1618 | + }, | |
| 1619 | + "ro": { | |
| 1620 | + "type": "text", | |
| 1621 | + "analyzer": "romanian", | |
| 1622 | + "fields": { | |
| 1623 | + "keyword": { | |
| 1624 | + "type": "keyword", | |
| 1625 | + "normalizer": "lowercase" | |
| 1626 | + } | |
| 1627 | + } | |
| 1628 | + }, | |
| 1629 | + "ru": { | |
| 1630 | + "type": "text", | |
| 1631 | + "analyzer": "russian", | |
| 1632 | + "fields": { | |
| 1633 | + "keyword": { | |
| 1634 | + "type": "keyword", | |
| 1635 | + "normalizer": "lowercase" | |
| 1636 | + } | |
| 1637 | + } | |
| 1638 | + }, | |
| 1639 | + "es": { | |
| 1640 | + "type": "text", | |
| 1641 | + "analyzer": "spanish", | |
| 1642 | + "fields": { | |
| 1643 | + "keyword": { | |
| 1644 | + "type": "keyword", | |
| 1645 | + "normalizer": "lowercase" | |
| 1646 | + } | |
| 1647 | + } | |
| 1648 | + }, | |
| 1649 | + "sv": { | |
| 1650 | + "type": "text", | |
| 1651 | + "analyzer": "swedish", | |
| 1652 | + "fields": { | |
| 1653 | + "keyword": { | |
| 1654 | + "type": "keyword", | |
| 1655 | + "normalizer": "lowercase" | |
| 1656 | + } | |
| 1657 | + } | |
| 1658 | + }, | |
| 1659 | + "tr": { | |
| 1660 | + "type": "text", | |
| 1661 | + "analyzer": "turkish", | |
| 1662 | + "fields": { | |
| 1663 | + "keyword": { | |
| 1664 | + "type": "keyword", | |
| 1665 | + "normalizer": "lowercase" | |
| 1666 | + } | |
| 1667 | + } | |
| 1668 | + }, | |
| 1669 | + "th": { | |
| 1670 | + "type": "text", | |
| 1671 | + "analyzer": "thai", | |
| 1672 | + "fields": { | |
| 1673 | + "keyword": { | |
| 1674 | + "type": "keyword", | |
| 1675 | + "normalizer": "lowercase" | |
| 1676 | + } | |
| 1677 | + } | |
| 1678 | + } | |
| 1679 | + } | |
| 1680 | + }, | |
| 1681 | + "tags": { | |
| 1682 | + "type": "object", | |
| 1683 | + "properties": { | |
| 1684 | + "zh": { | |
| 1685 | + "type": "text", | |
| 1686 | + "analyzer": "index_ik", | |
| 1687 | + "search_analyzer": "query_ik", | |
| 1688 | + "fields": { | |
| 1689 | + "keyword": { | |
| 1690 | + "type": "keyword", | |
| 1691 | + "normalizer": "lowercase" | |
| 1692 | + } | |
| 1693 | + } | |
| 1694 | + }, | |
| 1695 | + "en": { | |
| 1696 | + "type": "text", | |
| 1697 | + "analyzer": "english", | |
| 1698 | + "fields": { | |
| 1699 | + "keyword": { | |
| 1700 | + "type": "keyword", | |
| 1701 | + "normalizer": "lowercase" | |
| 1702 | + } | |
| 1703 | + } | |
| 1704 | + }, | |
| 1705 | + "ar": { | |
| 1706 | + "type": "text", | |
| 1707 | + "analyzer": "arabic", | |
| 1708 | + "fields": { | |
| 1709 | + "keyword": { | |
| 1710 | + "type": "keyword", | |
| 1711 | + "normalizer": "lowercase" | |
| 1712 | + } | |
| 1713 | + } | |
| 1714 | + }, | |
| 1715 | + "hy": { | |
| 1716 | + "type": "text", | |
| 1717 | + "analyzer": "armenian", | |
| 1718 | + "fields": { | |
| 1719 | + "keyword": { | |
| 1720 | + "type": "keyword", | |
| 1721 | + "normalizer": "lowercase" | |
| 1722 | + } | |
| 1723 | + } | |
| 1724 | + }, | |
| 1725 | + "eu": { | |
| 1726 | + "type": "text", | |
| 1727 | + "analyzer": "basque", | |
| 1728 | + "fields": { | |
| 1729 | + "keyword": { | |
| 1730 | + "type": "keyword", | |
| 1731 | + "normalizer": "lowercase" | |
| 1732 | + } | |
| 1733 | + } | |
| 1734 | + }, | |
| 1735 | + "pt_br": { | |
| 1736 | + "type": "text", | |
| 1737 | + "analyzer": "brazilian", | |
| 1738 | + "fields": { | |
| 1739 | + "keyword": { | |
| 1740 | + "type": "keyword", | |
| 1741 | + "normalizer": "lowercase" | |
| 1742 | + } | |
| 1743 | + } | |
| 1744 | + }, | |
| 1745 | + "bg": { | |
| 1746 | + "type": "text", | |
| 1747 | + "analyzer": "bulgarian", | |
| 1748 | + "fields": { | |
| 1749 | + "keyword": { | |
| 1750 | + "type": "keyword", | |
| 1751 | + "normalizer": "lowercase" | |
| 1752 | + } | |
| 1753 | + } | |
| 1754 | + }, | |
| 1755 | + "ca": { | |
| 1756 | + "type": "text", | |
| 1757 | + "analyzer": "catalan", | |
| 1758 | + "fields": { | |
| 1759 | + "keyword": { | |
| 1760 | + "type": "keyword", | |
| 1761 | + "normalizer": "lowercase" | |
| 1762 | + } | |
| 1763 | + } | |
| 1764 | + }, | |
| 1765 | + "cjk": { | |
| 1766 | + "type": "text", | |
| 1767 | + "analyzer": "cjk", | |
| 1768 | + "fields": { | |
| 1769 | + "keyword": { | |
| 1770 | + "type": "keyword", | |
| 1771 | + "normalizer": "lowercase" | |
| 1772 | + } | |
| 1773 | + } | |
| 1774 | + }, | |
| 1775 | + "cs": { | |
| 1776 | + "type": "text", | |
| 1777 | + "analyzer": "czech", | |
| 1778 | + "fields": { | |
| 1779 | + "keyword": { | |
| 1780 | + "type": "keyword", | |
| 1781 | + "normalizer": "lowercase" | |
| 1782 | + } | |
| 1783 | + } | |
| 1784 | + }, | |
| 1785 | + "da": { | |
| 1786 | + "type": "text", | |
| 1787 | + "analyzer": "danish", | |
| 1788 | + "fields": { | |
| 1789 | + "keyword": { | |
| 1790 | + "type": "keyword", | |
| 1791 | + "normalizer": "lowercase" | |
| 1792 | + } | |
| 1793 | + } | |
| 1794 | + }, | |
| 1795 | + "nl": { | |
| 1796 | + "type": "text", | |
| 1797 | + "analyzer": "dutch", | |
| 1798 | + "fields": { | |
| 1799 | + "keyword": { | |
| 1800 | + "type": "keyword", | |
| 1801 | + "normalizer": "lowercase" | |
| 1802 | + } | |
| 1803 | + } | |
| 1804 | + }, | |
| 1805 | + "fi": { | |
| 1806 | + "type": "text", | |
| 1807 | + "analyzer": "finnish", | |
| 1808 | + "fields": { | |
| 1809 | + "keyword": { | |
| 1810 | + "type": "keyword", | |
| 1811 | + "normalizer": "lowercase" | |
| 1812 | + } | |
| 1813 | + } | |
| 1814 | + }, | |
| 1815 | + "fr": { | |
| 1816 | + "type": "text", | |
| 1817 | + "analyzer": "french", | |
| 1818 | + "fields": { | |
| 1819 | + "keyword": { | |
| 1820 | + "type": "keyword", | |
| 1821 | + "normalizer": "lowercase" | |
| 1822 | + } | |
| 1823 | + } | |
| 1824 | + }, | |
| 1825 | + "gl": { | |
| 1826 | + "type": "text", | |
| 1827 | + "analyzer": "galician", | |
| 1828 | + "fields": { | |
| 1829 | + "keyword": { | |
| 1830 | + "type": "keyword", | |
| 1831 | + "normalizer": "lowercase" | |
| 1832 | + } | |
| 1833 | + } | |
| 1834 | + }, | |
| 1835 | + "de": { | |
| 1836 | + "type": "text", | |
| 1837 | + "analyzer": "german", | |
| 1838 | + "fields": { | |
| 1839 | + "keyword": { | |
| 1840 | + "type": "keyword", | |
| 1841 | + "normalizer": "lowercase" | |
| 1842 | + } | |
| 1843 | + } | |
| 1223 | 1844 | }, |
| 1224 | 1845 | "el": { |
| 1225 | 1846 | "type": "text", |
| 1226 | - "analyzer": "greek" | |
| 1847 | + "analyzer": "greek", | |
| 1848 | + "fields": { | |
| 1849 | + "keyword": { | |
| 1850 | + "type": "keyword", | |
| 1851 | + "normalizer": "lowercase" | |
| 1852 | + } | |
| 1853 | + } | |
| 1227 | 1854 | }, |
| 1228 | 1855 | "hi": { |
| 1229 | 1856 | "type": "text", |
| 1230 | - "analyzer": "hindi" | |
| 1857 | + "analyzer": "hindi", | |
| 1858 | + "fields": { | |
| 1859 | + "keyword": { | |
| 1860 | + "type": "keyword", | |
| 1861 | + "normalizer": "lowercase" | |
| 1862 | + } | |
| 1863 | + } | |
| 1231 | 1864 | }, |
| 1232 | 1865 | "hu": { |
| 1233 | 1866 | "type": "text", |
| 1234 | - "analyzer": "hungarian" | |
| 1867 | + "analyzer": "hungarian", | |
| 1868 | + "fields": { | |
| 1869 | + "keyword": { | |
| 1870 | + "type": "keyword", | |
| 1871 | + "normalizer": "lowercase" | |
| 1872 | + } | |
| 1873 | + } | |
| 1235 | 1874 | }, |
| 1236 | 1875 | "id": { |
| 1237 | 1876 | "type": "text", |
| 1238 | - "analyzer": "indonesian" | |
| 1877 | + "analyzer": "indonesian", | |
| 1878 | + "fields": { | |
| 1879 | + "keyword": { | |
| 1880 | + "type": "keyword", | |
| 1881 | + "normalizer": "lowercase" | |
| 1882 | + } | |
| 1883 | + } | |
| 1239 | 1884 | }, |
| 1240 | 1885 | "it": { |
| 1241 | 1886 | "type": "text", |
| 1242 | - "analyzer": "italian" | |
| 1887 | + "analyzer": "italian", | |
| 1888 | + "fields": { | |
| 1889 | + "keyword": { | |
| 1890 | + "type": "keyword", | |
| 1891 | + "normalizer": "lowercase" | |
| 1892 | + } | |
| 1893 | + } | |
| 1243 | 1894 | }, |
| 1244 | 1895 | "no": { |
| 1245 | 1896 | "type": "text", |
| 1246 | - "analyzer": "norwegian" | |
| 1897 | + "analyzer": "norwegian", | |
| 1898 | + "fields": { | |
| 1899 | + "keyword": { | |
| 1900 | + "type": "keyword", | |
| 1901 | + "normalizer": "lowercase" | |
| 1902 | + } | |
| 1903 | + } | |
| 1247 | 1904 | }, |
| 1248 | 1905 | "fa": { |
| 1249 | 1906 | "type": "text", |
| 1250 | - "analyzer": "persian" | |
| 1907 | + "analyzer": "persian", | |
| 1908 | + "fields": { | |
| 1909 | + "keyword": { | |
| 1910 | + "type": "keyword", | |
| 1911 | + "normalizer": "lowercase" | |
| 1912 | + } | |
| 1913 | + } | |
| 1251 | 1914 | }, |
| 1252 | 1915 | "pt": { |
| 1253 | 1916 | "type": "text", |
| 1254 | - "analyzer": "portuguese" | |
| 1917 | + "analyzer": "portuguese", | |
| 1918 | + "fields": { | |
| 1919 | + "keyword": { | |
| 1920 | + "type": "keyword", | |
| 1921 | + "normalizer": "lowercase" | |
| 1922 | + } | |
| 1923 | + } | |
| 1255 | 1924 | }, |
| 1256 | 1925 | "ro": { |
| 1257 | 1926 | "type": "text", |
| 1258 | - "analyzer": "romanian" | |
| 1927 | + "analyzer": "romanian", | |
| 1928 | + "fields": { | |
| 1929 | + "keyword": { | |
| 1930 | + "type": "keyword", | |
| 1931 | + "normalizer": "lowercase" | |
| 1932 | + } | |
| 1933 | + } | |
| 1259 | 1934 | }, |
| 1260 | 1935 | "ru": { |
| 1261 | 1936 | "type": "text", |
| 1262 | - "analyzer": "russian" | |
| 1937 | + "analyzer": "russian", | |
| 1938 | + "fields": { | |
| 1939 | + "keyword": { | |
| 1940 | + "type": "keyword", | |
| 1941 | + "normalizer": "lowercase" | |
| 1942 | + } | |
| 1943 | + } | |
| 1263 | 1944 | }, |
| 1264 | 1945 | "es": { |
| 1265 | 1946 | "type": "text", |
| 1266 | - "analyzer": "spanish" | |
| 1947 | + "analyzer": "spanish", | |
| 1948 | + "fields": { | |
| 1949 | + "keyword": { | |
| 1950 | + "type": "keyword", | |
| 1951 | + "normalizer": "lowercase" | |
| 1952 | + } | |
| 1953 | + } | |
| 1267 | 1954 | }, |
| 1268 | 1955 | "sv": { |
| 1269 | 1956 | "type": "text", |
| 1270 | - "analyzer": "swedish" | |
| 1957 | + "analyzer": "swedish", | |
| 1958 | + "fields": { | |
| 1959 | + "keyword": { | |
| 1960 | + "type": "keyword", | |
| 1961 | + "normalizer": "lowercase" | |
| 1962 | + } | |
| 1963 | + } | |
| 1271 | 1964 | }, |
| 1272 | 1965 | "tr": { |
| 1273 | 1966 | "type": "text", |
| 1274 | - "analyzer": "turkish" | |
| 1967 | + "analyzer": "turkish", | |
| 1968 | + "fields": { | |
| 1969 | + "keyword": { | |
| 1970 | + "type": "keyword", | |
| 1971 | + "normalizer": "lowercase" | |
| 1972 | + } | |
| 1973 | + } | |
| 1275 | 1974 | }, |
| 1276 | 1975 | "th": { |
| 1277 | 1976 | "type": "text", |
| 1278 | - "analyzer": "thai" | |
| 1977 | + "analyzer": "thai", | |
| 1978 | + "fields": { | |
| 1979 | + "keyword": { | |
| 1980 | + "type": "keyword", | |
| 1981 | + "normalizer": "lowercase" | |
| 1982 | + } | |
| 1983 | + } | |
| 1279 | 1984 | } |
| 1280 | 1985 | } |
| 1281 | 1986 | }, |
| ... | ... | @@ -1306,8 +2011,108 @@ |
| 1306 | 2011 | "name": { |
| 1307 | 2012 | "type": "keyword" |
| 1308 | 2013 | }, |
| 1309 | - "value": { | |
| 2014 | + "value_keyword": { | |
| 2015 | + "type": "keyword" | |
| 2016 | + }, | |
| 2017 | + "value_text": { | |
| 2018 | + "type": "object", | |
| 2019 | + "properties": { | |
| 2020 | + "zh": { | |
| 2021 | + "type": "text", | |
| 2022 | + "analyzer": "index_ik", | |
| 2023 | + "search_analyzer": "query_ik", | |
| 2024 | + "fields": { | |
| 2025 | + "keyword": { | |
| 2026 | + "type": "keyword", | |
| 2027 | + "normalizer": "lowercase" | |
| 2028 | + } | |
| 2029 | + } | |
| 2030 | + }, | |
| 2031 | + "en": { | |
| 2032 | + "type": "text", | |
| 2033 | + "analyzer": "english", | |
| 2034 | + "fields": { | |
| 2035 | + "keyword": { | |
| 2036 | + "type": "keyword", | |
| 2037 | + "normalizer": "lowercase" | |
| 2038 | + } | |
| 2039 | + } | |
| 2040 | + } | |
| 2041 | + } | |
| 2042 | + } | |
| 2043 | + } | |
| 2044 | + }, | |
| 2045 | + "qanchors": { | |
| 2046 | + "type": "object", | |
| 2047 | + "properties": { | |
| 2048 | + "zh": { | |
| 2049 | + "type": "text", | |
| 2050 | + "analyzer": "index_ik", | |
| 2051 | + "search_analyzer": "query_ik" | |
| 2052 | + }, | |
| 2053 | + "en": { | |
| 2054 | + "type": "text", | |
| 2055 | + "analyzer": "english" | |
| 2056 | + } | |
| 2057 | + } | |
| 2058 | + }, | |
| 2059 | + "enriched_tags": { | |
| 2060 | + "type": "object", | |
| 2061 | + "properties": { | |
| 2062 | + "zh": { | |
| 2063 | + "type": "text", | |
| 2064 | + "analyzer": "index_ik", | |
| 2065 | + "search_analyzer": "query_ik", | |
| 2066 | + "fields": { | |
| 2067 | + "keyword": { | |
| 2068 | + "type": "keyword", | |
| 2069 | + "normalizer": "lowercase" | |
| 2070 | + } | |
| 2071 | + } | |
| 2072 | + }, | |
| 2073 | + "en": { | |
| 2074 | + "type": "text", | |
| 2075 | + "analyzer": "english", | |
| 2076 | + "fields": { | |
| 2077 | + "keyword": { | |
| 2078 | + "type": "keyword", | |
| 2079 | + "normalizer": "lowercase" | |
| 2080 | + } | |
| 2081 | + } | |
| 2082 | + } | |
| 2083 | + } | |
| 2084 | + }, | |
| 2085 | + "enriched_attributes": { | |
| 2086 | + "type": "nested", | |
| 2087 | + "properties": { | |
| 2088 | + "name": { | |
| 1310 | 2089 | "type": "keyword" |
| 2090 | + }, | |
| 2091 | + "value": { | |
| 2092 | + "type": "object", | |
| 2093 | + "properties": { | |
| 2094 | + "zh": { | |
| 2095 | + "type": "text", | |
| 2096 | + "analyzer": "index_ik", | |
| 2097 | + "search_analyzer": "query_ik", | |
| 2098 | + "fields": { | |
| 2099 | + "keyword": { | |
| 2100 | + "type": "keyword", | |
| 2101 | + "normalizer": "lowercase" | |
| 2102 | + } | |
| 2103 | + } | |
| 2104 | + }, | |
| 2105 | + "en": { | |
| 2106 | + "type": "text", | |
| 2107 | + "analyzer": "english", | |
| 2108 | + "fields": { | |
| 2109 | + "keyword": { | |
| 2110 | + "type": "keyword", | |
| 2111 | + "normalizer": "lowercase" | |
| 2112 | + } | |
| 2113 | + } | |
| 2114 | + } | |
| 2115 | + } | |
| 1311 | 2116 | } |
| 1312 | 2117 | } |
| 1313 | 2118 | }, |
| ... | ... | @@ -1321,13 +2126,82 @@ |
| 1321 | 2126 | "type": "keyword" |
| 1322 | 2127 | }, |
| 1323 | 2128 | "option1_values": { |
| 1324 | - "type": "keyword" | |
| 2129 | + "type": "object", | |
| 2130 | + "properties": { | |
| 2131 | + "zh": { | |
| 2132 | + "type": "text", | |
| 2133 | + "analyzer": "index_ik", | |
| 2134 | + "search_analyzer": "query_ik", | |
| 2135 | + "fields": { | |
| 2136 | + "keyword": { | |
| 2137 | + "type": "keyword", | |
| 2138 | + "normalizer": "lowercase" | |
| 2139 | + } | |
| 2140 | + } | |
| 2141 | + }, | |
| 2142 | + "en": { | |
| 2143 | + "type": "text", | |
| 2144 | + "analyzer": "english", | |
| 2145 | + "fields": { | |
| 2146 | + "keyword": { | |
| 2147 | + "type": "keyword", | |
| 2148 | + "normalizer": "lowercase" | |
| 2149 | + } | |
| 2150 | + } | |
| 2151 | + } | |
| 2152 | + } | |
| 1325 | 2153 | }, |
| 1326 | 2154 | "option2_values": { |
| 1327 | - "type": "keyword" | |
| 2155 | + "type": "object", | |
| 2156 | + "properties": { | |
| 2157 | + "zh": { | |
| 2158 | + "type": "text", | |
| 2159 | + "analyzer": "index_ik", | |
| 2160 | + "search_analyzer": "query_ik", | |
| 2161 | + "fields": { | |
| 2162 | + "keyword": { | |
| 2163 | + "type": "keyword", | |
| 2164 | + "normalizer": "lowercase" | |
| 2165 | + } | |
| 2166 | + } | |
| 2167 | + }, | |
| 2168 | + "en": { | |
| 2169 | + "type": "text", | |
| 2170 | + "analyzer": "english", | |
| 2171 | + "fields": { | |
| 2172 | + "keyword": { | |
| 2173 | + "type": "keyword", | |
| 2174 | + "normalizer": "lowercase" | |
| 2175 | + } | |
| 2176 | + } | |
| 2177 | + } | |
| 2178 | + } | |
| 1328 | 2179 | }, |
| 1329 | 2180 | "option3_values": { |
| 1330 | - "type": "keyword" | |
| 2181 | + "type": "object", | |
| 2182 | + "properties": { | |
| 2183 | + "zh": { | |
| 2184 | + "type": "text", | |
| 2185 | + "analyzer": "index_ik", | |
| 2186 | + "search_analyzer": "query_ik", | |
| 2187 | + "fields": { | |
| 2188 | + "keyword": { | |
| 2189 | + "type": "keyword", | |
| 2190 | + "normalizer": "lowercase" | |
| 2191 | + } | |
| 2192 | + } | |
| 2193 | + }, | |
| 2194 | + "en": { | |
| 2195 | + "type": "text", | |
| 2196 | + "analyzer": "english", | |
| 2197 | + "fields": { | |
| 2198 | + "keyword": { | |
| 2199 | + "type": "keyword", | |
| 2200 | + "normalizer": "lowercase" | |
| 2201 | + } | |
| 2202 | + } | |
| 2203 | + } | |
| 2204 | + } | |
| 1331 | 2205 | }, |
| 1332 | 2206 | "min_price": { |
| 1333 | 2207 | "type": "float" |
| ... | ... | @@ -1391,20 +2265,6 @@ |
| 1391 | 2265 | "index": false |
| 1392 | 2266 | } |
| 1393 | 2267 | } |
| 1394 | - }, | |
| 1395 | - "semantic_attributes": { | |
| 1396 | - "type": "nested", | |
| 1397 | - "properties": { | |
| 1398 | - "lang": { | |
| 1399 | - "type": "keyword" | |
| 1400 | - }, | |
| 1401 | - "name": { | |
| 1402 | - "type": "keyword" | |
| 1403 | - }, | |
| 1404 | - "value": { | |
| 1405 | - "type": "keyword" | |
| 1406 | - } | |
| 1407 | - } | |
| 1408 | 2268 | } |
| 1409 | 2269 | } |
| 1410 | 2270 | } | ... | ... |
| ... | ... | @@ -0,0 +1,629 @@ |
| 1 | +{ | |
| 2 | + "settings": { | |
| 3 | + "number_of_shards": 1, | |
| 4 | + "number_of_replicas": 0, | |
| 5 | + "refresh_interval": "30s", | |
| 6 | + "analysis": { | |
| 7 | + "analyzer": { | |
| 8 | + "index_ik": { | |
| 9 | + "type": "custom", | |
| 10 | + "tokenizer": "ik_max_word", | |
| 11 | + "filter": [ | |
| 12 | + "lowercase", | |
| 13 | + "asciifolding" | |
| 14 | + ] | |
| 15 | + }, | |
| 16 | + "query_ik": { | |
| 17 | + "type": "custom", | |
| 18 | + "tokenizer": "ik_smart", | |
| 19 | + "filter": [ | |
| 20 | + "lowercase", | |
| 21 | + "asciifolding" | |
| 22 | + ] | |
| 23 | + } | |
| 24 | + }, | |
| 25 | + "normalizer": { | |
| 26 | + "lowercase": { | |
| 27 | + "type": "custom", | |
| 28 | + "filter": [ | |
| 29 | + "lowercase" | |
| 30 | + ] | |
| 31 | + } | |
| 32 | + } | |
| 33 | + }, | |
| 34 | + "similarity": { | |
| 35 | + "default": { | |
| 36 | + "type": "BM25", | |
| 37 | + "b": 0.0, | |
| 38 | + "k1": 0.0 | |
| 39 | + } | |
| 40 | + } | |
| 41 | + }, | |
| 42 | + "mappings": { | |
| 43 | + "properties": { | |
| 44 | + "tenant_id": { | |
| 45 | + "type": "keyword" | |
| 46 | + }, | |
| 47 | + "spu_id": { | |
| 48 | + "type": "keyword" | |
| 49 | + }, | |
| 50 | + "create_time": { | |
| 51 | + "type": "date" | |
| 52 | + }, | |
| 53 | + "update_time": { | |
| 54 | + "type": "date" | |
| 55 | + }, | |
| 56 | + "title": { | |
| 57 | + "type": "object", | |
| 58 | + "properties": { | |
| 59 | + "zh": { | |
| 60 | + "type": "text", | |
| 61 | + "analyzer": "index_ik", | |
| 62 | + "search_analyzer": "query_ik" | |
| 63 | + }, | |
| 64 | + "en": { | |
| 65 | + "type": "text", | |
| 66 | + "analyzer": "english" | |
| 67 | + }, | |
| 68 | + "ar": { | |
| 69 | + "type": "text", | |
| 70 | + "analyzer": "arabic" | |
| 71 | + }, | |
| 72 | + "hy": { | |
| 73 | + "type": "text", | |
| 74 | + "analyzer": "armenian" | |
| 75 | + }, | |
| 76 | + "eu": { | |
| 77 | + "type": "text", | |
| 78 | + "analyzer": "basque" | |
| 79 | + }, | |
| 80 | + "pt_br": { | |
| 81 | + "type": "text", | |
| 82 | + "analyzer": "brazilian" | |
| 83 | + }, | |
| 84 | + "bg": { | |
| 85 | + "type": "text", | |
| 86 | + "analyzer": "bulgarian" | |
| 87 | + }, | |
| 88 | + "ca": { | |
| 89 | + "type": "text", | |
| 90 | + "analyzer": "catalan" | |
| 91 | + }, | |
| 92 | + "cjk": { | |
| 93 | + "type": "text", | |
| 94 | + "analyzer": "cjk" | |
| 95 | + }, | |
| 96 | + "cs": { | |
| 97 | + "type": "text", | |
| 98 | + "analyzer": "czech" | |
| 99 | + }, | |
| 100 | + "da": { | |
| 101 | + "type": "text", | |
| 102 | + "analyzer": "danish" | |
| 103 | + }, | |
| 104 | + "nl": { | |
| 105 | + "type": "text", | |
| 106 | + "analyzer": "dutch" | |
| 107 | + }, | |
| 108 | + "fi": { | |
| 109 | + "type": "text", | |
| 110 | + "analyzer": "finnish" | |
| 111 | + }, | |
| 112 | + "fr": { | |
| 113 | + "type": "text", | |
| 114 | + "analyzer": "french" | |
| 115 | + }, | |
| 116 | + "gl": { | |
| 117 | + "type": "text", | |
| 118 | + "analyzer": "galician" | |
| 119 | + }, | |
| 120 | + "de": { | |
| 121 | + "type": "text", | |
| 122 | + "analyzer": "german" | |
| 123 | + }, | |
| 124 | + "el": { | |
| 125 | + "type": "text", | |
| 126 | + "analyzer": "greek" | |
| 127 | + }, | |
| 128 | + "hi": { | |
| 129 | + "type": "text", | |
| 130 | + "analyzer": "hindi" | |
| 131 | + }, | |
| 132 | + "hu": { | |
| 133 | + "type": "text", | |
| 134 | + "analyzer": "hungarian" | |
| 135 | + }, | |
| 136 | + "id": { | |
| 137 | + "type": "text", | |
| 138 | + "analyzer": "indonesian" | |
| 139 | + }, | |
| 140 | + "it": { | |
| 141 | + "type": "text", | |
| 142 | + "analyzer": "italian" | |
| 143 | + }, | |
| 144 | + "no": { | |
| 145 | + "type": "text", | |
| 146 | + "analyzer": "norwegian" | |
| 147 | + }, | |
| 148 | + "fa": { | |
| 149 | + "type": "text", | |
| 150 | + "analyzer": "persian" | |
| 151 | + }, | |
| 152 | + "pt": { | |
| 153 | + "type": "text", | |
| 154 | + "analyzer": "portuguese" | |
| 155 | + }, | |
| 156 | + "ro": { | |
| 157 | + "type": "text", | |
| 158 | + "analyzer": "romanian" | |
| 159 | + }, | |
| 160 | + "ru": { | |
| 161 | + "type": "text", | |
| 162 | + "analyzer": "russian" | |
| 163 | + }, | |
| 164 | + "es": { | |
| 165 | + "type": "text", | |
| 166 | + "analyzer": "spanish" | |
| 167 | + }, | |
| 168 | + "sv": { | |
| 169 | + "type": "text", | |
| 170 | + "analyzer": "swedish" | |
| 171 | + }, | |
| 172 | + "tr": { | |
| 173 | + "type": "text", | |
| 174 | + "analyzer": "turkish" | |
| 175 | + }, | |
| 176 | + "th": { | |
| 177 | + "type": "text", | |
| 178 | + "analyzer": "thai" | |
| 179 | + } | |
| 180 | + } | |
| 181 | + }, | |
| 182 | + "keywords": { | |
| 183 | + "type": "object", | |
| 184 | + "properties": { | |
| 185 | + "zh": { | |
| 186 | + "type": "text", | |
| 187 | + "analyzer": "index_ik", | |
| 188 | + "search_analyzer": "query_ik" | |
| 189 | + }, | |
| 190 | + "en": { | |
| 191 | + "type": "text", | |
| 192 | + "analyzer": "english", | |
| 193 | + "fields": { | |
| 194 | + "keyword": { | |
| 195 | + "type": "keyword", | |
| 196 | + "normalizer": "lowercase" | |
| 197 | + } | |
| 198 | + } | |
| 199 | + }, | |
| 200 | + "ar": { | |
| 201 | + "type": "text", | |
| 202 | + "analyzer": "arabic", | |
| 203 | + "fields": { | |
| 204 | + "keyword": { | |
| 205 | + "type": "keyword", | |
| 206 | + "normalizer": "lowercase" | |
| 207 | + } | |
| 208 | + } | |
| 209 | + }, | |
| 210 | +... | |
| 211 | + } | |
| 212 | + }, | |
| 213 | + "brief": { | |
| 214 | + "type": "object", | |
| 215 | + "properties": { | |
| 216 | + "zh": { | |
| 217 | + "type": "text", | |
| 218 | + "analyzer": "index_ik", | |
| 219 | + "search_analyzer": "query_ik" | |
| 220 | + }, | |
| 221 | + "en": { | |
| 222 | + "type": "text", | |
| 223 | + "analyzer": "english" | |
| 224 | + }, | |
| 225 | + "ar": { | |
| 226 | + "type": "text", | |
| 227 | + "analyzer": "arabic" | |
| 228 | + }, | |
| 229 | + ... | |
| 230 | + } | |
| 231 | + }, | |
| 232 | + "description": { | |
| 233 | + "type": "object", | |
| 234 | + "properties": { | |
| 235 | + "zh": { | |
| 236 | + "type": "text", | |
| 237 | + "analyzer": "index_ik", | |
| 238 | + "search_analyzer": "query_ik" | |
| 239 | + }, | |
| 240 | + "en": { | |
| 241 | + "type": "text", | |
| 242 | + "analyzer": "english" | |
| 243 | + }, | |
| 244 | + "ar": { | |
| 245 | + "type": "text", | |
| 246 | + "analyzer": "arabic" | |
| 247 | + }, | |
| 248 | + ... | |
| 249 | + } | |
| 250 | + }, | |
| 251 | + "vendor": { | |
| 252 | + "type": "object", | |
| 253 | + "properties": { | |
| 254 | + "zh": { | |
| 255 | + "type": "text", | |
| 256 | + "analyzer": "index_ik", | |
| 257 | + "search_analyzer": "query_ik" | |
| 258 | + }, | |
| 259 | + "en": { | |
| 260 | + "type": "text", | |
| 261 | + "analyzer": "english", | |
| 262 | + "fields": { | |
| 263 | + "keyword": { | |
| 264 | + "type": "keyword", | |
| 265 | + "normalizer": "lowercase" | |
| 266 | + } | |
| 267 | + } | |
| 268 | + }, | |
| 269 | + "ar": { | |
| 270 | + "type": "text", | |
| 271 | + "analyzer": "arabic", | |
| 272 | + "fields": { | |
| 273 | + "keyword": { | |
| 274 | + "type": "keyword", | |
| 275 | + "normalizer": "lowercase" | |
| 276 | + } | |
| 277 | + } | |
| 278 | + }, | |
| 279 | + ... | |
| 280 | + } | |
| 281 | + }, | |
| 282 | + "image_url": { | |
| 283 | + "type": "keyword", | |
| 284 | + "index": false | |
| 285 | + }, | |
| 286 | + "title_embedding": { | |
| 287 | + "type": "dense_vector", | |
| 288 | + "dims": 1024, | |
| 289 | + "index": true, | |
| 290 | + "similarity": "dot_product", | |
| 291 | + "element_type": "bfloat16" | |
| 292 | + }, | |
| 293 | + "image_embedding": { | |
| 294 | + "type": "nested", | |
| 295 | + "properties": { | |
| 296 | + "vector": { | |
| 297 | + "type": "dense_vector", | |
| 298 | + "dims": 768, | |
| 299 | + "index": true, | |
| 300 | + "similarity": "dot_product", | |
| 301 | + "element_type": "bfloat16" | |
| 302 | + }, | |
| 303 | + "url": { | |
| 304 | + "type": "text" | |
| 305 | + } | |
| 306 | + } | |
| 307 | + }, | |
| 308 | + "category_path": { | |
| 309 | + "type": "object", | |
| 310 | + "properties": { | |
| 311 | + "zh": { | |
| 312 | + "type": "text", | |
| 313 | + "analyzer": "index_ik", | |
| 314 | + "search_analyzer": "query_ik" | |
| 315 | + }, | |
| 316 | + "en": { | |
| 317 | + "type": "text", | |
| 318 | + "analyzer": "english" | |
| 319 | + }, | |
| 320 | + "ar": { | |
| 321 | + "type": "text", | |
| 322 | + "analyzer": "arabic" | |
| 323 | + }, | |
| 324 | + ... | |
| 325 | + } | |
| 326 | + } | |
| 327 | + }, | |
| 328 | + "category_name_text": { | |
| 329 | + "type": "object", | |
| 330 | + "properties": { | |
| 331 | + "zh": { | |
| 332 | + "type": "text", | |
| 333 | + "analyzer": "index_ik", | |
| 334 | + "search_analyzer": "query_ik" | |
| 335 | + }, | |
| 336 | + "en": { | |
| 337 | + "type": "text", | |
| 338 | + "analyzer": "english" | |
| 339 | + }, | |
| 340 | + "ar": { | |
| 341 | + "type": "text", | |
| 342 | + "analyzer": "arabic" | |
| 343 | + }, | |
| 344 | + ... | |
| 345 | + | |
| 346 | + } | |
| 347 | + }, | |
| 348 | + "qanchors": { | |
| 349 | + "type": "object", | |
| 350 | + "properties": { | |
| 351 | + "zh": { | |
| 352 | + "type": "text", | |
| 353 | + "analyzer": "index_ik", | |
| 354 | + "search_analyzer": "query_ik" | |
| 355 | + }, | |
| 356 | + "en": { | |
| 357 | + "type": "text", | |
| 358 | + "analyzer": "english" | |
| 359 | + } | |
| 360 | + } | |
| 361 | + }, | |
| 362 | + "tags": { | |
| 363 | + "type": "object", | |
| 364 | + "properties": { | |
| 365 | + "zh": { | |
| 366 | + "type": "text", | |
| 367 | + "analyzer": "index_ik", | |
| 368 | + "search_analyzer": "query_ik", | |
| 369 | + "fields": { | |
| 370 | + "keyword": { | |
| 371 | + "type": "keyword", | |
| 372 | + "normalizer": "lowercase" | |
| 373 | + } | |
| 374 | + } | |
| 375 | + }, | |
| 376 | + "en": { | |
| 377 | + "type": "text", | |
| 378 | + "analyzer": "english", | |
| 379 | + "fields": { | |
| 380 | + "keyword": { | |
| 381 | + "type": "keyword", | |
| 382 | + "normalizer": "lowercase" | |
| 383 | + } | |
| 384 | + } | |
| 385 | + } | |
| 386 | + } | |
| 387 | + }, | |
| 388 | + "category_id": { | |
| 389 | + "type": "keyword" | |
| 390 | + }, | |
| 391 | + "category_name": { | |
| 392 | + "type": "keyword" | |
| 393 | + }, | |
| 394 | + "category_level": { | |
| 395 | + "type": "integer" | |
| 396 | + }, | |
| 397 | + "category1_name": { | |
| 398 | + "type": "keyword" | |
| 399 | + }, | |
| 400 | + "category2_name": { | |
| 401 | + "type": "keyword" | |
| 402 | + }, | |
| 403 | + "category3_name": { | |
| 404 | + "type": "keyword" | |
| 405 | + }, | |
| 406 | + "specifications": { | |
| 407 | + "type": "nested", | |
| 408 | + "properties": { | |
| 409 | + "sku_id": { | |
| 410 | + "type": "keyword" | |
| 411 | + }, | |
| 412 | + "name": { | |
| 413 | + "type": "keyword" | |
| 414 | + }, | |
| 415 | + "value": { | |
| 416 | + "type": "object", | |
| 417 | + "properties": { | |
| 418 | + "zh": { | |
| 419 | + "type": "text", | |
| 420 | + "analyzer": "index_ik", | |
| 421 | + "search_analyzer": "query_ik", | |
| 422 | + "fields": { | |
| 423 | + "keyword": { | |
| 424 | + "type": "keyword", | |
| 425 | + "normalizer": "lowercase" | |
| 426 | + } | |
| 427 | + } | |
| 428 | + }, | |
| 429 | + "en": { | |
| 430 | + "type": "text", | |
| 431 | + "analyzer": "english", | |
| 432 | + "fields": { | |
| 433 | + "keyword": { | |
| 434 | + "type": "keyword", | |
| 435 | + "normalizer": "lowercase" | |
| 436 | + } | |
| 437 | + } | |
| 438 | + } | |
| 439 | + } | |
| 440 | + } | |
| 441 | + } | |
| 442 | + }, | |
| 443 | + "enriched_attributes": { | |
| 444 | + "type": "nested", | |
| 445 | + "properties": { | |
| 446 | + "name": { | |
| 447 | + "type": "keyword" | |
| 448 | + }, | |
| 449 | + "value": { | |
| 450 | + "type": "object", | |
| 451 | + "properties": { | |
| 452 | + "zh": { | |
| 453 | + "type": "text", | |
| 454 | + "analyzer": "index_ik", | |
| 455 | + "search_analyzer": "query_ik", | |
| 456 | + "fields": { | |
| 457 | + "keyword": { | |
| 458 | + "type": "keyword", | |
| 459 | + "normalizer": "lowercase" | |
| 460 | + } | |
| 461 | + } | |
| 462 | + }, | |
| 463 | + "en": { | |
| 464 | + "type": "text", | |
| 465 | + "analyzer": "english", | |
| 466 | + "fields": { | |
| 467 | + "keyword": { | |
| 468 | + "type": "keyword", | |
| 469 | + "normalizer": "lowercase" | |
| 470 | + } | |
| 471 | + } | |
| 472 | + } | |
| 473 | + } | |
| 474 | + } | |
| 475 | + } | |
| 476 | + }, | |
| 477 | + "option1_name": { | |
| 478 | + "type": "keyword" | |
| 479 | + }, | |
| 480 | + "option2_name": { | |
| 481 | + "type": "keyword" | |
| 482 | + }, | |
| 483 | + "option3_name": { | |
| 484 | + "type": "keyword" | |
| 485 | + }, | |
| 486 | + "option1_values": { | |
| 487 | + "type": "object", | |
| 488 | + "properties": { | |
| 489 | + "zh": { | |
| 490 | + "type": "text", | |
| 491 | + "analyzer": "index_ik", | |
| 492 | + "search_analyzer": "query_ik", | |
| 493 | + "fields": { | |
| 494 | + "keyword": { | |
| 495 | + "type": "keyword", | |
| 496 | + "normalizer": "lowercase" | |
| 497 | + } | |
| 498 | + } | |
| 499 | + }, | |
| 500 | + "en": { | |
| 501 | + "type": "text", | |
| 502 | + "analyzer": "english", | |
| 503 | + "fields": { | |
| 504 | + "keyword": { | |
| 505 | + "type": "keyword", | |
| 506 | + "normalizer": "lowercase" | |
| 507 | + } | |
| 508 | + } | |
| 509 | + } | |
| 510 | + } | |
| 511 | + }, | |
| 512 | + "option2_values": { | |
| 513 | + "type": "object", | |
| 514 | + "properties": { | |
| 515 | + "zh": { | |
| 516 | + "type": "text", | |
| 517 | + "analyzer": "index_ik", | |
| 518 | + "search_analyzer": "query_ik", | |
| 519 | + "fields": { | |
| 520 | + "keyword": { | |
| 521 | + "type": "keyword", | |
| 522 | + "normalizer": "lowercase" | |
| 523 | + } | |
| 524 | + } | |
| 525 | + }, | |
| 526 | + "en": { | |
| 527 | + "type": "text", | |
| 528 | + "analyzer": "english", | |
| 529 | + "fields": { | |
| 530 | + "keyword": { | |
| 531 | + "type": "keyword", | |
| 532 | + "normalizer": "lowercase" | |
| 533 | + } | |
| 534 | + } | |
| 535 | + } | |
| 536 | + } | |
| 537 | + }, | |
| 538 | + "option3_values": { | |
| 539 | + "type": "object", | |
| 540 | + "properties": { | |
| 541 | + "zh": { | |
| 542 | + "type": "text", | |
| 543 | + "analyzer": "index_ik", | |
| 544 | + "search_analyzer": "query_ik", | |
| 545 | + "fields": { | |
| 546 | + "keyword": { | |
| 547 | + "type": "keyword", | |
| 548 | + "normalizer": "lowercase" | |
| 549 | + } | |
| 550 | + } | |
| 551 | + }, | |
| 552 | + "en": { | |
| 553 | + "type": "text", | |
| 554 | + "analyzer": "english", | |
| 555 | + "fields": { | |
| 556 | + "keyword": { | |
| 557 | + "type": "keyword", | |
| 558 | + "normalizer": "lowercase" | |
| 559 | + } | |
| 560 | + } | |
| 561 | + } | |
| 562 | + } | |
| 563 | + }, | |
| 564 | + "min_price": { | |
| 565 | + "type": "float" | |
| 566 | + }, | |
| 567 | + "max_price": { | |
| 568 | + "type": "float" | |
| 569 | + }, | |
| 570 | + "compare_at_price": { | |
| 571 | + "type": "float" | |
| 572 | + }, | |
| 573 | + "sku_prices": { | |
| 574 | + "type": "float" | |
| 575 | + }, | |
| 576 | + "sku_weights": { | |
| 577 | + "type": "long" | |
| 578 | + }, | |
| 579 | + "sku_weight_units": { | |
| 580 | + "type": "keyword" | |
| 581 | + }, | |
| 582 | + "total_inventory": { | |
| 583 | + "type": "long" | |
| 584 | + }, | |
| 585 | + "sales": { | |
| 586 | + "type": "long" | |
| 587 | + }, | |
| 588 | + "skus": { | |
| 589 | + "type": "nested", | |
| 590 | + "properties": { | |
| 591 | + "sku_id": { | |
| 592 | + "type": "keyword" | |
| 593 | + }, | |
| 594 | + "price": { | |
| 595 | + "type": "float" | |
| 596 | + }, | |
| 597 | + "compare_at_price": { | |
| 598 | + "type": "float" | |
| 599 | + }, | |
| 600 | + "sku_code": { | |
| 601 | + "type": "keyword" | |
| 602 | + }, | |
| 603 | + "stock": { | |
| 604 | + "type": "long" | |
| 605 | + }, | |
| 606 | + "weight": { | |
| 607 | + "type": "float" | |
| 608 | + }, | |
| 609 | + "weight_unit": { | |
| 610 | + "type": "keyword" | |
| 611 | + }, | |
| 612 | + "option1_value": { | |
| 613 | + "type": "keyword" | |
| 614 | + }, | |
| 615 | + "option2_value": { | |
| 616 | + "type": "keyword" | |
| 617 | + }, | |
| 618 | + "option3_value": { | |
| 619 | + "type": "keyword" | |
| 620 | + }, | |
| 621 | + "image_src": { | |
| 622 | + "type": "keyword", | |
| 623 | + "index": false | |
| 624 | + } | |
| 625 | + } | |
| 626 | + } | |
| 627 | + } | |
| 628 | + } | |
| 629 | +} | |
| 0 | 630 | \ No newline at end of file | ... | ... |
scripts/es_debug_search.py
| ... | ... | @@ -279,7 +279,9 @@ def _run_es( |
| 279 | 279 | body: Dict[str, Any], |
| 280 | 280 | size: int, |
| 281 | 281 | ) -> List[Dict[str, Any]]: |
| 282 | - resp = es.search(index=index_name, body=body, size=size) | |
| 282 | + # Avoid passing size= alongside body= (deprecated in elasticsearch-py). | |
| 283 | + payload = {**body, "size": size} | |
| 284 | + resp = es.search(index=index_name, body=payload) | |
| 283 | 285 | if hasattr(resp, "body"): |
| 284 | 286 | payload = resp.body |
| 285 | 287 | else: | ... | ... |
search/rerank_client.py
| ... | ... | @@ -239,22 +239,96 @@ def _collect_text_score_components(matched_queries: Any, fallback_es_score: floa |
| 239 | 239 | } |
| 240 | 240 | |
| 241 | 241 | |
| 242 | -def _multiply_fusion_factors( | |
| 243 | - rerank_score: float, | |
| 244 | - fine_score: Optional[float], | |
| 242 | +def _format_debug_float(value: float) -> str: | |
| 243 | + return f"{float(value):.6g}" | |
| 244 | + | |
| 245 | + | |
| 246 | +def _build_hit_signal_bundle( | |
| 247 | + hit: Dict[str, Any], | |
| 248 | + fusion: CoarseRankFusionConfig | RerankFusionConfig, | |
| 249 | +) -> Dict[str, Any]: | |
| 250 | + es_score = _to_score(hit.get("_score")) | |
| 251 | + matched_queries = hit.get("matched_queries") | |
| 252 | + text_components = _collect_text_score_components(matched_queries, es_score) | |
| 253 | + knn_components = _collect_knn_score_components(matched_queries, fusion) | |
| 254 | + return { | |
| 255 | + "doc_id": hit.get("_id"), | |
| 256 | + "es_score": es_score, | |
| 257 | + "matched_queries": matched_queries, | |
| 258 | + "text_components": text_components, | |
| 259 | + "knn_components": knn_components, | |
| 260 | + "text_score": text_components["text_score"], | |
| 261 | + "knn_score": knn_components["knn_score"], | |
| 262 | + } | |
| 263 | + | |
| 264 | + | |
| 265 | +def _build_formula_summary( | |
| 266 | + term_rows: List[Dict[str, Any]], | |
| 267 | + style_boost: float, | |
| 268 | + final_score: float, | |
| 269 | +) -> str: | |
| 270 | + segments = [ | |
| 271 | + ( | |
| 272 | + f"{row['name']}=(" | |
| 273 | + f"{_format_debug_float(row['raw_score'])}" | |
| 274 | + f"+{_format_debug_float(row['bias'])})" | |
| 275 | + f"^{_format_debug_float(row['exponent'])}" | |
| 276 | + f"={_format_debug_float(row['factor'])}" | |
| 277 | + ) | |
| 278 | + for row in term_rows | |
| 279 | + ] | |
| 280 | + if style_boost != 1.0: | |
| 281 | + segments.append(f"style_boost={_format_debug_float(style_boost)}") | |
| 282 | + segments.append(f"final={_format_debug_float(final_score)}") | |
| 283 | + return " | ".join(segments) | |
| 284 | + | |
| 285 | + | |
| 286 | +def _compute_multiplicative_fusion( | |
| 287 | + *, | |
| 245 | 288 | text_score: float, |
| 246 | 289 | knn_score: float, |
| 247 | 290 | fusion: RerankFusionConfig, |
| 248 | -) -> Tuple[float, float, float, float, float]: | |
| 249 | - """(rerank_factor, fine_factor, text_factor, knn_factor, fused_without_style_boost).""" | |
| 250 | - r = (max(rerank_score, 0.0) + fusion.rerank_bias) ** fusion.rerank_exponent | |
| 251 | - if fine_score is None: | |
| 252 | - f = 1.0 | |
| 253 | - else: | |
| 254 | - f = (max(fine_score, 0.0) + fusion.fine_bias) ** fusion.fine_exponent | |
| 255 | - t = (max(text_score, 0.0) + fusion.text_bias) ** fusion.text_exponent | |
| 256 | - k = (max(knn_score, 0.0) + fusion.knn_bias) ** fusion.knn_exponent | |
| 257 | - return r, f, t, k, r * f * t * k | |
| 291 | + rerank_score: Optional[float] = None, | |
| 292 | + fine_score: Optional[float] = None, | |
| 293 | + style_boost: float = 1.0, | |
| 294 | +) -> Dict[str, Any]: | |
| 295 | + term_rows: List[Dict[str, Any]] = [] | |
| 296 | + | |
| 297 | + def _add_term(name: str, raw_score: Optional[float], bias: float, exponent: float) -> None: | |
| 298 | + if raw_score is None: | |
| 299 | + return | |
| 300 | + factor = (max(float(raw_score), 0.0) + bias) ** exponent | |
| 301 | + term_rows.append( | |
| 302 | + { | |
| 303 | + "name": name, | |
| 304 | + "raw_score": float(raw_score), | |
| 305 | + "bias": float(bias), | |
| 306 | + "exponent": float(exponent), | |
| 307 | + "factor": factor, | |
| 308 | + } | |
| 309 | + ) | |
| 310 | + | |
| 311 | + _add_term("rerank_score", rerank_score, fusion.rerank_bias, fusion.rerank_exponent) | |
| 312 | + _add_term("fine_score", fine_score, fusion.fine_bias, fusion.fine_exponent) | |
| 313 | + _add_term("text_score", text_score, fusion.text_bias, fusion.text_exponent) | |
| 314 | + _add_term("knn_score", knn_score, fusion.knn_bias, fusion.knn_exponent) | |
| 315 | + | |
| 316 | + fused = 1.0 | |
| 317 | + factors: Dict[str, float] = {} | |
| 318 | + inputs: Dict[str, float] = {} | |
| 319 | + for row in term_rows: | |
| 320 | + fused *= row["factor"] | |
| 321 | + factors[row["name"]] = row["factor"] | |
| 322 | + inputs[row["name"]] = row["raw_score"] | |
| 323 | + fused *= style_boost | |
| 324 | + factors["style_boost"] = style_boost | |
| 325 | + | |
| 326 | + return { | |
| 327 | + "inputs": inputs, | |
| 328 | + "factors": factors, | |
| 329 | + "score": fused, | |
| 330 | + "summary": _build_formula_summary(term_rows, style_boost, fused), | |
| 331 | + } | |
| 258 | 332 | |
| 259 | 333 | |
| 260 | 334 | def _multiply_coarse_fusion_factors( |
| ... | ... | @@ -283,12 +357,13 @@ def coarse_resort_hits( |
| 283 | 357 | f = fusion or CoarseRankFusionConfig() |
| 284 | 358 | coarse_debug: List[Dict[str, Any]] = [] if debug else [] |
| 285 | 359 | for hit in es_hits: |
| 286 | - es_score = _to_score(hit.get("_score")) | |
| 287 | - matched_queries = hit.get("matched_queries") | |
| 288 | - knn_components = _collect_knn_score_components(matched_queries, f) | |
| 289 | - text_components = _collect_text_score_components(matched_queries, es_score) | |
| 290 | - text_score = text_components["text_score"] | |
| 291 | - knn_score = knn_components["knn_score"] | |
| 360 | + signal_bundle = _build_hit_signal_bundle(hit, f) | |
| 361 | + es_score = signal_bundle["es_score"] | |
| 362 | + matched_queries = signal_bundle["matched_queries"] | |
| 363 | + text_components = signal_bundle["text_components"] | |
| 364 | + knn_components = signal_bundle["knn_components"] | |
| 365 | + text_score = signal_bundle["text_score"] | |
| 366 | + knn_score = signal_bundle["knn_score"] | |
| 292 | 367 | text_factor, knn_factor, coarse_score = _multiply_coarse_fusion_factors( |
| 293 | 368 | text_score=text_score, |
| 294 | 369 | knn_score=knn_score, |
| ... | ... | @@ -372,77 +447,81 @@ def fuse_scores_and_resort( |
| 372 | 447 | n = len(es_hits) |
| 373 | 448 | if n == 0 or len(rerank_scores) != n: |
| 374 | 449 | return [] |
| 375 | - if fine_scores is not None and len(fine_scores) != n: | |
| 376 | - fine_scores = None | |
| 377 | - | |
| 378 | 450 | f = fusion or RerankFusionConfig() |
| 379 | 451 | fused_debug: List[Dict[str, Any]] = [] if debug else [] |
| 380 | 452 | |
| 381 | 453 | for idx, hit in enumerate(es_hits): |
| 382 | - es_score = _to_score(hit.get("_score")) | |
| 454 | + signal_bundle = _build_hit_signal_bundle(hit, f) | |
| 455 | + text_components = signal_bundle["text_components"] | |
| 456 | + knn_components = signal_bundle["knn_components"] | |
| 457 | + text_score = signal_bundle["text_score"] | |
| 458 | + knn_score = signal_bundle["knn_score"] | |
| 383 | 459 | rerank_score = _to_score(rerank_scores[idx]) |
| 384 | - fine_score = _to_score(fine_scores[idx]) if fine_scores is not None else _to_score(hit.get("_fine_score")) | |
| 385 | - matched_queries = hit.get("matched_queries") | |
| 386 | - knn_components = _collect_knn_score_components(matched_queries, f) | |
| 387 | - knn_score = knn_components["knn_score"] | |
| 388 | - text_components = _collect_text_score_components(matched_queries, es_score) | |
| 389 | - text_score = text_components["text_score"] | |
| 390 | - rerank_factor, fine_factor, text_factor, knn_factor, fused = _multiply_fusion_factors( | |
| 391 | - rerank_score, fine_score if fine_scores is not None or "_fine_score" in hit else None, text_score, knn_score, f | |
| 460 | + fine_score_raw = ( | |
| 461 | + _to_score(fine_scores[idx]) | |
| 462 | + if fine_scores is not None and len(fine_scores) == n | |
| 463 | + else _to_score(hit.get("_fine_score")) | |
| 392 | 464 | ) |
| 465 | + fine_score = fine_score_raw if (fine_scores is not None and len(fine_scores) == n) or "_fine_score" in hit else None | |
| 393 | 466 | sku_selected = _has_selected_sku(hit) |
| 394 | 467 | style_boost = style_intent_selected_sku_boost if sku_selected else 1.0 |
| 395 | - fused *= style_boost | |
| 468 | + fusion_result = _compute_multiplicative_fusion( | |
| 469 | + rerank_score=rerank_score, | |
| 470 | + fine_score=fine_score, | |
| 471 | + text_score=text_score, | |
| 472 | + knn_score=knn_score, | |
| 473 | + fusion=f, | |
| 474 | + style_boost=style_boost, | |
| 475 | + ) | |
| 476 | + fused = fusion_result["score"] | |
| 396 | 477 | |
| 397 | 478 | hit["_original_score"] = hit.get("_score") |
| 398 | 479 | hit["_rerank_score"] = rerank_score |
| 399 | - hit["_fine_score"] = fine_score | |
| 480 | + if fine_score is not None: | |
| 481 | + hit["_fine_score"] = fine_score | |
| 400 | 482 | hit["_text_score"] = text_score |
| 401 | 483 | hit["_knn_score"] = knn_score |
| 402 | 484 | hit["_text_knn_score"] = knn_components["text_knn_score"] |
| 403 | 485 | hit["_image_knn_score"] = knn_components["image_knn_score"] |
| 404 | 486 | hit["_fused_score"] = fused |
| 405 | 487 | hit["_style_intent_selected_sku_boost"] = style_boost |
| 406 | - if debug: | |
| 407 | - hit["_text_source_score"] = text_components["source_score"] | |
| 408 | - hit["_text_translation_score"] = text_components["translation_score"] | |
| 409 | - hit["_text_primary_score"] = text_components["primary_text_score"] | |
| 410 | - hit["_text_support_score"] = text_components["support_text_score"] | |
| 411 | - hit["_knn_primary_score"] = knn_components["primary_knn_score"] | |
| 412 | - hit["_knn_support_score"] = knn_components["support_knn_score"] | |
| 413 | 488 | |
| 414 | 489 | if debug: |
| 415 | 490 | debug_entry = { |
| 416 | 491 | "doc_id": hit.get("_id"), |
| 417 | - "es_score": es_score, | |
| 492 | + "score": fused, | |
| 493 | + "es_score": signal_bundle["es_score"], | |
| 418 | 494 | "rerank_score": rerank_score, |
| 419 | 495 | "fine_score": fine_score, |
| 420 | 496 | "text_score": text_score, |
| 497 | + "knn_score": knn_score, | |
| 498 | + "fusion_inputs": fusion_result["inputs"], | |
| 499 | + "fusion_factors": fusion_result["factors"], | |
| 500 | + "fusion_summary": fusion_result["summary"], | |
| 421 | 501 | "text_source_score": text_components["source_score"], |
| 422 | 502 | "text_translation_score": text_components["translation_score"], |
| 423 | 503 | "text_weighted_source_score": text_components["weighted_source_score"], |
| 424 | 504 | "text_weighted_translation_score": text_components["weighted_translation_score"], |
| 425 | 505 | "text_primary_score": text_components["primary_text_score"], |
| 426 | 506 | "text_support_score": text_components["support_text_score"], |
| 427 | - "text_score_fallback_to_es": ( | |
| 428 | - text_score == es_score | |
| 429 | - and text_components["source_score"] <= 0.0 | |
| 430 | - and text_components["translation_score"] <= 0.0 | |
| 431 | - ), | |
| 432 | 507 | "text_knn_score": knn_components["text_knn_score"], |
| 433 | 508 | "image_knn_score": knn_components["image_knn_score"], |
| 434 | 509 | "weighted_text_knn_score": knn_components["weighted_text_knn_score"], |
| 435 | 510 | "weighted_image_knn_score": knn_components["weighted_image_knn_score"], |
| 436 | 511 | "knn_primary_score": knn_components["primary_knn_score"], |
| 437 | 512 | "knn_support_score": knn_components["support_knn_score"], |
| 438 | - "knn_score": knn_score, | |
| 439 | - "rerank_factor": rerank_factor, | |
| 440 | - "fine_factor": fine_factor, | |
| 441 | - "text_factor": text_factor, | |
| 442 | - "knn_factor": knn_factor, | |
| 513 | + "text_score_fallback_to_es": ( | |
| 514 | + text_score == signal_bundle["es_score"] | |
| 515 | + and text_components["source_score"] <= 0.0 | |
| 516 | + and text_components["translation_score"] <= 0.0 | |
| 517 | + ), | |
| 518 | + "rerank_factor": fusion_result["factors"].get("rerank_score"), | |
| 519 | + "fine_factor": fusion_result["factors"].get("fine_score"), | |
| 520 | + "text_factor": fusion_result["factors"].get("text_score"), | |
| 521 | + "knn_factor": fusion_result["factors"].get("knn_score"), | |
| 443 | 522 | "style_intent_selected_sku": sku_selected, |
| 444 | 523 | "style_intent_selected_sku_boost": style_boost, |
| 445 | - "matched_queries": matched_queries, | |
| 524 | + "matched_queries": signal_bundle["matched_queries"], | |
| 446 | 525 | "fused_score": fused, |
| 447 | 526 | } |
| 448 | 527 | if rerank_debug_rows is not None and idx < len(rerank_debug_rows): |
| ... | ... | @@ -530,9 +609,11 @@ def run_lightweight_rerank( |
| 530 | 609 | rerank_doc_template: str = "{title}", |
| 531 | 610 | top_n: Optional[int] = None, |
| 532 | 611 | debug: bool = False, |
| 612 | + fusion: Optional[RerankFusionConfig] = None, | |
| 613 | + style_intent_selected_sku_boost: float = 1.2, | |
| 533 | 614 | service_profile: Optional[str] = "fine", |
| 534 | 615 | ) -> Tuple[Optional[List[float]], Optional[Dict[str, Any]], List[Dict[str, Any]]]: |
| 535 | - """Call lightweight reranker and attach scores to hits without final fusion.""" | |
| 616 | + """Call lightweight reranker and rank by lightweight-model fusion.""" | |
| 536 | 617 | if not es_hits: |
| 537 | 618 | return [], {}, [] |
| 538 | 619 | |
| ... | ... | @@ -554,18 +635,50 @@ def run_lightweight_rerank( |
| 554 | 635 | if scores is None or len(scores) != len(es_hits): |
| 555 | 636 | return None, None, [] |
| 556 | 637 | |
| 638 | + f = fusion or RerankFusionConfig() | |
| 557 | 639 | debug_rows: List[Dict[str, Any]] = [] if debug else [] |
| 558 | 640 | for idx, hit in enumerate(es_hits): |
| 641 | + signal_bundle = _build_hit_signal_bundle(hit, f) | |
| 642 | + text_score = signal_bundle["text_score"] | |
| 643 | + knn_score = signal_bundle["knn_score"] | |
| 559 | 644 | fine_score = _to_score(scores[idx]) |
| 645 | + sku_selected = _has_selected_sku(hit) | |
| 646 | + style_boost = style_intent_selected_sku_boost if sku_selected else 1.0 | |
| 647 | + fusion_result = _compute_multiplicative_fusion( | |
| 648 | + fine_score=fine_score, | |
| 649 | + text_score=text_score, | |
| 650 | + knn_score=knn_score, | |
| 651 | + fusion=f, | |
| 652 | + style_boost=style_boost, | |
| 653 | + ) | |
| 654 | + | |
| 560 | 655 | hit["_fine_score"] = fine_score |
| 656 | + hit["_fine_fused_score"] = fusion_result["score"] | |
| 657 | + hit["_text_score"] = text_score | |
| 658 | + hit["_knn_score"] = knn_score | |
| 659 | + hit["_text_knn_score"] = signal_bundle["knn_components"]["text_knn_score"] | |
| 660 | + hit["_image_knn_score"] = signal_bundle["knn_components"]["image_knn_score"] | |
| 661 | + hit["_style_intent_selected_sku_boost"] = style_boost | |
| 662 | + | |
| 561 | 663 | if debug: |
| 562 | 664 | row: Dict[str, Any] = { |
| 563 | 665 | "doc_id": hit.get("_id"), |
| 666 | + "score": fusion_result["score"], | |
| 564 | 667 | "fine_score": fine_score, |
| 668 | + "text_score": text_score, | |
| 669 | + "knn_score": knn_score, | |
| 670 | + "fusion_inputs": fusion_result["inputs"], | |
| 671 | + "fusion_factors": fusion_result["factors"], | |
| 672 | + "fusion_summary": fusion_result["summary"], | |
| 673 | + "fine_factor": fusion_result["factors"].get("fine_score"), | |
| 674 | + "text_factor": fusion_result["factors"].get("text_score"), | |
| 675 | + "knn_factor": fusion_result["factors"].get("knn_score"), | |
| 676 | + "style_intent_selected_sku": sku_selected, | |
| 677 | + "style_intent_selected_sku_boost": style_boost, | |
| 565 | 678 | } |
| 566 | 679 | if rerank_debug_rows is not None and idx < len(rerank_debug_rows): |
| 567 | 680 | row["rerank_input"] = rerank_debug_rows[idx] |
| 568 | 681 | debug_rows.append(row) |
| 569 | 682 | |
| 570 | - es_hits.sort(key=lambda h: h.get("_fine_score", 0.0), reverse=True) | |
| 683 | + es_hits.sort(key=lambda h: h.get("_fine_fused_score", h.get("_fine_score", 0.0)), reverse=True) | |
| 571 | 684 | return scores, meta, debug_rows | ... | ... |
search/searcher.py
| ... | ... | @@ -720,6 +720,8 @@ class Searcher: |
| 720 | 720 | rerank_doc_template=fine_doc_template, |
| 721 | 721 | top_n=fine_output_window, |
| 722 | 722 | debug=debug, |
| 723 | + fusion=rc.fusion, | |
| 724 | + style_intent_selected_sku_boost=self.config.query_config.style_intent_selected_sku_boost, | |
| 723 | 725 | service_profile=fine_cfg.service_profile, |
| 724 | 726 | ) |
| 725 | 727 | if fine_scores is not None: |
| ... | ... | @@ -745,6 +747,7 @@ class Searcher: |
| 745 | 747 | "docs_out": len(hits), |
| 746 | 748 | "top_n": fine_output_window, |
| 747 | 749 | "meta": fine_meta, |
| 750 | + "fusion": asdict(rc.fusion), | |
| 748 | 751 | } |
| 749 | 752 | context.store_intermediate_result("fine_rank_scores", fine_debug_rows) |
| 750 | 753 | context.logger.info( |
| ... | ... | @@ -781,7 +784,6 @@ class Searcher: |
| 781 | 784 | top_n=(from_ + size), |
| 782 | 785 | debug=debug, |
| 783 | 786 | fusion=rc.fusion, |
| 784 | - fine_scores=fine_scores[:len(final_input)] if fine_scores is not None else None, | |
| 785 | 787 | service_profile=rc.service_profile, |
| 786 | 788 | style_intent_selected_sku_boost=self.config.query_config.style_intent_selected_sku_boost, |
| 787 | 789 | ) |
| ... | ... | @@ -1026,18 +1028,14 @@ class Searcher: |
| 1026 | 1028 | # 若存在重排调试信息,则补充 doc 级别的融合分数信息 |
| 1027 | 1029 | if rerank_debug: |
| 1028 | 1030 | debug_entry["doc_id"] = rerank_debug.get("doc_id") |
| 1029 | - # 与 rerank_client 中字段保持一致,便于前端直接使用 | |
| 1031 | + debug_entry["score"] = rerank_debug.get("score") | |
| 1030 | 1032 | debug_entry["rerank_score"] = rerank_debug.get("rerank_score") |
| 1031 | 1033 | debug_entry["fine_score"] = rerank_debug.get("fine_score") |
| 1032 | 1034 | debug_entry["text_score"] = rerank_debug.get("text_score") |
| 1033 | - debug_entry["text_source_score"] = rerank_debug.get("text_source_score") | |
| 1034 | - debug_entry["text_translation_score"] = rerank_debug.get("text_translation_score") | |
| 1035 | - debug_entry["text_weighted_source_score"] = rerank_debug.get("text_weighted_source_score") | |
| 1036 | - debug_entry["text_weighted_translation_score"] = rerank_debug.get("text_weighted_translation_score") | |
| 1037 | - debug_entry["text_primary_score"] = rerank_debug.get("text_primary_score") | |
| 1038 | - debug_entry["text_support_score"] = rerank_debug.get("text_support_score") | |
| 1039 | - debug_entry["text_score_fallback_to_es"] = rerank_debug.get("text_score_fallback_to_es") | |
| 1040 | 1035 | debug_entry["knn_score"] = rerank_debug.get("knn_score") |
| 1036 | + debug_entry["fusion_inputs"] = rerank_debug.get("fusion_inputs") | |
| 1037 | + debug_entry["fusion_factors"] = rerank_debug.get("fusion_factors") | |
| 1038 | + debug_entry["fusion_summary"] = rerank_debug.get("fusion_summary") | |
| 1041 | 1039 | debug_entry["rerank_factor"] = rerank_debug.get("rerank_factor") |
| 1042 | 1040 | debug_entry["fine_factor"] = rerank_debug.get("fine_factor") |
| 1043 | 1041 | debug_entry["text_factor"] = rerank_debug.get("text_factor") |
| ... | ... | @@ -1047,7 +1045,13 @@ class Searcher: |
| 1047 | 1045 | debug_entry["matched_queries"] = rerank_debug.get("matched_queries") |
| 1048 | 1046 | elif fine_debug: |
| 1049 | 1047 | debug_entry["doc_id"] = fine_debug.get("doc_id") |
| 1048 | + debug_entry["score"] = fine_debug.get("score") | |
| 1050 | 1049 | debug_entry["fine_score"] = fine_debug.get("fine_score") |
| 1050 | + debug_entry["text_score"] = fine_debug.get("text_score") | |
| 1051 | + debug_entry["knn_score"] = fine_debug.get("knn_score") | |
| 1052 | + debug_entry["fusion_inputs"] = fine_debug.get("fusion_inputs") | |
| 1053 | + debug_entry["fusion_factors"] = fine_debug.get("fusion_factors") | |
| 1054 | + debug_entry["fusion_summary"] = fine_debug.get("fusion_summary") | |
| 1051 | 1055 | debug_entry["rerank_input"] = fine_debug.get("rerank_input") |
| 1052 | 1056 | |
| 1053 | 1057 | initial_rank = initial_ranks_by_doc.get(str(doc_id)) if doc_id is not None else None |
| ... | ... | @@ -1081,17 +1085,32 @@ class Searcher: |
| 1081 | 1085 | "fine_rank": { |
| 1082 | 1086 | "rank": fine_rank, |
| 1083 | 1087 | "rank_change": _rank_change(coarse_rank, fine_rank), |
| 1084 | - "score": fine_debug.get("fine_score") if fine_debug else hit.get("_fine_score"), | |
| 1088 | + "score": ( | |
| 1089 | + fine_debug.get("score") | |
| 1090 | + if fine_debug and fine_debug.get("score") is not None | |
| 1091 | + else hit.get("_fine_fused_score", hit.get("_fine_score")) | |
| 1092 | + ), | |
| 1093 | + "fine_score": fine_debug.get("fine_score") if fine_debug else hit.get("_fine_score"), | |
| 1094 | + "text_score": fine_debug.get("text_score") if fine_debug else hit.get("_text_score"), | |
| 1095 | + "knn_score": fine_debug.get("knn_score") if fine_debug else hit.get("_knn_score"), | |
| 1096 | + "fusion_summary": fine_debug.get("fusion_summary") if fine_debug else None, | |
| 1097 | + "fusion_inputs": fine_debug.get("fusion_inputs") if fine_debug else None, | |
| 1098 | + "fusion_factors": fine_debug.get("fusion_factors") if fine_debug else None, | |
| 1085 | 1099 | "rerank_input": fine_debug.get("rerank_input") if fine_debug else None, |
| 1100 | + "signals": fine_debug, | |
| 1086 | 1101 | }, |
| 1087 | 1102 | "rerank": { |
| 1088 | 1103 | "rank": rerank_rank, |
| 1089 | 1104 | "rank_change": _rank_change(fine_rank, rerank_rank), |
| 1105 | + "score": rerank_debug.get("score") if rerank_debug else hit.get("_fused_score"), | |
| 1090 | 1106 | "rerank_score": rerank_debug.get("rerank_score") if rerank_debug else hit.get("_rerank_score"), |
| 1091 | 1107 | "fine_score": rerank_debug.get("fine_score") if rerank_debug else hit.get("_fine_score"), |
| 1092 | 1108 | "fused_score": rerank_debug.get("fused_score") if rerank_debug else hit.get("_fused_score"), |
| 1093 | 1109 | "text_score": rerank_debug.get("text_score") if rerank_debug else hit.get("_text_score"), |
| 1094 | 1110 | "knn_score": rerank_debug.get("knn_score") if rerank_debug else hit.get("_knn_score"), |
| 1111 | + "fusion_summary": rerank_debug.get("fusion_summary") if rerank_debug else None, | |
| 1112 | + "fusion_inputs": rerank_debug.get("fusion_inputs") if rerank_debug else None, | |
| 1113 | + "fusion_factors": rerank_debug.get("fusion_factors") if rerank_debug else None, | |
| 1095 | 1114 | "rerank_factor": rerank_debug.get("rerank_factor") if rerank_debug else None, |
| 1096 | 1115 | "fine_factor": rerank_debug.get("fine_factor") if rerank_debug else None, |
| 1097 | 1116 | "text_factor": rerank_debug.get("text_factor") if rerank_debug else None, | ... | ... |
suggestion/builder.py
| ... | ... | @@ -166,6 +166,29 @@ class SuggestionIndexBuilder: |
| 166 | 166 | out = [p.strip() for p in parts if p and p.strip()] |
| 167 | 167 | return out if out else [s] |
| 168 | 168 | |
| 169 | + def _iter_multilang_product_tags( | |
| 170 | + self, | |
| 171 | + raw: Any, | |
| 172 | + index_languages: List[str], | |
| 173 | + primary_language: str, | |
| 174 | + ) -> List[Tuple[str, str]]: | |
| 175 | + if isinstance(raw, dict): | |
| 176 | + pairs: List[Tuple[str, str]] = [] | |
| 177 | + for lang in index_languages: | |
| 178 | + for tag in self._iter_product_tags(raw.get(lang)): | |
| 179 | + pairs.append((lang, tag)) | |
| 180 | + return pairs | |
| 181 | + | |
| 182 | + pairs = [] | |
| 183 | + for tag in self._iter_product_tags(raw): | |
| 184 | + tag_lang, _, _ = detect_text_language_for_suggestions( | |
| 185 | + tag, | |
| 186 | + index_languages=index_languages, | |
| 187 | + primary_language=primary_language, | |
| 188 | + ) | |
| 189 | + pairs.append((tag_lang, tag)) | |
| 190 | + return pairs | |
| 191 | + | |
| 169 | 192 | @staticmethod |
| 170 | 193 | def _looks_noise(text_value: str) -> bool: |
| 171 | 194 | if not text_value: |
| ... | ... | @@ -487,12 +510,11 @@ class SuggestionIndexBuilder: |
| 487 | 510 | key_to_candidate[key] = c |
| 488 | 511 | c.add_product("qanchor", spu_id=product_id) |
| 489 | 512 | |
| 490 | - for tag in self._iter_product_tags(src.get("tags")): | |
| 491 | - tag_lang, _, _ = detect_text_language_for_suggestions( | |
| 492 | - tag, | |
| 493 | - index_languages=index_languages, | |
| 494 | - primary_language=primary_language, | |
| 495 | - ) | |
| 513 | + for tag_lang, tag in self._iter_multilang_product_tags( | |
| 514 | + src.get("tags"), | |
| 515 | + index_languages=index_languages, | |
| 516 | + primary_language=primary_language, | |
| 517 | + ): | |
| 496 | 518 | text_norm = self._normalize_text(tag) |
| 497 | 519 | if self._looks_noise(text_norm): |
| 498 | 520 | continue | ... | ... |
tests/ci/test_service_api_contracts.py
| ... | ... | @@ -345,33 +345,25 @@ def test_indexer_build_docs_from_db_contract(indexer_client: TestClient): |
| 345 | 345 | def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch): |
| 346 | 346 | import indexer.product_enrich as process_products |
| 347 | 347 | |
| 348 | - def _fake_analyze_products( | |
| 349 | - products: List[Dict[str, str]], | |
| 350 | - target_lang: str = "zh", | |
| 351 | - batch_size: int | None = None, | |
| 352 | - tenant_id: str | None = None, | |
| 353 | - ): | |
| 354 | - assert batch_size == 20 | |
| 348 | + def _fake_build_index_content_fields(items: List[Dict[str, str]], tenant_id: str | None = None): | |
| 349 | + assert tenant_id == "162" | |
| 355 | 350 | return [ |
| 356 | 351 | { |
| 357 | - "id": p["id"], | |
| 358 | - "lang": target_lang, | |
| 359 | - "title_input": p["title"], | |
| 360 | - "title": p["title"], | |
| 361 | - "category_path": "", | |
| 362 | - "tags": "tag1,tag2", | |
| 363 | - "target_audience": "", | |
| 364 | - "usage_scene": "", | |
| 365 | - "season": "", | |
| 366 | - "key_attributes": "", | |
| 367 | - "material": "", | |
| 368 | - "features": "", | |
| 369 | - "anchor_text": f"{target_lang}-anchor-{p['id']}", | |
| 352 | + "id": p["spu_id"], | |
| 353 | + "qanchors": { | |
| 354 | + "zh": [f"zh-anchor-{p['spu_id']}"], | |
| 355 | + "en": [f"en-anchor-{p['spu_id']}"], | |
| 356 | + }, | |
| 357 | + "enriched_tags": {"zh": ["tag1", "tag2"], "en": ["tag1", "tag2"]}, | |
| 358 | + "enriched_attributes": [ | |
| 359 | + {"name": "enriched_tags", "value": {"zh": "tag1"}}, | |
| 360 | + {"name": "enriched_tags", "value": {"en": "tag1"}}, | |
| 361 | + ], | |
| 370 | 362 | } |
| 371 | - for p in products | |
| 363 | + for p in items | |
| 372 | 364 | ] |
| 373 | 365 | |
| 374 | - monkeypatch.setattr(process_products, "analyze_products", _fake_analyze_products) | |
| 366 | + monkeypatch.setattr(process_products, "build_index_content_fields", _fake_build_index_content_fields) | |
| 375 | 367 | |
| 376 | 368 | response = indexer_client.post( |
| 377 | 369 | "/indexer/enrich-content", |
| ... | ... | @@ -381,7 +373,6 @@ def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch |
| 381 | 373 | {"spu_id": "1001", "title": "T-shirt"}, |
| 382 | 374 | {"spu_id": "1002", "title": "Toy"}, |
| 383 | 375 | ], |
| 384 | - "languages": ["zh", "en"], | |
| 385 | 376 | }, |
| 386 | 377 | ) |
| 387 | 378 | assert response.status_code == 200 |
| ... | ... | @@ -390,9 +381,14 @@ def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch |
| 390 | 381 | assert data["total"] == 2 |
| 391 | 382 | assert len(data["results"]) == 2 |
| 392 | 383 | assert data["results"][0]["spu_id"] == "1001" |
| 393 | - assert data["results"][0]["qanchors"]["zh"] == "zh-anchor-1001" | |
| 394 | - assert data["results"][0]["qanchors"]["en"] == "en-anchor-1001" | |
| 395 | - assert "tag1" in data["results"][0]["tags"] | |
| 384 | + assert data["results"][0]["qanchors"]["zh"] == ["zh-anchor-1001"] | |
| 385 | + assert data["results"][0]["qanchors"]["en"] == ["en-anchor-1001"] | |
| 386 | + assert data["results"][0]["enriched_tags"]["zh"] == ["tag1", "tag2"] | |
| 387 | + assert data["results"][0]["enriched_tags"]["en"] == ["tag1", "tag2"] | |
| 388 | + assert data["results"][0]["enriched_attributes"][0] == { | |
| 389 | + "name": "enriched_tags", | |
| 390 | + "value": {"zh": "tag1"}, | |
| 391 | + } | |
| 396 | 392 | |
| 397 | 393 | |
| 398 | 394 | def test_indexer_documents_contract(indexer_client: TestClient): |
| ... | ... | @@ -515,7 +511,6 @@ def test_indexer_enrich_content_validation_max_items(indexer_client: TestClient) |
| 515 | 511 | json={ |
| 516 | 512 | "tenant_id": "162", |
| 517 | 513 | "items": [{"spu_id": str(i), "title": "x"} for i in range(51)], |
| 518 | - "languages": ["zh"], | |
| 519 | 514 | }, |
| 520 | 515 | ) |
| 521 | 516 | assert response.status_code == 400 | ... | ... |
tests/test_llm_enrichment_batch_fill.py
| ... | ... | @@ -7,33 +7,30 @@ import pandas as pd |
| 7 | 7 | from indexer.document_transformer import SPUDocumentTransformer |
| 8 | 8 | |
| 9 | 9 | |
| 10 | -def test_fill_llm_attributes_batch_calls_analyze_in_batches(monkeypatch): | |
| 10 | +def test_fill_llm_attributes_batch_uses_product_enrich_helper(monkeypatch): | |
| 11 | 11 | seen_calls: List[Dict[str, Any]] = [] |
| 12 | 12 | |
| 13 | - def _fake_analyze_products(products, target_lang="zh", batch_size=None, tenant_id=None): | |
| 14 | - # should always request batch_size=20 and pass full list; internal splitter handles >20 | |
| 15 | - seen_calls.append( | |
| 16 | - { | |
| 17 | - "n": len(products), | |
| 18 | - "target_lang": target_lang, | |
| 19 | - "batch_size": batch_size, | |
| 20 | - "tenant_id": tenant_id, | |
| 21 | - } | |
| 22 | - ) | |
| 13 | + def _fake_build_index_content_fields(items, tenant_id=None): | |
| 14 | + seen_calls.append({"n": len(items), "tenant_id": tenant_id}) | |
| 23 | 15 | return [ |
| 24 | 16 | { |
| 25 | - "id": p["id"], | |
| 26 | - "lang": target_lang, | |
| 27 | - "title_input": p["title"], | |
| 28 | - "tags": "t1,t2", | |
| 29 | - "anchor_text": f"{target_lang}-anchor-{p['id']}", | |
| 17 | + "id": item["id"], | |
| 18 | + "qanchors": { | |
| 19 | + "zh": [f"zh-anchor-{item['id']}"], | |
| 20 | + "en": [f"en-anchor-{item['id']}"], | |
| 21 | + }, | |
| 22 | + "tags": {"zh": ["t1", "t2"], "en": ["t1", "t2"]}, | |
| 23 | + "enriched_attributes": [ | |
| 24 | + {"name": "tags", "value": {"zh": "t1"}}, | |
| 25 | + {"name": "tags", "value": {"en": "t1"}}, | |
| 26 | + ], | |
| 30 | 27 | } |
| 31 | - for p in products | |
| 28 | + for item in items | |
| 32 | 29 | ] |
| 33 | 30 | |
| 34 | 31 | import indexer.document_transformer as doc_tr |
| 35 | 32 | |
| 36 | - monkeypatch.setattr(doc_tr, "analyze_products", _fake_analyze_products) | |
| 33 | + monkeypatch.setattr(doc_tr, "build_index_content_fields", _fake_build_index_content_fields) | |
| 37 | 34 | |
| 38 | 35 | transformer = SPUDocumentTransformer( |
| 39 | 36 | category_id_to_name={}, |
| ... | ... | @@ -54,11 +51,11 @@ def test_fill_llm_attributes_batch_calls_analyze_in_batches(monkeypatch): |
| 54 | 51 | |
| 55 | 52 | transformer.fill_llm_attributes_batch(docs, rows) |
| 56 | 53 | |
| 57 | - # called once per language, with full list; analyze_products handles splitting | |
| 58 | - assert seen_calls == [ | |
| 59 | - {"n": 45, "target_lang": "zh", "batch_size": 20, "tenant_id": "162"}, | |
| 60 | - {"n": 45, "target_lang": "en", "batch_size": 20, "tenant_id": "162"}, | |
| 61 | - ] | |
| 54 | + assert seen_calls == [{"n": 45, "tenant_id": "162"}] | |
| 62 | 55 | |
| 63 | - assert docs[0]["qanchors"]["zh"] == "zh-anchor-0" | |
| 64 | - assert docs[0]["qanchors"]["en"] == "en-anchor-0" | |
| 56 | + assert docs[0]["qanchors"]["zh"] == ["zh-anchor-0"] | |
| 57 | + assert docs[0]["qanchors"]["en"] == ["en-anchor-0"] | |
| 58 | + assert docs[0]["tags"]["zh"] == ["t1", "t2"] | |
| 59 | + assert docs[0]["tags"]["en"] == ["t1", "t2"] | |
| 60 | + assert {"name": "tags", "value": {"zh": "t1"}} in docs[0]["enriched_attributes"] | |
| 61 | + assert {"name": "tags", "value": {"en": "t1"}} in docs[0]["enriched_attributes"] | ... | ... |
tests/test_rerank_client.py
| 1 | 1 | from math import isclose |
| 2 | 2 | |
| 3 | 3 | from config.schema import RerankFusionConfig |
| 4 | -from search.rerank_client import fuse_scores_and_resort | |
| 4 | +from search.rerank_client import fuse_scores_and_resort, run_lightweight_rerank | |
| 5 | 5 | |
| 6 | 6 | |
| 7 | 7 | def test_fuse_scores_and_resort_aggregates_text_components_and_keeps_rerank_primary(): |
| ... | ... | @@ -204,3 +204,57 @@ def test_fuse_scores_and_resort_applies_knn_dismax_weights_and_tie_breaker(): |
| 204 | 204 | assert isclose(debug[0]["weighted_image_knn_score"], 0.5, rel_tol=1e-9) |
| 205 | 205 | assert isclose(debug[0]["knn_primary_score"], 0.8, rel_tol=1e-9) |
| 206 | 206 | assert isclose(debug[0]["knn_support_score"], 0.5, rel_tol=1e-9) |
| 207 | + | |
| 208 | + | |
| 209 | +def test_run_lightweight_rerank_sorts_by_fused_stage_score(monkeypatch): | |
| 210 | + hits = [ | |
| 211 | + { | |
| 212 | + "_id": "fine-raw-better", | |
| 213 | + "_score": 1.0, | |
| 214 | + "_source": {"title": {"en": "Alpha"}}, | |
| 215 | + "matched_queries": {"base_query": 0.5, "knn_query": 0.0}, | |
| 216 | + }, | |
| 217 | + { | |
| 218 | + "_id": "fusion-better", | |
| 219 | + "_score": 1.0, | |
| 220 | + "_source": {"title": {"en": "Beta"}}, | |
| 221 | + "matched_queries": {"base_query": 40.0, "knn_query": 0.0}, | |
| 222 | + }, | |
| 223 | + ] | |
| 224 | + | |
| 225 | + monkeypatch.setattr( | |
| 226 | + "search.rerank_client.call_rerank_service", | |
| 227 | + lambda *args, **kwargs: ([0.9, 0.8], {"model": "fine-bge"}), | |
| 228 | + ) | |
| 229 | + | |
| 230 | + scores, meta, debug_rows = run_lightweight_rerank( | |
| 231 | + query="toy", | |
| 232 | + es_hits=hits, | |
| 233 | + language="en", | |
| 234 | + debug=True, | |
| 235 | + ) | |
| 236 | + | |
| 237 | + assert scores == [0.9, 0.8] | |
| 238 | + assert meta == {"model": "fine-bge"} | |
| 239 | + assert [hit["_id"] for hit in hits] == ["fusion-better", "fine-raw-better"] | |
| 240 | + assert hits[0]["_fine_fused_score"] > hits[1]["_fine_fused_score"] | |
| 241 | + assert debug_rows[0]["fusion_summary"] | |
| 242 | + assert "fine_score=" in debug_rows[0]["fusion_summary"] | |
| 243 | + assert "text_score=" in debug_rows[0]["fusion_summary"] | |
| 244 | + | |
| 245 | + | |
| 246 | +def test_fuse_scores_and_resort_uses_hit_level_fine_score_when_not_passed_separately(): | |
| 247 | + hits = [ | |
| 248 | + { | |
| 249 | + "_id": "with-fine", | |
| 250 | + "_score": 1.0, | |
| 251 | + "_fine_score": 0.7, | |
| 252 | + "matched_queries": {"base_query": 2.0, "knn_query": 0.5}, | |
| 253 | + } | |
| 254 | + ] | |
| 255 | + | |
| 256 | + debug = fuse_scores_and_resort(hits, [0.8], debug=True) | |
| 257 | + | |
| 258 | + assert isclose(debug[0]["fine_factor"], (0.7 + 0.00001), rel_tol=1e-9) | |
| 259 | + assert debug[0]["fusion_inputs"]["fine_score"] == 0.7 | |
| 260 | + assert "fine_score=" in debug[0]["fusion_summary"] | ... | ... |
tests/test_suggestions.py
| ... | ... | @@ -403,10 +403,13 @@ def test_build_full_candidates_tags_and_qanchor_phrases(monkeypatch): |
| 403 | 403 | "spu_id": "900", |
| 404 | 404 | "title": {"en": "Tee", "zh": "T恤"}, |
| 405 | 405 | "qanchors": { |
| 406 | - "en": "slim fit, sporty casual", | |
| 407 | - "zh": "修身, 显瘦", | |
| 406 | + "en": ["slim fit", "sporty casual"], | |
| 407 | + "zh": ["修身", "显瘦"], | |
| 408 | + }, | |
| 409 | + "tags": { | |
| 410 | + "en": ["Classic", "ribbed neckline"], | |
| 411 | + "zh": ["辣妹风"], | |
| 408 | 412 | }, |
| 409 | - "tags": ["Classic", "辣妹风", "ribbed neckline"], | |
| 410 | 413 | }, |
| 411 | 414 | } |
| 412 | 415 | ] | ... | ... |