From ccbdf87013773107a41f81410c45cbc9f52bfe77 Mon Sep 17 00:00:00 2001 From: tangwang Date: Fri, 3 Apr 2026 21:11:50 +0800 Subject: [PATCH] enriched_attributes.value字段参与搜索 --- config/config.yaml | 36 +++++++++++++++++++++--------------- docs/LTR-特征非线性映射.md | 181 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ docs/issues/issue-2026-04-01-评估框架-四级label-done-0402.md | 12 ------------ docs/常用查询 - ES.md | 183 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------ docs/相关性检索优化说明.md | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ scripts/evaluation/eval_framework/prompts.py | 12 ------------ 6 files changed, 422 insertions(+), 51 deletions(-) create mode 100644 docs/LTR-特征非线性映射.md diff --git a/config/config.yaml b/config/config.yaml index 7a38319..9170840 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -110,18 +110,19 @@ es_settings: # 若需要按某个语言单独调权,也可以加显式 key(例如 title.de: 3.2)。 field_boosts: title: 3.0 - qanchors: 2.3 - enriched_tags: 2.3 - keywords: 2.0 - tags: 2.0 + qanchors: 2.5 + enriched_tags: 2.5 + enriched_attributes.value: 2.3 category_name_text: 2.0 category_path: 2.0 - brief: 1.5 - description: 1.5 - vendor: 1.5 - option1_values: 1.5 - option2_values: 1.5 - option3_values: 1.5 + keywords: 2.0 + tags: 2.0 + option1_values: 1.7 + option2_values: 1.7 + option3_values: 1.7 + brief: 1.0 + description: 1.0 + vendor: 1.0 # Query Configuration(查询配置) query_config: @@ -188,17 +189,18 @@ query_config: search_fields: multilingual_fields: - title - - qanchors - keywords + - qanchors - enriched_tags + - enriched_attributes.value - option1_values - option2_values - option3_values - category_path - category_name_text - - brief - - description - - vendor + # - brief + # - description + # - vendor # 
shared_fields: 无语言后缀字段;示例: tags, option1_values, option2_values, option3_values shared_fields: null core_multilingual_fields: @@ -244,7 +246,11 @@ query_config: - category1_name - category2_name - category3_name - - tags + # - tags + # - keywords + # - qanchors + # - enriched_tags + # - enriched_attributes - min_price - compare_at_price - image_url diff --git a/docs/LTR-特征非线性映射.md b/docs/LTR-特征非线性映射.md new file mode 100644 index 0000000..1f76352 --- /dev/null +++ b/docs/LTR-特征非线性映射.md @@ -0,0 +1,181 @@ + +特征工程: +对因子做非线性处理,作为重排阶段的乘法融合 / 特征融合,和LR、FM、浅层 MLP的输入。 + +分成两张表: + +1. **常见非线性映射总表** +2. **按“你想达到什么效果”来选方法** + +--- + +# 一、常见非线性映射对比表 + +> 记号说明: +> +> * (x):原始因子,默认 (x\ge 0) +> * (y=f(x)):映射后特征 +> * “头部/高值/尾部”分别指:高质量区、较大数值区、低影响区 +> * “弹性”可理解为乘法融合里更接近“相对重要性”的量 + +| 方法 | 公式 | 单调性 | 斜率/敏感性特征 | 弹性/相对敏感性特征 | 主要效果 | 适合场景 | 风险/注意点 | +| ----------------------- | ---------------------------------- | ----: | -------------------- | ------------------------- | ---------------------- | ------------------ | ------------------ | +| 加偏置 | (y=x+b) | 单调增 | (\frac{dy}{dx}=1) 不变 | (\frac{x}{x+b}),随(x)增大而增大 | **主要削弱低值区相对重要性** | 防止小值/0值过度伤害乘法分数 | 对高值区抑制很弱 | +| 对数 | (y=\log(1+x)) | 单调增 | 高值区越来越平 | 高值相对敏感性下降明显 | **强压缩大值/长尾** | 计数、曝光、频次、热度 | 小值区区分度有时不够 | +| 幂次(0~1) | (y=x^\gamma,\ 0<\gamma<1) | 单调增 | 高值区变平 | 弹性恒为(\gamma<1) | **整体降权,压缩高值** | 重尾分布、避免大值碾压 | 需要处理0附近/尺度问题 | +| 幂次(>1) | (y=x^\gamma,\ \gamma>1) | 单调增 | 高值区更陡 | 弹性恒为(\gamma>1) | **整体放大,强调高值** | 想突出强信号头部 | 容易放大异常值、过拟合 | +| 负幂 / 倒数幂 | (y=(x+\epsilon)^\gamma,\ \gamma<0) | 单调减 | 小(x)处敏感 | 适合“坏度/距离/排名”类反向量 | **把大值变小、小值变大** | 距离、惩罚项、rank-like特征 | 必须防0,数值稳定要小心 | +| Michaelis-Menten / 饱和分式 | (y=\frac{x}{x+K}) | 单调增 | 前陡后平 | (\frac{K}{x+K}),随(x)增大而减小 | **高值区重要性显著衰减** | 相似度、质量分、置信度饱和 | 需选(K),解释阈值要清晰 | +| 指数饱和 | (y=1-e^{-\alpha x}) | 单调增 | 初期快,后期快饱和 | 高值区快速失敏 | **前期收益大,后期收益递减** | “够好即可”的因子 | (\alpha)过大易太早饱和 | +| 倒数/双曲线 | (y=\frac{1}{x+c}) | 单调减 | **头部陡、尾部平** | 对小(x)更敏感 | **强头部区分,尾部压平** | rank、距离、位置惩罚 | 语义是反向量时更自然 | +| RRF | (y=\frac{1}{k+r}) 
| 单调减 | **头部下降快,尾部下降慢** | 离散相邻rank差在头部更大 | **强调Top位置,压缩长尾rank差异** | 多路召回排序融合 | 更适合rank,不适合已校准连续分数 | +| Sigmoid | (y=\sigma(\alpha(x-c))) | 单调增 | **中间陡,两端平** | 阈值附近最敏感 | **中段阈值化,低高两端压缩** | 有明显阈值的质量因子 | 容易过压缩,参数敏感 | +| Tanh | (y=\tanh(\alpha(x-c))) | 单调增 | **中间陡,两端平** | 以中心点为对称中枢 | **适合有“好/坏偏离”语义** | 偏离均值、标准化后特征 | 要先中心化更合理 | +| Softplus | (y=\log(1+e^{\alpha(x-c)})) | 单调增 | 平滑版ReLU | 阈值后近似线性 | **软阈值激活** | 希望“超过阈值才开始起作用” | 不如硬阈值直观 | +| ReLU/铰链 | (y=\max(0,x-c)) | 单调增 | 阈值前0,后线性 | 明确阈值激活 | **只让超过阈值部分生效** | 明确业务门槛 | 不连续,不够平滑 | +| 截断/裁剪 | (y=\min(\max(x,L),U)) | 单调 | 两端直接压平 | 控制极值影响 | **抗异常值,防爆** | 样本少、分布脏 | 可能损失极值信息 | +| 分段线性 | 分段定义 | 单调可控 | 可手工指定各段斜率 | 可按业务调敏感区间 | **可解释、稳、好控** | 规则清晰的业务场景 | 需要人工定阈值 | +| Arctan | (y=\arctan(\alpha(x-c))) | 单调增 | 类S形但更柔和 | 中间敏感、两端平 | **温和版S型压缩** | 不想用太激进sigmoid时 | 解释性略弱 | +| 分位数/Percentile | (y=\text{percentile}(x)) | 单调增 | 基于排序,不看绝对差值 | 消除原始尺度影响 | **保序、抗异常、跨源统一尺度** | 多源分数难校准 | 丢失绝对量纲信息 | +| 分桶/Binning | 区间映射到桶 | 不一定连续 | 桶内不敏感,桶间跳变 | 强离散化 | **把非线性变成离散模式** | 样本少、LR很常见 | 桶边界敏感 | +| Box-Cox | (\frac{x^\lambda-1}{\lambda}) | 单调增 | 介于log与power之间 | 可调分布形态 | **系统化连续压缩族** | 想系统试幂/log家族 | 解释性不如手工映射 | +| Yeo-Johnson | 可处理负值 | 单调 | 类似Box-Cox | 可处理(\le 0) | **负值/零值也能做分布矫正** | 特征可能有负值 | 工程解释性一般 | + +--- + +# 二、按“想达到什么效果”选方法 + +这个表更适合你做特征工程时快速决策。 + +| 目标 | 推荐方法 | 核心机制 | 典型用途 | +| -------------- | ----------------------------------------------------------------------- | --------------- | ---------------- | +| 防止小值/0值把乘法分数打穿 | (x+b) | 给低值加保护垫 | 质量分、置信分、召回弱信号保护 | +| 压制大值主导、做收益递减 | (\log(1+x)), (x^\gamma(0<\gamma<1)), (\frac{x}{x+K}), (1-e^{-\alpha x}) | 高值区斜率变小 | 热度、频次、相似度、历史点击率 | +| 明显削弱高值区重要性 | (\frac{x}{x+K}) | 弹性随(x)增大而下降 | “高了以后别再太影响排序” | +| 整体降低某因子的乘法重要性 | (x^\gamma,\ 0<\gamma<1) | 弹性恒定缩小到(\gamma) | 统一降权某类因子 | +| 整体放大某因子的乘法重要性 | (x^\gamma,\ \gamma>1) | 弹性恒定放大到(\gamma) | 想强化强信号 | +| 强调Top,压平长尾 | (\frac{1}{x+c}), RRF | 头部陡、尾部平 | rank融合、位置因子、多路召回 | +| 只在阈值附近最敏感 | Sigmoid / Tanh / Arctan | 中间陡、两端平 | 质量过线、置信阈值、门控因子 | +| 超过阈值才起作用 | ReLU / Hinge / 
Softplus | 阈值激活 | “达到一定水平才算有效” | +| 抗异常值、防极值爆炸 | Clip / Winsorize / Log | 直接限幅或压缩长尾 | 脏数据、样本少、稳定性优先 | +| 分布跨源不一致、量纲不统一 | Quantile / Percentile | 保序统一尺度 | 多路打分融合、异构召回分 | +| 业务规则清晰、想强可解释 | 分段线性 / 分桶 | 手工指定各区间作用方式 | 规则强、可解释要求高 | +| 距离/惩罚/坏度越大越差 | 倒数、负幂、指数衰减 | 反向单调映射 | 距离、时延、惩罚项 | + +--- + +# 三、几个你当前最关心的方法,单独再压缩成小表 + +## 1)加偏置 vs Michaelis-Menten + +| 方法 | 公式 | 低值区 | 高值区 | 适合作用 | +| ---- | --------------- | -------- | -------- | -------- | +| 加偏置 | (x+b) | **削弱更多** | 基本保留 | 防止低值过分伤害 | +| MM饱和 | (\frac{x}{x+K}) | 保留较强敏感性 | **削弱更多** | 防止高值持续主导 | + +一句话区别: + +* **加偏置**:主要“救低值” +* **MM饱和**:主要“压高值” + +--- + +## 2)RRF 的特征 + +| 维度 | 结论 | +| -------- | --------------------- | +| 公式 | (\frac{1}{k+r}) | +| 单调性 | 随rank变差单调下降 | +| 头部变化 | **更陡** | +| 尾部变化 | **更平** | +| 相邻rank差异 | rank越靠前,相邻差越大 | +| 适合 | 多路召回融合、强调Top结果 | +| 本质 | 是一种“头部敏感、尾部压缩”的rank映射 | + +你的判断是对的:**RRF 确实是头部变化陡、尾部变化平。** + +--- + +## 3)Sigmoid / Tanh / ReLU 三者区别 + +| 方法 | 形状 | 最敏感区域 | 适用语义 | +| ---------- | -------------- | ------ | ------------ | +| Sigmoid | S型,输出(0\sim1) | 中心阈值附近 | 质量是否过线、概率型因子 | +| Tanh | S型,输出(-1\sim1) | 中心附近 | 正负偏离、相对均值偏差 | +| ReLU/Hinge | 折线 | 阈值以上 | 超过门槛才开始加分 | + +--- + +# 四、用于 LR / FM 输入时,最推荐的一组“低风险特征字典” + +如果你现在是要做**有限样本下的工程化输入**,建议不要一下上太多复杂函数,而是每个核心因子先派生一个**小型稳定字典**: + +| 变换类别 | 推荐形式 | 作用 | +| ----- | -------------------------------- | ------------ | +| 原始 | (x) | 保留原信息 | +| 稳定 | (x+b) | 防止低值过伤 | +| 压缩 | (\log(1+x)) | 压长尾 | +| 弱压缩 | (\sqrt{x}) | 温和收益递减 | +| 饱和 | (\frac{x}{x+K}) | 明确高值衰减 | +| 截断 | (\min(x,U)) | 防极值爆炸 | +| 阈值 | (\mathbf 1[x>t]) 或 (\max(0,x-t)) | 强化门槛效应 | +| rank型 | (\frac{1}{x+c}) / RRF | 处理位置、rank、距离 | + +--- + +# 五、如果你要在实际工程里优先试哪些 + +我建议优先级这样排: + +## 第一梯队:最稳 + +* (x) +* (x+b) +* (\log(1+x)) +* (\sqrt{x}) +* (\frac{x}{x+K}) +* clip +* rank倒数 / RRF + +## 第二梯队:有明确业务阈值时 + +* sigmoid +* tanh +* relu / hinge +* softplus +* 分桶 + +## 第三梯队:谨慎使用 + +* 指数放大 +* 高次幂 (x^3,x^4) +* 很激进的负幂 +* 过多参数化函数族 + +因为样本少时,最怕的是: + +* 变换过激 +* 参数太多 +* 极值被放大 +* 输入间强共线后模型不稳 + +--- + +# 六、最后给你一个超短总结版 + +| 
方法族 | 代表公式 | 本质效果 | +| --- | ---------------------------------- | ---------- | +| 偏置族 | (x+b) | 救低值 | +| 压缩族 | (\log(1+x), \sqrt{x}) | 压大值、减长尾 | +| 饱和族 | (\frac{x}{x+K}, 1-e^{-\alpha x}) | 高值收益递减 | +| 放大族 | (x^\gamma,\gamma>1) | 强化高值 | +| 反向族 | (\frac{1}{x+c}, x^\gamma,\gamma<0) | 距离/排名/惩罚映射 | +| S型族 | sigmoid, tanh, arctan | 中间敏感、两端变平 | +| 阈值族 | ReLU, hinge, softplus | 超过门槛才生效 | +| 稳定族 | clip, winsorize, quantile | 抗异常、保稳 | +| 离散族 | binning, percentile | 保序/分段表达 | + +--- + +如果你愿意,我下一步可以继续帮你整理成一版**“适合直接放进方案文档/评审PPT”的表格版**,再补一列: +**“推荐用于哪些 rerank 因子(相关性、质量、热度、位置、先验)”**。 diff --git a/docs/issues/issue-2026-04-01-评估框架-四级label-done-0402.md b/docs/issues/issue-2026-04-01-评估框架-四级label-done-0402.md index 2b84494..16dc257 100644 --- a/docs/issues/issue-2026-04-01-评估框架-四级label-done-0402.md +++ b/docs/issues/issue-2026-04-01-评估框架-四级label-done-0402.md @@ -168,12 +168,6 @@ Typical examples: - Barely acceptable substitute → **Low Relevant** - Hardly substitutable at all → **Irrelevant** -6. **When product information is insufficient, do not treat “cannot confirm” as “conflict”.** - If a product does not mention an attribute, that does not mean the attribute is definitely violated. - Therefore: - - If the attribute is not mentioned or cannot be confirmed, prefer **High Relevant**; - - Only treat it as a conflict when the product information clearly shows the opposite of the query requirement. - Query: {query} {intent_suffix} @@ -312,12 +306,6 @@ _CLASSIFY_TEMPLATE_ZH = """你是一个服饰电商搜索系统中的相关性 - 勉强替代品 → **弱相关** - 几乎不可替代 → **不相关** -6. 
**若商品信息不足,不要把“无法确认”误判为“冲突”。** - 商品未写明某属性,不等于该属性一定不符合。 - 因此: - - 未提及 / 无法确认,优先按“基本相关”处理; - - 只有当商品信息明确显示与查询要求相反时,才视为属性冲突。 - 查询:{query} {intent_suffix} diff --git a/docs/常用查询 - ES.md b/docs/常用查询 - ES.md index a582a8c..ea1015b 100644 --- a/docs/常用查询 - ES.md +++ b/docs/常用查询 - ES.md @@ -348,7 +348,7 @@ curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te #### 1.1 查询特定租户的商品,显示分面相关字段 ```bash -curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_163/_search?pretty' -H 'Content-Type: application/json' -d '{ "query": { "term": { "tenant_id": "162" } }, @@ -363,7 +363,7 @@ curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te #### 1.2 验证 category1_name 字段是否有数据 ```bash -curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_163/_search?pretty' -H 'Content-Type: application/json' -d '{ "query": { "bool": { "filter": [ @@ -378,7 +378,7 @@ curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te #### 1.3 验证 specifications 字段是否有数据 ```bash -curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_163/_search?pretty' -H 'Content-Type: application/json' -d '{ "query": { "bool": { "filter": [ @@ -397,7 +397,7 @@ curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te #### 2.1 category1_name 分面聚合 ```bash -curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ +curl -u 
'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_163/_search?pretty' -H 'Content-Type: application/json' -d '{ "query": { "match_all": {} }, "size": 0, "aggs": { @@ -410,7 +410,7 @@ curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te #### 2.2 specifications.color 分面聚合 ```bash -curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_163/_search?pretty' -H 'Content-Type: application/json' -d '{ "query": { "match_all": {} }, "size": 0, "aggs": { @@ -431,7 +431,7 @@ curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te #### 2.3 specifications.size 分面聚合 ```bash -curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_163/_search?pretty' -H 'Content-Type: application/json' -d '{ "query": { "match_all": {} }, "size": 0, "aggs": { @@ -452,7 +452,7 @@ curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te #### 2.4 specifications.material 分面聚合 ```bash -curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_163/_search?pretty' -H 'Content-Type: application/json' -d '{ "query": { "match_all": {} }, "size": 0, "aggs": { @@ -473,7 +473,7 @@ curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te #### 2.5 综合分面聚合(category + color + size + material) ```bash -curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ +curl -u 
'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_163/_search?pretty' -H 'Content-Type: application/json' -d '{ "query": { "match_all": {} }, "size": 0, "aggs": { @@ -545,13 +545,172 @@ curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_s }' ``` +#### 3.3 `enriched_attributes`:`.value.zh` / `.value.en` 的 keyword 精确匹配与 text 全文匹配 + +> `enriched_attributes` 为 **nested**,检索需包在 `nested` 里。`.keyword` 子字段带 `lowercase` normalizer,英文词建议用小写做 `term`。注意:下面示例中的 `"minimum_should_match": 2` 要求**同一个 nested 对象**同时命中中文、英文两个条件;若只需任一语言命中,请改为 `1`。 + +**keyword 精确匹配**(示例词:中文 `法式风格`,英文 `long skirt`) + +```bash +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_163/_search?pretty' -H 'Content-Type: application/json' -d '{ + "size": 1, + "_source": ["spu_id", "title", "enriched_attributes"], + "query": { + "nested": { + "path": "enriched_attributes", + "query": { + "bool": { + "should": [ + { "term": { "enriched_attributes.value.zh.keyword": "法式风格" } }, + { "term": { "enriched_attributes.value.en.keyword": "long skirt" } } + ], + "minimum_should_match": 2 + } + } + } + } +}' +``` + +**text 全文匹配**(经 `index_ik` / `english` 分词;可与上式对照) + +```bash +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_163/_search?pretty' -H 'Content-Type: application/json' -d '{ + "size": 1, + "_source": ["spu_id", "title", "enriched_attributes"], + "query": { + "nested": { + "path": "enriched_attributes", + "query": { + "bool": { + "should": [ + { "match": { "enriched_attributes.value.zh": "法式风格" } }, + { "match": { "enriched_attributes.value.en": "long skirt" } } + ], + "minimum_should_match": 2 + } + } + } + } +}' +``` + +若需要 **拼写容错**,可在 `match` 上增加 `"fuzziness": "AUTO"`(对英文更常见)。 + +#### 3.4 `option1_values`:keyword 与 text 分别查 `蓝色` / `blue` + +**keyword 精确匹配** + +```bash +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_163/_search?pretty' -H 'Content-Type: application/json' -d '{ + "size": 1, + "_source": ["spu_id", "title", 
"option1_values"], + "query": { + "bool": { + "should": [ + { "term": { "option1_values.zh.keyword": "蓝色" } }, + { "term": { "option1_values.en.keyword": "blue" } } + ], + "minimum_should_match": 2 + } + } +}' +``` + +**text 全文匹配** + +```bash +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_163/_search?pretty' -H 'Content-Type: application/json' -d '{ + "size": 1, + "_source": ["spu_id", "title", "option1_values"], + "query": { + "bool": { + "should": [ + { "match": { "option1_values.zh": "蓝色" } }, + { "match": { "option1_values.en": "blue" } } + ], + "minimum_should_match": 2 + } + } +}' +``` + +#### 3.5 `enriched_tags.zh` / `enriched_tags.en`:keyword 与 text(`高腰` / `high waist`) + +**keyword 精确匹配** + +```bash +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_163/_search?pretty' -H 'Content-Type: application/json' -d '{ + "size": 1, + "_source": ["spu_id", "title", "enriched_tags"], + "query": { + "bool": { + "should": [ + { "term": { "enriched_tags.zh.keyword": "高腰" } }, + { "term": { "enriched_tags.en.keyword": "high waist" } } + ], + "minimum_should_match": 2 + } + } +}' +``` + +**text 全文匹配** + +```bash +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_163/_search?pretty' -H 'Content-Type: application/json' -d '{ + "size": 1, + "_source": ["spu_id", "title", "enriched_tags"], + "query": { + "bool": { + "should": [ + { "match": { "enriched_tags.zh": "高腰" } }, + { "match": { "enriched_tags.en": "high waist" } } + ], + "minimum_should_match": 2 + } + } +}' +``` + +#### 3.6 `specifications`:`value_keyword` 与 `value_text.zh` / `value_text.en`(`蓝色` / `blue`) + +> `specifications` 为 **nested**,`value_keyword` 为整词匹配;`value_text.*` 可同时 `term` 子字段或 `match` 主 text。 + +```bash +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_163/_search?pretty' -H 'Content-Type: application/json' -d '{ + "size": 1, + "_source": ["spu_id", "title", 
"specifications"], + "query": { + "nested": { + "path": "specifications", + "query": { + "bool": { + "should": [ + { "term": { "specifications.value_keyword": "蓝色" } }, + { "term": { "specifications.value_keyword": "blue" } }, + { "term": { "specifications.value_text.zh.keyword": "蓝色" } }, + { "term": { "specifications.value_text.en.keyword": "blue" } }, + { "match": { "specifications.value_text.zh": "蓝色" } }, + { "match": { "specifications.value_text.en": "blue" } } + ], + "minimum_should_match": 5 + } + } + } + } +}' +``` + +仅查 **keyword 类**(`value_keyword` + `value_text.*.keyword`)时可从上面 `should` 里删掉两条 `match`;仅 **全文** 时可只保留两条 `match`。删减 `should` 子句后需同步调低 `minimum_should_match`(不应超过剩余 `should` 子句数,例如只保留两条 `match` 时最多为 `2`),否则可能查不到结果。 + --- ### 4. 统计查询 #### 4.1 统计有 category1_name 的文档数量 ```bash -curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_count?pretty' -H 'Content-Type: application/json' -d '{ +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_163/_count?pretty' -H 'Content-Type: application/json' -d '{ "query": { "bool": { "filter": [ @@ -564,7 +723,7 @@ curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te #### 4.2 统计有 specifications 的文档数量 ```bash -curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_count?pretty' -H 'Content-Type: application/json' -d '{ +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_163/_count?pretty' -H 'Content-Type: application/json' -d '{ "query": { "bool": { "filter": [ @@ -581,7 +740,7 @@ curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te #### 5.1 查找没有 category1_name 但有 category 的文档(MySQL 有数据但 ES 没有) ```bash -curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_163/_search?pretty' -H 'Content-Type: application/json' -d '{ "query": { "bool": { 
"filter": [ @@ -599,7 +758,7 @@ curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te #### 5.2 查找有 option 但没有 specifications 的文档(数据转换问题) ```bash -curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_163/_search?pretty' -H 'Content-Type: application/json' -d '{ "query": { "bool": { "filter": [ diff --git a/docs/相关性检索优化说明.md b/docs/相关性检索优化说明.md index f7843a6..a20fddb 100644 --- a/docs/相关性检索优化说明.md +++ b/docs/相关性检索优化说明.md @@ -850,3 +850,52 @@ title.zh: Dockers 男士经典版型工作日卡其色智能360度弹力裤( Rerank score: 0.0981 title.en: Lazy One Pajama Shorts for Men, Men's Pajama Bottoms, Sleepwear title.zh: 懒人男士睡裤,男式家居裤,睡眠服饰 + + + + +q=修身牛仔裤 + +这些好结果得分很低: + +rerank_score:0.0564 + "en": "Judy Blue Women's High Waist Button Fly Skinny Jeans 82319", + "zh": "Judy Blue 女士高腰纽扣开叉修身牛仔裤 82319" + + +rerank_score:0.0790 + "en": "2025 New Fashion European and American Women's Jeans High-Waisted Slim Straight Denim Pants Popular Floor-Length Pants", + "zh": "2025新款欧美风女式高腰显瘦直筒牛仔裤 时尚及地长裤" + + +rerank_score:0.0822 + "en": "roswear Women's Trendy Stretchy Flare Jeans Mid Rise Bootcut Curvy Denim Pants", + "zh": "Roswear 女士时尚弹力喇叭牛仔裤 中腰高腰修身直筒牛仔裤" + + +rerank_score:0.0956 + "en": "POSHGLAM Women's Maternity Jeans Over Belly 29'' Skinny Denim Jeggings Comfy Stretch Clearance Pregnancy Pants", + "zh": "POSHGLAM 女士孕产期高腰显瘦牛仔紧身裤 29英寸 紧身弹力孕妇裤 休闲舒适 清仓特价" + +(带有 Slim Stretch Jeans,但是打分只有0.0135,极低) +rerank_score:0.0135 + "en": "European and American Export Temu American Retro Sexy Bell-Bottomed Pants Slim Slim Stretch Jeans Women's Pants", + "zh": "欧美出口 蒂姆美国复古性感喇叭裤 修身弹力女裤" + + +这几个结果比较差,但是得分很高: + +rerank_score:0.4692 + "en": "American Vintage Low Waist Non-Elastic Washed Straight-Leg Jeans Women's Autumn New Street Wide Leg Denim Women's Pants", + "zh": "美式复古低腰无弹洗水直筒阔腿牛仔裤 女士秋季新款阔腿牛仔裤" + + +rerank_score:0.4784 + 
"en": "Europe and the United States cross-border foreign trade 2025 spring and summer new Amazon independent station washed waist adjustable Denim pants", + "zh": "欧美跨境外贸2025春夏新款亚马逊独立站洗水腰 adjustable 牛仔裤" + + +rerank_score:0.5849 + "zh": "新款女士修身仿旧牛仔短裤 – 休闲性感磨边水洗牛仔短裤,时尚舒", + "en": "New Women's Slim-fit Vintage Washed Denim Shorts – Casual Sexy Frayed Hem, Fashionable & Comfortable" + diff --git a/scripts/evaluation/eval_framework/prompts.py b/scripts/evaluation/eval_framework/prompts.py index f7dea90..a0ca126 100644 --- a/scripts/evaluation/eval_framework/prompts.py +++ b/scripts/evaluation/eval_framework/prompts.py @@ -175,12 +175,6 @@ Typical examples: - Barely acceptable substitute → **Low Relevant** - Hardly substitutable at all → **Irrelevant** -6. **When product information is insufficient, do not treat “cannot confirm” as “conflict”.** - If a product does not mention an attribute, that does not mean the attribute is definitely violated. - Therefore: - - If the attribute is not mentioned or cannot be confirmed, prefer **High Relevant**; - - Only treat it as a conflict when the product information clearly shows the opposite of the query requirement. - Query: {query} {intent_suffix} @@ -319,12 +313,6 @@ _CLASSIFY_TEMPLATE_ZH = """你是一个服饰电商搜索系统中的相关性 - 勉强替代品 → **弱相关** - 几乎不可替代 → **不相关** -6. **若商品信息不足,不要把“无法确认”误判为“冲突”。** - 商品未写明某属性,不等于该属性一定不符合。 - 因此: - - 未提及 / 无法确认,优先按“基本相关”处理; - - 只有当商品信息明确显示与查询要求相反时,才视为属性冲突。 - 查询:{query} {intent_suffix} -- libgit2 0.21.2