From 3984ec640ae8b23d680e0d8d699b7e8c68cab0d3 Mon Sep 17 00:00:00 2001 From: tangwang Date: Tue, 31 Mar 2026 20:33:35 +0800 Subject: [PATCH] evaluation 标注标准优化 --- .gitignore | 5 +++++ scripts/evaluation/eval_framework/prompts.py | 305 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------------------------------------------- 2 files changed, 246 insertions(+), 64 deletions(-) diff --git a/.gitignore b/.gitignore index 656444b..36f6814 100644 --- a/.gitignore +++ b/.gitignore @@ -77,3 +77,8 @@ logs_*/ models/ model_cache/ + +# Search evaluation: local SQLite DB and generated outputs (large / regenerable) +artifacts/search_evaluation/*.sqlite3 +artifacts/search_evaluation/batch_reports/ +artifacts/search_evaluation/tuning_runs/ diff --git a/scripts/evaluation/eval_framework/prompts.py b/scripts/evaluation/eval_framework/prompts.py index 8b6731b..77445a2 100644 --- a/scripts/evaluation/eval_framework/prompts.py +++ b/scripts/evaluation/eval_framework/prompts.py @@ -5,62 +5,251 @@ from __future__ import annotations import json from typing import Any, Dict, Sequence +_CLASSIFY_BATCH_SIMPLE_TEMPLATE = """You are a relevance evaluation assistant for an apparel e-commerce search system. +Given the user query and each product's information, assign one relevance label to each product. + +## Relevance Labels + +### Exact +The product fully satisfies the user's search intent. + +Use Exact when: +- The product matches the core product type named in the query. +- The key requirements explicitly stated in the query are satisfied. +- There is no clear conflict with any explicit user requirement. + +Typical cases: +- The query is only a product type, and the product is exactly that product type. 
+- The query includes product type + attributes, and the product matches the type and those attributes. + +### Partial +The product satisfies the user's primary intent, but does not fully satisfy all specified details. + +Use Partial when: +- The core product type matches, but some requested attributes cannot be confirmed. +- The core product type matches, but only some secondary attributes are satisfied. +- The core product type matches, and there are minor or non-critical deviations from the query. +- The product does not clearly contradict the user's explicit requirements, but it also cannot be considered a full match. + +Typical cases: +- Query: "red fitted t-shirt", product: "Women's T-Shirt" → color/fit cannot be confirmed. +- Query: "red fitted t-shirt", product: "Blue Fitted T-Shirt" → product type and fit match, but color differs. +- Query: "cotton long sleeve blouse", product: "Long Sleeve Blouse" → material not confirmed. + +Important: +Partial should mainly be used when the core product type is correct, but the detailed requirements are incomplete, uncertain, or only partially matched. + +### Irrelevant +The product does not satisfy the user's main shopping intent. + +Use Irrelevant when: +- The core product type does not match the query. +- The product matches the general category but is a different product type that shoppers would not consider interchangeable. +- The core product type matches, but the product clearly contradicts an explicit and important requirement in the query. + +Typical cases: +- Query: "pants", product: "shoes" → wrong product type. +- Query: "dress", product: "skirt" → different product type. +- Query: "fitted pants", product: "loose wide-leg pants" → explicit contradiction on fit. +- Query: "sleeveless dress", product: "long sleeve dress" → explicit contradiction on sleeve style. + +## Decision Principles + +1. Product type is the highest-priority factor. 
+ If the query clearly specifies a concrete product type, the result must match that product type to be Exact or Partial. + A different product type is usually Irrelevant, not Partial. + +2. Similar or related product types are not interchangeable when the query is specific. + For example: + - dress vs skirt vs jumpsuit + - jeans vs pants + - t-shirt vs blouse + - cardigan vs sweater + - boots vs shoes + - bra vs top + - backpack vs bag + If the user explicitly searched for one of these, the others should usually be judged Irrelevant. + +3. If the core product type matches, then evaluate attributes. + - If attributes fully match → Exact + - If attributes are missing, uncertain, or only partially matched → Partial + - If attributes clearly contradict an explicit important requirement → Irrelevant + +4. Distinguish carefully between "not mentioned" and "contradicted". + - If an attribute is not mentioned or cannot be verified, prefer Partial. + - If an attribute is explicitly opposite to the query, use Irrelevant. + +5. Do not overuse Exact. + Exact requires strong evidence that the product satisfies the user's stated intent, not just the general category. + +Query: {query} + +Products: +{lines} + +## Output Format +Strictly output {n} lines, each line containing exactly one of: +Exact +Partial +Irrelevant + +The lines must correspond sequentially to the products above. +Do not output any other information. 
+""" + +_CLASSIFY_BATCH_SIMPLE_TEMPLATE__zh = """你是一个服装电商搜索系统的相关性评估助手。 +给定用户查询和每个产品的信息,为每个产品分配一个相关性标签。 + +## 相关性标签 + +### 完全相关 +该产品完全满足用户的搜索意图。 + +在以下情况使用完全相关: +- 产品与查询中指定的核心产品类型相匹配。 +- 满足了查询中明确说明的关键要求。 +- 与用户明确的任何要求没有明显冲突。 + +典型情况: +- 查询仅包含产品类型,而产品恰好是该产品类型。 +- 查询包含产品类型 + 属性,而产品与该类型及这些属性相匹配。 + +### 部分相关 +该产品满足了用户的主要意图,但并未完全满足所有指定的细节。 + +在以下情况使用部分相关: +- 核心产品类型匹配,但部分请求的属性无法确认。 +- 核心产品类型匹配,但仅满足了部分次要属性。 +- 核心产品类型匹配,但与查询存在微小或非关键的偏差。 +- 产品未明显违背用户的明确要求,但也不能视为完全匹配。 + +典型情况: +- 查询:"红色修身T恤",产品:"女士T恤" → 颜色/版型无法确认。 +- 查询:"红色修身T恤",产品:"蓝色修身T恤" → 产品类型和版型匹配,但颜色不同。 +- 查询:"棉质长袖衬衫",产品:"长袖衬衫" → 材质未确认。 + +重要提示: +部分相关主要应在核心产品类型正确,但详细要求不完整、不确定或仅部分匹配时使用。 + +### 不相关 +该产品不满足用户的主要购物意图。 + +在以下情况使用不相关: +- 核心产品类型与查询不匹配。 +- 产品匹配了大致类别,但属于购物者不会认为可互换的不同产品类型。 +- 核心产品类型匹配,但产品明显违背了查询中一个明确且重要的要求。 + +典型情况: +- 查询:"裤子",产品:"鞋子" → 错误的产品类型。 +- 查询:"连衣裙",产品:"半身裙" → 不同的产品类型。 +- 查询:"修身裤",产品:"宽松阔腿裤" → 版型上明显矛盾。 +- 查询:"无袖连衣裙",产品:"长袖连衣裙" → 袖型上明显矛盾。 + +## 决策原则 + +1. 产品类型是最高优先级的因素。 + 如果查询明确指定了具体产品类型,结果必须匹配该产品类型才能被评为完全相关或部分相关。 + 不同的产品类型通常是不相关,而非部分相关。 + +2. 当查询明确时,相似或相关的产品类型不可互换。 + 例如: + - 连衣裙 vs 半身裙 vs 连体裤 + - 牛仔裤 vs 裤子 + - T恤 vs 衬衫 + - 开衫 vs 毛衣 + - 靴子 vs 鞋子 + - 文胸 vs 上衣 + - 双肩包 vs 包 + 如果用户明确搜索了其中一种,其他的通常应判断为不相关。 + +3. 如果核心产品类型匹配,则评估属性。 + - 如果属性完全匹配 → 完全相关 + - 如果属性缺失、不确定或仅部分匹配 → 部分相关 + - 如果属性明显违背明确的重点要求 → 不相关 + +4. 仔细区分“未提及”和“矛盾”。 + - 如果属性未提及或无法验证,倾向于部分相关。 + - 如果属性与查询明确相反,使用不相关。 + +5. 不要过度使用完全相关。 + 完全相关需要强有力的证据表明产品满足了用户声明的意图,而不仅仅是通用类别。 + +查询: {query} + +产品: +{lines} + +## 输出格式 +严格输出 {n} 行,每行包含以下之一: +Exact +Partial +Irrelevant + +这些行必须按顺序对应上面的产品。 +不要输出任何其他信息。 +""" + + def classify_batch_simple_prompt(query: str, numbered_doc_lines: Sequence[str]) -> str: lines = "\n".join(numbered_doc_lines) n = len(numbered_doc_lines) - return ( - "You are an e-commerce search result relevance evaluation assistant. 
" - "Based on the user query and each product's information, output the relevance level for each product.\n\n" - "## Relevance Level Criteria\n" - "Exact — Fully matches the user's search intent.\n" - "Partial — Primary intent satisfied (same category or similar use, basically aligns with search intent), " - "but secondary attributes such as color, style, size, fit, length, or material deviate from or cannot be confirmed.\n" - "Irrelevant — Category or use case mismatched, primary intent not satisfied.\n\n" - "Additional judging guidance:\n" - "- If the query clearly names a product type, product type matching has the highest priority. " - "Dress vs skirt vs jumpsuit, jeans vs pants, T-shirt vs blouse, cardigan vs sweater, boots vs shoes, " - "bra vs top, backpack vs bag are not interchangeable.\n" - "- When the query clearly specifies a concrete product type, a different product type should usually be Irrelevant, not Partial.\n" - "- If an attribute looks missing or uncertain, prefer Partial instead of Exact.\n" - "- Do not guess missing attributes.\n" - "- Graphic, slogan, holiday, memorial, or message tees are not Exact for a plain tee query unless that graphic/theme is requested.\n" - "- Be conservative with Exact.\n\n" - f"Query: {query}\n\n" - "Products:\n" - f"{lines}\n\n" - "## Output Format\n" - f"Strictly output {n} lines, each line containing exactly one of Exact / Partial / Irrelevant. " - "They must correspond sequentially to the products above. Do not output any other information.\n" - ) + return _CLASSIFY_BATCH_SIMPLE_TEMPLATE.format(query=query, lines=lines, n=n) + + +_EXTRACT_QUERY_PROFILE_TEMPLATE = """You are building a structured intent profile for e-commerce relevance judging. +Use the original user query as the source of truth. Parser hints may help, but if a hint conflicts with the original query, trust the original query. +Be conservative: only mark an attribute as required if the user explicitly asked for it. 
+ +Return JSON with this schema: +{{ + "normalized_query_en": string, + "primary_category": string, + "allowed_categories": [string], + "required_attributes": [ + {{"name": string, "required_terms": [string], "conflicting_terms": [string], "match_mode": "explicit"}} + ], + "notes": [string] +}} + +Guidelines: +- Exact later will require explicit evidence for all required attributes. +- allowed_categories should contain only near-synonyms of the same product type, not substitutes. For example dress can allow midi dress/cocktail dress, but not skirt, top, jumpsuit, or outfit unless the query explicitly asks for them. +- If the query asks for dress/skirt/jeans/t-shirt, near but different product types are not Exact. +- If the query includes color, fit, silhouette, or length, include them as required_attributes. +- For fit words, include conflicting terms when obvious, e.g. fitted conflicts with oversized/loose; oversized conflicts with fitted/tight. +- For color, include conflicting colors only when clear from the query. + +Original query: {query} +Parser hints JSON: {hints_json} +""" def extract_query_profile_prompt(query: str, parser_hints: Dict[str, Any]) -> str: hints_json = json.dumps(parser_hints, ensure_ascii=False) - return ( - "You are building a structured intent profile for e-commerce relevance judging.\n" - "Use the original user query as the source of truth. 
Parser hints may help, but if a hint conflicts with the original query, trust the original query.\n" - "Be conservative: only mark an attribute as required if the user explicitly asked for it.\n\n" - "Return JSON with this schema:\n" - "{\n" - ' "normalized_query_en": string,\n' - ' "primary_category": string,\n' - ' "allowed_categories": [string],\n' - ' "required_attributes": [\n' - ' {"name": string, "required_terms": [string], "conflicting_terms": [string], "match_mode": "explicit"}\n' - " ],\n" - ' "notes": [string]\n' - "}\n\n" - "Guidelines:\n" - "- Exact later will require explicit evidence for all required attributes.\n" - "- allowed_categories should contain only near-synonyms of the same product type, not substitutes. For example dress can allow midi dress/cocktail dress, but not skirt, top, jumpsuit, or outfit unless the query explicitly asks for them.\n" - "- If the query asks for dress/skirt/jeans/t-shirt, near but different product types are not Exact.\n" - "- If the query includes color, fit, silhouette, or length, include them as required_attributes.\n" - "- For fit words, include conflicting terms when obvious, e.g. fitted conflicts with oversized/loose; oversized conflicts with fitted/tight.\n" - "- For color, include conflicting colors only when clear from the query.\n\n" - f"Original query: {query}\n" - f"Parser hints JSON: {hints_json}\n" - ) + return _EXTRACT_QUERY_PROFILE_TEMPLATE.format(query=query, hints_json=hints_json) + + +_CLASSIFY_BATCH_COMPLEX_TEMPLATE = """You are an e-commerce search relevance judge. +Judge each product against the structured query profile below. + +Relevance rules: +- Exact: product type matches the target intent, and every explicit required attribute is positively supported by the title/options/tags/category. If an attribute is missing or only guessed, it is NOT Exact. +- Partial: main product type/use case matches, but some required attribute is missing, weaker, uncertain, or only approximately matched. 
+- Irrelevant: product type/use case mismatched, or an explicit required attribute clearly conflicts. +- Be conservative with Exact. +- Graphic/holiday/message tees are not Exact for a plain color/style tee query unless that graphic/theme was requested. +- Jumpsuit/romper/set is not Exact for dress/skirt/jeans queries. + +Original query: {query} +Structured query profile JSON: {profile_json} + +Products: +{lines} + +Return JSON only, with schema: +{{"labels":[{{"index":1,"label":"Exact","reason":"short phrase"}}]}} +""" def classify_batch_complex_prompt( @@ -70,20 +259,8 @@ def classify_batch_complex_prompt( ) -> str: lines = "\n".join(numbered_doc_lines) profile_json = json.dumps(query_profile, ensure_ascii=False) - return ( - "You are an e-commerce search relevance judge.\n" - "Judge each product against the structured query profile below.\n\n" - "Relevance rules:\n" - "- Exact: product type matches the target intent, and every explicit required attribute is positively supported by the title/options/tags/category. If an attribute is missing or only guessed, it is NOT Exact.\n" - "- Partial: main product type/use case matches, but some required attribute is missing, weaker, uncertain, or only approximately matched.\n" - "- Irrelevant: product type/use case mismatched, or an explicit required attribute clearly conflicts.\n" - "- Be conservative with Exact.\n" - "- Graphic/holiday/message tees are not Exact for a plain color/style tee query unless that graphic/theme was requested.\n" - "- Jumpsuit/romper/set is not Exact for dress/skirt/jeans queries.\n\n" - f"Original query: {query}\n" - f"Structured query profile JSON: {profile_json}\n\n" - "Products:\n" - f"{lines}\n\n" - "Return JSON only, with schema:\n" - '{"labels":[{"index":1,"label":"Exact","reason":"short phrase"}]}\n' + return _CLASSIFY_BATCH_COMPLEX_TEMPLATE.format( + query=query, + profile_json=profile_json, + lines=lines, ) -- libgit2 0.21.2