Commit 3984ec640ae8b23d680e0d8d699b7e8c68cab0d3
1 parent
c81b0fc1
evaluation 标注标准优化
Made-with: Cursor
Showing
2 changed files
with
246 additions
and
64 deletions
Show diff stats
.gitignore
| ... | ... | @@ -77,3 +77,8 @@ logs_*/ |
| 77 | 77 | |
| 78 | 78 | models/ |
| 79 | 79 | model_cache/ |
| 80 | + | |
| 81 | +# Search evaluation: local SQLite DB and generated outputs (large / regenerable) | |
| 82 | +artifacts/search_evaluation/*.sqlite3 | |
| 83 | +artifacts/search_evaluation/batch_reports/ | |
| 84 | +artifacts/search_evaluation/tuning_runs/ | ... | ... |
scripts/evaluation/eval_framework/prompts.py
| ... | ... | @@ -5,62 +5,251 @@ from __future__ import annotations |
| 5 | 5 | import json |
| 6 | 6 | from typing import Any, Dict, Sequence |
| 7 | 7 | |
| 8 | +_CLASSIFY_BATCH_SIMPLE_TEMPLATE = """You are a relevance evaluation assistant for an apparel e-commerce search system. | |
| 9 | +Given the user query and each product's information, assign one relevance label to each product. | |
| 10 | + | |
| 11 | +## Relevance Labels | |
| 12 | + | |
| 13 | +### Exact | |
| 14 | +The product fully satisfies the user's search intent. | |
| 15 | + | |
| 16 | +Use Exact when: | |
| 17 | +- The product matches the core product type named in the query. | |
| 18 | +- The key requirements explicitly stated in the query are satisfied. | |
| 19 | +- There is no clear conflict with any explicit user requirement. | |
| 20 | + | |
| 21 | +Typical cases: | |
| 22 | +- The query is only a product type, and the product is exactly that product type. | |
| 23 | +- The query includes product type + attributes, and the product matches the type and those attributes. | |
| 24 | + | |
| 25 | +### Partial | |
| 26 | +The product satisfies the user's primary intent, but does not fully satisfy all specified details. | |
| 27 | + | |
| 28 | +Use Partial when: | |
| 29 | +- The core product type matches, but some requested attributes cannot be confirmed. | |
| 30 | +- The core product type matches, but only some secondary attributes are satisfied. | |
| 31 | +- The core product type matches, and there are minor or non-critical deviations from the query. | |
| 32 | +- The product does not clearly contradict the user's explicit requirements, but it also cannot be considered a full match. | |
| 33 | + | |
| 34 | +Typical cases: | |
| 35 | +- Query: "red fitted t-shirt", product: "Women's T-Shirt" → color/fit cannot be confirmed. | |
| 36 | +- Query: "red fitted t-shirt", product: "Blue Fitted T-Shirt" → product type and fit match, but color differs. | |
| 37 | +- Query: "cotton long sleeve blouse", product: "Long Sleeve Blouse" → material not confirmed. | |
| 38 | + | |
| 39 | +Important: | |
| 40 | +Partial should mainly be used when the core product type is correct, but the detailed requirements are incomplete, uncertain, or only partially matched. | |
| 41 | + | |
| 42 | +### Irrelevant | |
| 43 | +The product does not satisfy the user's main shopping intent. | |
| 44 | + | |
| 45 | +Use Irrelevant when: | |
| 46 | +- The core product type does not match the query. | |
| 47 | +- The product matches the general category but is a different product type that shoppers would not consider interchangeable. | |
| 48 | +- The core product type matches, but the product clearly contradicts an explicit and important requirement in the query. | |
| 49 | + | |
| 50 | +Typical cases: | |
| 51 | +- Query: "pants", product: "shoes" → wrong product type. | |
| 52 | +- Query: "dress", product: "skirt" → different product type. | |
| 53 | +- Query: "fitted pants", product: "loose wide-leg pants" → explicit contradiction on fit. | |
| 54 | +- Query: "sleeveless dress", product: "long sleeve dress" → explicit contradiction on sleeve style. | |
| 55 | + | |
| 56 | +## Decision Principles | |
| 57 | + | |
| 58 | +1. Product type is the highest-priority factor. | |
| 59 | + If the query clearly specifies a concrete product type, the result must match that product type to be Exact or Partial. | |
| 60 | + A different product type is usually Irrelevant, not Partial. | |
| 61 | + | |
| 62 | +2. Similar or related product types are not interchangeable when the query is specific. | |
| 63 | + For example: | |
| 64 | + - dress vs skirt vs jumpsuit | |
| 65 | + - jeans vs pants | |
| 66 | + - t-shirt vs blouse | |
| 67 | + - cardigan vs sweater | |
| 68 | + - boots vs shoes | |
| 69 | + - bra vs top | |
| 70 | + - backpack vs bag | |
| 71 | + If the user explicitly searched for one of these, the others should usually be judged Irrelevant. | |
| 72 | + | |
| 73 | +3. If the core product type matches, then evaluate attributes. | |
| 74 | + - If attributes fully match → Exact | |
| 75 | + - If attributes are missing, uncertain, or only partially matched → Partial | |
| 76 | + - If attributes clearly contradict an explicit important requirement → Irrelevant | |
| 77 | + | |
| 78 | +4. Distinguish carefully between "not mentioned" and "contradicted". | |
| 79 | + - If an attribute is not mentioned or cannot be verified, prefer Partial. | |
| 80 | + - If an attribute is explicitly opposite to the query, use Irrelevant. | |
| 81 | + | |
| 82 | +5. Do not overuse Exact. | |
| 83 | + Exact requires strong evidence that the product satisfies the user's stated intent, not just the general category. | |
| 84 | + | |
| 85 | +Query: {query} | |
| 86 | + | |
| 87 | +Products: | |
| 88 | +{lines} | |
| 89 | + | |
| 90 | +## Output Format | |
| 91 | +Strictly output {n} lines, each line containing exactly one of: | |
| 92 | +Exact | |
| 93 | +Partial | |
| 94 | +Irrelevant | |
| 95 | + | |
| 96 | +The lines must correspond sequentially to the products above. | |
| 97 | +Do not output any other information. | |
| 98 | +""" | |
| 99 | + | |
| 100 | +_CLASSIFY_BATCH_SIMPLE_TEMPLATE__zh = """你是一个服装电商搜索系统的相关性评估助手。 | |
| 101 | +给定用户查询和每个产品的信息,为每个产品分配一个相关性标签。 | |
| 102 | + | |
| 103 | +## 相关性标签 | |
| 104 | + | |
| 105 | +### 完全相关 | |
| 106 | +该产品完全满足用户的搜索意图。 | |
| 107 | + | |
| 108 | +在以下情况使用完全相关: | |
| 109 | +- 产品与查询中指定的核心产品类型相匹配。 | |
| 110 | +- 满足了查询中明确说明的关键要求。 | |
| 111 | +- 与用户明确的任何要求没有明显冲突。 | |
| 112 | + | |
| 113 | +典型情况: | |
| 114 | +- 查询仅包含产品类型,而产品恰好是该产品类型。 | |
| 115 | +- 查询包含产品类型 + 属性,而产品与该类型及这些属性相匹配。 | |
| 116 | + | |
| 117 | +### 部分相关 | |
| 118 | +该产品满足了用户的主要意图,但并未完全满足所有指定的细节。 | |
| 119 | + | |
| 120 | +在以下情况使用部分相关: | |
| 121 | +- 核心产品类型匹配,但部分请求的属性无法确认。 | |
| 122 | +- 核心产品类型匹配,但仅满足了部分次要属性。 | |
| 123 | +- 核心产品类型匹配,但与查询存在微小或非关键的偏差。 | |
| 124 | +- 产品未明显违背用户的明确要求,但也不能视为完全匹配。 | |
| 125 | + | |
| 126 | +典型情况: | |
| 127 | +- 查询:"红色修身T恤",产品:"女士T恤" → 颜色/版型无法确认。 | |
| 128 | +- 查询:"红色修身T恤",产品:"蓝色修身T恤" → 产品类型和版型匹配,但颜色不同。 | |
| 129 | +- 查询:"棉质长袖衬衫",产品:"长袖衬衫" → 材质未确认。 | |
| 130 | + | |
| 131 | +重要提示: | |
| 132 | +部分相关主要应在核心产品类型正确,但详细要求不完整、不确定或仅部分匹配时使用。 | |
| 133 | + | |
| 134 | +### 不相关 | |
| 135 | +该产品不满足用户的主要购物意图。 | |
| 136 | + | |
| 137 | +在以下情况使用不相关: | |
| 138 | +- 核心产品类型与查询不匹配。 | |
| 139 | +- 产品匹配了大致类别,但属于购物者不会认为可互换的不同产品类型。 | |
| 140 | +- 核心产品类型匹配,但产品明显违背了查询中一个明确且重要的要求。 | |
| 141 | + | |
| 142 | +典型情况: | |
| 143 | +- 查询:"裤子",产品:"鞋子" → 错误的产品类型。 | |
| 144 | +- 查询:"连衣裙",产品:"半身裙" → 不同的产品类型。 | |
| 145 | +- 查询:"修身裤",产品:"宽松阔腿裤" → 版型上明显矛盾。 | |
| 146 | +- 查询:"无袖连衣裙",产品:"长袖连衣裙" → 袖型上明显矛盾。 | |
| 147 | + | |
| 148 | +## 决策原则 | |
| 149 | + | |
| 150 | +1. 产品类型是最高优先级的因素。 | |
| 151 | + 如果查询明确指定了具体产品类型,结果必须匹配该产品类型才能被评为完全相关或部分相关。 | |
| 152 | + 不同的产品类型通常是不相关,而非部分相关。 | |
| 153 | + | |
| 154 | +2. 当查询明确时,相似或相关的产品类型不可互换。 | |
| 155 | + 例如: | |
| 156 | + - 连衣裙 vs 半身裙 vs 连体裤 | |
| 157 | + - 牛仔裤 vs 裤子 | |
| 158 | + - T恤 vs 衬衫 | |
| 159 | + - 开衫 vs 毛衣 | |
| 160 | + - 靴子 vs 鞋子 | |
| 161 | + - 文胸 vs 上衣 | |
| 162 | + - 双肩包 vs 包 | |
| 163 | + 如果用户明确搜索了其中一种,其他的通常应判断为不相关。 | |
| 164 | + | |
| 165 | +3. 如果核心产品类型匹配,则评估属性。 | |
| 166 | + - 如果属性完全匹配 → 完全相关 | |
| 167 | + - 如果属性缺失、不确定或仅部分匹配 → 部分相关 | |
| 168 | + - 如果属性明显违背明确的重点要求 → 不相关 | |
| 169 | + | |
| 170 | +4. 仔细区分“未提及”和“矛盾”。 | |
| 171 | + - 如果属性未提及或无法验证,倾向于部分相关。 | |
| 172 | + - 如果属性与查询明确相反,使用不相关。 | |
| 173 | + | |
| 174 | +5. 不要过度使用完全相关。 | |
| 175 | + 完全相关需要强有力的证据表明产品满足了用户声明的意图,而不仅仅是通用类别。 | |
| 176 | + | |
| 177 | +查询: {query} | |
| 178 | + | |
| 179 | +产品: | |
| 180 | +{lines} | |
| 181 | + | |
| 182 | +## 输出格式 | |
| 183 | +严格输出 {n} 行,每行包含以下之一: | |
| 184 | +Exact | |
| 185 | +Partial | |
| 186 | +Irrelevant | |
| 187 | + | |
| 188 | +这些行必须按顺序对应上面的产品。 | |
| 189 | +不要输出任何其他信息。 | |
| 190 | +""" | |
| 191 | + | |
| 192 | + | |
| 8 | 193 | |
| 9 | 194 | def classify_batch_simple_prompt(query: str, numbered_doc_lines: Sequence[str]) -> str: |
| 10 | 195 | lines = "\n".join(numbered_doc_lines) |
| 11 | 196 | n = len(numbered_doc_lines) |
| 12 | - return ( | |
| 13 | - "You are an e-commerce search result relevance evaluation assistant. " | |
| 14 | - "Based on the user query and each product's information, output the relevance level for each product.\n\n" | |
| 15 | - "## Relevance Level Criteria\n" | |
| 16 | - "Exact — Fully matches the user's search intent.\n" | |
| 17 | - "Partial — Primary intent satisfied (same category or similar use, basically aligns with search intent), " | |
| 18 | - "but secondary attributes such as color, style, size, fit, length, or material deviate from or cannot be confirmed.\n" | |
| 19 | - "Irrelevant — Category or use case mismatched, primary intent not satisfied.\n\n" | |
| 20 | - "Additional judging guidance:\n" | |
| 21 | - "- If the query clearly names a product type, product type matching has the highest priority. " | |
| 22 | - "Dress vs skirt vs jumpsuit, jeans vs pants, T-shirt vs blouse, cardigan vs sweater, boots vs shoes, " | |
| 23 | - "bra vs top, backpack vs bag are not interchangeable.\n" | |
| 24 | - "- When the query clearly specifies a concrete product type, a different product type should usually be Irrelevant, not Partial.\n" | |
| 25 | - "- If an attribute looks missing or uncertain, prefer Partial instead of Exact.\n" | |
| 26 | - "- Do not guess missing attributes.\n" | |
| 27 | - "- Graphic, slogan, holiday, memorial, or message tees are not Exact for a plain tee query unless that graphic/theme is requested.\n" | |
| 28 | - "- Be conservative with Exact.\n\n" | |
| 29 | - f"Query: {query}\n\n" | |
| 30 | - "Products:\n" | |
| 31 | - f"{lines}\n\n" | |
| 32 | - "## Output Format\n" | |
| 33 | - f"Strictly output {n} lines, each line containing exactly one of Exact / Partial / Irrelevant. " | |
| 34 | - "They must correspond sequentially to the products above. Do not output any other information.\n" | |
| 35 | - ) | |
| 197 | + return _CLASSIFY_BATCH_SIMPLE_TEMPLATE.format(query=query, lines=lines, n=n) | |
| 198 | + | |
| 199 | + | |
| 200 | +_EXTRACT_QUERY_PROFILE_TEMPLATE = """You are building a structured intent profile for e-commerce relevance judging. | |
| 201 | +Use the original user query as the source of truth. Parser hints may help, but if a hint conflicts with the original query, trust the original query. | |
| 202 | +Be conservative: only mark an attribute as required if the user explicitly asked for it. | |
| 203 | + | |
| 204 | +Return JSON with this schema: | |
| 205 | +{{ | |
| 206 | + "normalized_query_en": string, | |
| 207 | + "primary_category": string, | |
| 208 | + "allowed_categories": [string], | |
| 209 | + "required_attributes": [ | |
| 210 | + {{"name": string, "required_terms": [string], "conflicting_terms": [string], "match_mode": "explicit"}} | |
| 211 | + ], | |
| 212 | + "notes": [string] | |
| 213 | +}} | |
| 214 | + | |
| 215 | +Guidelines: | |
| 216 | +- Exact later will require explicit evidence for all required attributes. | |
| 217 | +- allowed_categories should contain only near-synonyms of the same product type, not substitutes. For example dress can allow midi dress/cocktail dress, but not skirt, top, jumpsuit, or outfit unless the query explicitly asks for them. | |
| 218 | +- If the query asks for dress/skirt/jeans/t-shirt, near but different product types are not Exact. | |
| 219 | +- If the query includes color, fit, silhouette, or length, include them as required_attributes. | |
| 220 | +- For fit words, include conflicting terms when obvious, e.g. fitted conflicts with oversized/loose; oversized conflicts with fitted/tight. | |
| 221 | +- For color, include conflicting colors only when clear from the query. | |
| 222 | + | |
| 223 | +Original query: {query} | |
| 224 | +Parser hints JSON: {hints_json} | |
| 225 | +""" | |
| 36 | 226 | |
| 37 | 227 | |
| 38 | 228 | def extract_query_profile_prompt(query: str, parser_hints: Dict[str, Any]) -> str: |
| 39 | 229 | hints_json = json.dumps(parser_hints, ensure_ascii=False) |
| 40 | - return ( | |
| 41 | - "You are building a structured intent profile for e-commerce relevance judging.\n" | |
| 42 | - "Use the original user query as the source of truth. Parser hints may help, but if a hint conflicts with the original query, trust the original query.\n" | |
| 43 | - "Be conservative: only mark an attribute as required if the user explicitly asked for it.\n\n" | |
| 44 | - "Return JSON with this schema:\n" | |
| 45 | - "{\n" | |
| 46 | - ' "normalized_query_en": string,\n' | |
| 47 | - ' "primary_category": string,\n' | |
| 48 | - ' "allowed_categories": [string],\n' | |
| 49 | - ' "required_attributes": [\n' | |
| 50 | - ' {"name": string, "required_terms": [string], "conflicting_terms": [string], "match_mode": "explicit"}\n' | |
| 51 | - " ],\n" | |
| 52 | - ' "notes": [string]\n' | |
| 53 | - "}\n\n" | |
| 54 | - "Guidelines:\n" | |
| 55 | - "- Exact later will require explicit evidence for all required attributes.\n" | |
| 56 | - "- allowed_categories should contain only near-synonyms of the same product type, not substitutes. For example dress can allow midi dress/cocktail dress, but not skirt, top, jumpsuit, or outfit unless the query explicitly asks for them.\n" | |
| 57 | - "- If the query asks for dress/skirt/jeans/t-shirt, near but different product types are not Exact.\n" | |
| 58 | - "- If the query includes color, fit, silhouette, or length, include them as required_attributes.\n" | |
| 59 | - "- For fit words, include conflicting terms when obvious, e.g. fitted conflicts with oversized/loose; oversized conflicts with fitted/tight.\n" | |
| 60 | - "- For color, include conflicting colors only when clear from the query.\n\n" | |
| 61 | - f"Original query: {query}\n" | |
| 62 | - f"Parser hints JSON: {hints_json}\n" | |
| 63 | - ) | |
| 230 | + return _EXTRACT_QUERY_PROFILE_TEMPLATE.format(query=query, hints_json=hints_json) | |
| 231 | + | |
| 232 | + | |
| 233 | +_CLASSIFY_BATCH_COMPLEX_TEMPLATE = """You are an e-commerce search relevance judge. | |
| 234 | +Judge each product against the structured query profile below. | |
| 235 | + | |
| 236 | +Relevance rules: | |
| 237 | +- Exact: product type matches the target intent, and every explicit required attribute is positively supported by the title/options/tags/category. If an attribute is missing or only guessed, it is NOT Exact. | |
| 238 | +- Partial: main product type/use case matches, but some required attribute is missing, weaker, uncertain, or only approximately matched. | |
| 239 | +- Irrelevant: product type/use case mismatched, or an explicit required attribute clearly conflicts. | |
| 240 | +- Be conservative with Exact. | |
| 241 | +- Graphic/holiday/message tees are not Exact for a plain color/style tee query unless that graphic/theme was requested. | |
| 242 | +- Jumpsuit/romper/set is not Exact for dress/skirt/jeans queries. | |
| 243 | + | |
| 244 | +Original query: {query} | |
| 245 | +Structured query profile JSON: {profile_json} | |
| 246 | + | |
| 247 | +Products: | |
| 248 | +{lines} | |
| 249 | + | |
| 250 | +Return JSON only, with schema: | |
| 251 | +{{"labels":[{{"index":1,"label":"Exact","reason":"short phrase"}}]}} | |
| 252 | +""" | |
| 64 | 253 | |
| 65 | 254 | |
| 66 | 255 | def classify_batch_complex_prompt( |
| ... | ... | @@ -70,20 +259,8 @@ def classify_batch_complex_prompt( |
| 70 | 259 | ) -> str: |
| 71 | 260 | lines = "\n".join(numbered_doc_lines) |
| 72 | 261 | profile_json = json.dumps(query_profile, ensure_ascii=False) |
| 73 | - return ( | |
| 74 | - "You are an e-commerce search relevance judge.\n" | |
| 75 | - "Judge each product against the structured query profile below.\n\n" | |
| 76 | - "Relevance rules:\n" | |
| 77 | - "- Exact: product type matches the target intent, and every explicit required attribute is positively supported by the title/options/tags/category. If an attribute is missing or only guessed, it is NOT Exact.\n" | |
| 78 | - "- Partial: main product type/use case matches, but some required attribute is missing, weaker, uncertain, or only approximately matched.\n" | |
| 79 | - "- Irrelevant: product type/use case mismatched, or an explicit required attribute clearly conflicts.\n" | |
| 80 | - "- Be conservative with Exact.\n" | |
| 81 | - "- Graphic/holiday/message tees are not Exact for a plain color/style tee query unless that graphic/theme was requested.\n" | |
| 82 | - "- Jumpsuit/romper/set is not Exact for dress/skirt/jeans queries.\n\n" | |
| 83 | - f"Original query: {query}\n" | |
| 84 | - f"Structured query profile JSON: {profile_json}\n\n" | |
| 85 | - "Products:\n" | |
| 86 | - f"{lines}\n\n" | |
| 87 | - "Return JSON only, with schema:\n" | |
| 88 | - '{"labels":[{"index":1,"label":"Exact","reason":"short phrase"}]}\n' | |
| 262 | + return _CLASSIFY_BATCH_COMPLEX_TEMPLATE.format( | |
| 263 | + query=query, | |
| 264 | + profile_json=profile_json, | |
| 265 | + lines=lines, | |
| 89 | 266 | ) | ... | ... |