Commit 3984ec640ae8b23d680e0d8d699b7e8c68cab0d3
1 parent
c81b0fc1
evaluation 标注标准优化
Made-with: Cursor
Showing 2 changed files with 246 additions and 64 deletions
Show diff stats
.gitignore
| @@ -77,3 +77,8 @@ logs_*/ | @@ -77,3 +77,8 @@ logs_*/ | ||
| 77 | 77 | ||
| 78 | models/ | 78 | models/ |
| 79 | model_cache/ | 79 | model_cache/ |
| 80 | + | ||
| 81 | +# Search evaluation: local SQLite DB and generated outputs (large / regenerable) | ||
| 82 | +artifacts/search_evaluation/*.sqlite3 | ||
| 83 | +artifacts/search_evaluation/batch_reports/ | ||
| 84 | +artifacts/search_evaluation/tuning_runs/ |
scripts/evaluation/eval_framework/prompts.py
| @@ -5,62 +5,251 @@ from __future__ import annotations | @@ -5,62 +5,251 @@ from __future__ import annotations | ||
| 5 | import json | 5 | import json |
| 6 | from typing import Any, Dict, Sequence | 6 | from typing import Any, Dict, Sequence |
| 7 | 7 | ||
| 8 | +_CLASSIFY_BATCH_SIMPLE_TEMPLATE = """You are a relevance evaluation assistant for an apparel e-commerce search system. | ||
| 9 | +Given the user query and each product's information, assign one relevance label to each product. | ||
| 10 | + | ||
| 11 | +## Relevance Labels | ||
| 12 | + | ||
| 13 | +### Exact | ||
| 14 | +The product fully satisfies the user's search intent. | ||
| 15 | + | ||
| 16 | +Use Exact when: | ||
| 17 | +- The product matches the core product type named in the query. | ||
| 18 | +- The key requirements explicitly stated in the query are satisfied. | ||
| 19 | +- There is no clear conflict with any explicit user requirement. | ||
| 20 | + | ||
| 21 | +Typical cases: | ||
| 22 | +- The query is only a product type, and the product is exactly that product type. | ||
| 23 | +- The query includes product type + attributes, and the product matches the type and those attributes. | ||
| 24 | + | ||
| 25 | +### Partial | ||
| 26 | +The product satisfies the user's primary intent, but does not fully satisfy all specified details. | ||
| 27 | + | ||
| 28 | +Use Partial when: | ||
| 29 | +- The core product type matches, but some requested attributes cannot be confirmed. | ||
| 30 | +- The core product type matches, but only some secondary attributes are satisfied. | ||
| 31 | +- The core product type matches, and there are minor or non-critical deviations from the query. | ||
| 32 | +- The product does not clearly contradict the user's explicit requirements, but it also cannot be considered a full match. | ||
| 33 | + | ||
| 34 | +Typical cases: | ||
| 35 | +- Query: "red fitted t-shirt", product: "Women's T-Shirt" → color/fit cannot be confirmed. | ||
| 36 | +- Query: "red fitted t-shirt", product: "Blue Fitted T-Shirt" → product type and fit match, but color differs. | ||
| 37 | +- Query: "cotton long sleeve blouse", product: "Long Sleeve Blouse" → material not confirmed. | ||
| 38 | + | ||
| 39 | +Important: | ||
| 40 | +Partial should mainly be used when the core product type is correct, but the detailed requirements are incomplete, uncertain, or only partially matched. | ||
| 41 | + | ||
| 42 | +### Irrelevant | ||
| 43 | +The product does not satisfy the user's main shopping intent. | ||
| 44 | + | ||
| 45 | +Use Irrelevant when: | ||
| 46 | +- The core product type does not match the query. | ||
| 47 | +- The product matches the general category but is a different product type that shoppers would not consider interchangeable. | ||
| 48 | +- The core product type matches, but the product clearly contradicts an explicit and important requirement in the query. | ||
| 49 | + | ||
| 50 | +Typical cases: | ||
| 51 | +- Query: "pants", product: "shoes" → wrong product type. | ||
| 52 | +- Query: "dress", product: "skirt" → different product type. | ||
| 53 | +- Query: "fitted pants", product: "loose wide-leg pants" → explicit contradiction on fit. | ||
| 54 | +- Query: "sleeveless dress", product: "long sleeve dress" → explicit contradiction on sleeve style. | ||
| 55 | + | ||
| 56 | +## Decision Principles | ||
| 57 | + | ||
| 58 | +1. Product type is the highest-priority factor. | ||
| 59 | + If the query clearly specifies a concrete product type, the result must match that product type to be Exact or Partial. | ||
| 60 | + A different product type is usually Irrelevant, not Partial. | ||
| 61 | + | ||
| 62 | +2. Similar or related product types are not interchangeable when the query is specific. | ||
| 63 | + For example: | ||
| 64 | + - dress vs skirt vs jumpsuit | ||
| 65 | + - jeans vs pants | ||
| 66 | + - t-shirt vs blouse | ||
| 67 | + - cardigan vs sweater | ||
| 68 | + - boots vs shoes | ||
| 69 | + - bra vs top | ||
| 70 | + - backpack vs bag | ||
| 71 | + If the user explicitly searched for one of these, the others should usually be judged Irrelevant. | ||
| 72 | + | ||
| 73 | +3. If the core product type matches, then evaluate attributes. | ||
| 74 | + - If attributes fully match → Exact | ||
| 75 | + - If attributes are missing, uncertain, or only partially matched → Partial | ||
| 76 | + - If attributes clearly contradict an explicit important requirement → Irrelevant | ||
| 77 | + | ||
| 78 | +4. Distinguish carefully between "not mentioned" and "contradicted". | ||
| 79 | + - If an attribute is not mentioned or cannot be verified, prefer Partial. | ||
| 80 | + - If an attribute is explicitly opposite to the query, use Irrelevant. | ||
| 81 | + | ||
| 82 | +5. Do not overuse Exact. | ||
| 83 | + Exact requires strong evidence that the product satisfies the user's stated intent, not just the general category. | ||
| 84 | + | ||
| 85 | +Query: {query} | ||
| 86 | + | ||
| 87 | +Products: | ||
| 88 | +{lines} | ||
| 89 | + | ||
| 90 | +## Output Format | ||
| 91 | +Strictly output {n} lines, each line containing exactly one of: | ||
| 92 | +Exact | ||
| 93 | +Partial | ||
| 94 | +Irrelevant | ||
| 95 | + | ||
| 96 | +The lines must correspond sequentially to the products above. | ||
| 97 | +Do not output any other information. | ||
| 98 | +""" | ||
| 99 | + | ||
| 100 | +_CLASSIFY_BATCH_SIMPLE_TEMPLATE__zh = """你是一个服装电商搜索系统的相关性评估助手。 | ||
| 101 | +给定用户查询和每个产品的信息,为每个产品分配一个相关性标签。 | ||
| 102 | + | ||
| 103 | +## 相关性标签 | ||
| 104 | + | ||
| 105 | +### 完全相关 | ||
| 106 | +该产品完全满足用户的搜索意图。 | ||
| 107 | + | ||
| 108 | +在以下情况使用完全相关: | ||
| 109 | +- 产品与查询中指定的核心产品类型相匹配。 | ||
| 110 | +- 满足了查询中明确说明的关键要求。 | ||
| 111 | +- 与用户明确的任何要求没有明显冲突。 | ||
| 112 | + | ||
| 113 | +典型情况: | ||
| 114 | +- 查询仅包含产品类型,而产品恰好是该产品类型。 | ||
| 115 | +- 查询包含产品类型 + 属性,而产品与该类型及这些属性相匹配。 | ||
| 116 | + | ||
| 117 | +### 部分相关 | ||
| 118 | +该产品满足了用户的主要意图,但并未完全满足所有指定的细节。 | ||
| 119 | + | ||
| 120 | +在以下情况使用部分相关: | ||
| 121 | +- 核心产品类型匹配,但部分请求的属性无法确认。 | ||
| 122 | +- 核心产品类型匹配,但仅满足了部分次要属性。 | ||
| 123 | +- 核心产品类型匹配,但与查询存在微小或非关键的偏差。 | ||
| 124 | +- 产品未明显违背用户的明确要求,但也不能视为完全匹配。 | ||
| 125 | + | ||
| 126 | +典型情况: | ||
| 127 | +- 查询:"红色修身T恤",产品:"女士T恤" → 颜色/版型无法确认。 | ||
| 128 | +- 查询:"红色修身T恤",产品:"蓝色修身T恤" → 产品类型和版型匹配,但颜色不同。 | ||
| 129 | +- 查询:"棉质长袖衬衫",产品:"长袖衬衫" → 材质未确认。 | ||
| 130 | + | ||
| 131 | +重要提示: | ||
| 132 | +部分相关主要应在核心产品类型正确,但详细要求不完整、不确定或仅部分匹配时使用。 | ||
| 133 | + | ||
| 134 | +### 不相关 | ||
| 135 | +该产品不满足用户的主要购物意图。 | ||
| 136 | + | ||
| 137 | +在以下情况使用不相关: | ||
| 138 | +- 核心产品类型与查询不匹配。 | ||
| 139 | +- 产品匹配了大致类别,但属于购物者不会认为可互换的不同产品类型。 | ||
| 140 | +- 核心产品类型匹配,但产品明显违背了查询中一个明确且重要的要求。 | ||
| 141 | + | ||
| 142 | +典型情况: | ||
| 143 | +- 查询:"裤子",产品:"鞋子" → 错误的产品类型。 | ||
| 144 | +- 查询:"连衣裙",产品:"半身裙" → 不同的产品类型。 | ||
| 145 | +- 查询:"修身裤",产品:"宽松阔腿裤" → 版型上明显矛盾。 | ||
| 146 | +- 查询:"无袖连衣裙",产品:"长袖连衣裙" → 袖型上明显矛盾。 | ||
| 147 | + | ||
| 148 | +## 决策原则 | ||
| 149 | + | ||
| 150 | +1. 产品类型是最高优先级的因素。 | ||
| 151 | + 如果查询明确指定了具体产品类型,结果必须匹配该产品类型才能被评为完全相关或部分相关。 | ||
| 152 | + 不同的产品类型通常是不相关,而非部分相关。 | ||
| 153 | + | ||
| 154 | +2. 当查询明确时,相似或相关的产品类型不可互换。 | ||
| 155 | + 例如: | ||
| 156 | + - 连衣裙 vs 半身裙 vs 连体裤 | ||
| 157 | + - 牛仔裤 vs 裤子 | ||
| 158 | + - T恤 vs 衬衫 | ||
| 159 | + - 开衫 vs 毛衣 | ||
| 160 | + - 靴子 vs 鞋子 | ||
| 161 | + - 文胸 vs 上衣 | ||
| 162 | + - 双肩包 vs 包 | ||
| 163 | + 如果用户明确搜索了其中一种,其他的通常应判断为不相关。 | ||
| 164 | + | ||
| 165 | +3. 如果核心产品类型匹配,则评估属性。 | ||
| 166 | + - 如果属性完全匹配 → 完全相关 | ||
| 167 | + - 如果属性缺失、不确定或仅部分匹配 → 部分相关 | ||
| 168 | + - 如果属性明显违背明确的重点要求 → 不相关 | ||
| 169 | + | ||
| 170 | +4. 仔细区分“未提及”和“矛盾”。 | ||
| 171 | + - 如果属性未提及或无法验证,倾向于部分相关。 | ||
| 172 | + - 如果属性与查询明确相反,使用不相关。 | ||
| 173 | + | ||
| 174 | +5. 不要过度使用完全相关。 | ||
| 175 | + 完全相关需要强有力的证据表明产品满足了用户声明的意图,而不仅仅是通用类别。 | ||
| 176 | + | ||
| 177 | +查询: {query} | ||
| 178 | + | ||
| 179 | +产品: | ||
| 180 | +{lines} | ||
| 181 | + | ||
| 182 | +## 输出格式 | ||
| 183 | +严格输出 {n} 行,每行包含以下之一: | ||
| 184 | +Exact | ||
| 185 | +Partial | ||
| 186 | +Irrelevant | ||
| 187 | + | ||
| 188 | +这些行必须按顺序对应上面的产品。 | ||
| 189 | +不要输出任何其他信息。 | ||
| 190 | +""" | ||
| 191 | + | ||
| 192 | + | ||
| 8 | 193 | ||
| 9 | def classify_batch_simple_prompt(query: str, numbered_doc_lines: Sequence[str]) -> str: | 194 | def classify_batch_simple_prompt(query: str, numbered_doc_lines: Sequence[str]) -> str: |
| 10 | lines = "\n".join(numbered_doc_lines) | 195 | lines = "\n".join(numbered_doc_lines) |
| 11 | n = len(numbered_doc_lines) | 196 | n = len(numbered_doc_lines) |
| 12 | - return ( | ||
| 13 | - "You are an e-commerce search result relevance evaluation assistant. " | ||
| 14 | - "Based on the user query and each product's information, output the relevance level for each product.\n\n" | ||
| 15 | - "## Relevance Level Criteria\n" | ||
| 16 | - "Exact — Fully matches the user's search intent.\n" | ||
| 17 | - "Partial — Primary intent satisfied (same category or similar use, basically aligns with search intent), " | ||
| 18 | - "but secondary attributes such as color, style, size, fit, length, or material deviate from or cannot be confirmed.\n" | ||
| 19 | - "Irrelevant — Category or use case mismatched, primary intent not satisfied.\n\n" | ||
| 20 | - "Additional judging guidance:\n" | ||
| 21 | - "- If the query clearly names a product type, product type matching has the highest priority. " | ||
| 22 | - "Dress vs skirt vs jumpsuit, jeans vs pants, T-shirt vs blouse, cardigan vs sweater, boots vs shoes, " | ||
| 23 | - "bra vs top, backpack vs bag are not interchangeable.\n" | ||
| 24 | - "- When the query clearly specifies a concrete product type, a different product type should usually be Irrelevant, not Partial.\n" | ||
| 25 | - "- If an attribute looks missing or uncertain, prefer Partial instead of Exact.\n" | ||
| 26 | - "- Do not guess missing attributes.\n" | ||
| 27 | - "- Graphic, slogan, holiday, memorial, or message tees are not Exact for a plain tee query unless that graphic/theme is requested.\n" | ||
| 28 | - "- Be conservative with Exact.\n\n" | ||
| 29 | - f"Query: {query}\n\n" | ||
| 30 | - "Products:\n" | ||
| 31 | - f"{lines}\n\n" | ||
| 32 | - "## Output Format\n" | ||
| 33 | - f"Strictly output {n} lines, each line containing exactly one of Exact / Partial / Irrelevant. " | ||
| 34 | - "They must correspond sequentially to the products above. Do not output any other information.\n" | ||
| 35 | - ) | 197 | + return _CLASSIFY_BATCH_SIMPLE_TEMPLATE.format(query=query, lines=lines, n=n) |
| 198 | + | ||
| 199 | + | ||
| 200 | +_EXTRACT_QUERY_PROFILE_TEMPLATE = """You are building a structured intent profile for e-commerce relevance judging. | ||
| 201 | +Use the original user query as the source of truth. Parser hints may help, but if a hint conflicts with the original query, trust the original query. | ||
| 202 | +Be conservative: only mark an attribute as required if the user explicitly asked for it. | ||
| 203 | + | ||
| 204 | +Return JSON with this schema: | ||
| 205 | +{{ | ||
| 206 | + "normalized_query_en": string, | ||
| 207 | + "primary_category": string, | ||
| 208 | + "allowed_categories": [string], | ||
| 209 | + "required_attributes": [ | ||
| 210 | + {{"name": string, "required_terms": [string], "conflicting_terms": [string], "match_mode": "explicit"}} | ||
| 211 | + ], | ||
| 212 | + "notes": [string] | ||
| 213 | +}} | ||
| 214 | + | ||
| 215 | +Guidelines: | ||
| 216 | +- Exact later will require explicit evidence for all required attributes. | ||
| 217 | +- allowed_categories should contain only near-synonyms of the same product type, not substitutes. For example dress can allow midi dress/cocktail dress, but not skirt, top, jumpsuit, or outfit unless the query explicitly asks for them. | ||
| 218 | +- If the query asks for dress/skirt/jeans/t-shirt, near but different product types are not Exact. | ||
| 219 | +- If the query includes color, fit, silhouette, or length, include them as required_attributes. | ||
| 220 | +- For fit words, include conflicting terms when obvious, e.g. fitted conflicts with oversized/loose; oversized conflicts with fitted/tight. | ||
| 221 | +- For color, include conflicting colors only when clear from the query. | ||
| 222 | + | ||
| 223 | +Original query: {query} | ||
| 224 | +Parser hints JSON: {hints_json} | ||
| 225 | +""" | ||
| 36 | 226 | ||
| 37 | 227 | ||
| 38 | def extract_query_profile_prompt(query: str, parser_hints: Dict[str, Any]) -> str: | 228 | def extract_query_profile_prompt(query: str, parser_hints: Dict[str, Any]) -> str: |
| 39 | hints_json = json.dumps(parser_hints, ensure_ascii=False) | 229 | hints_json = json.dumps(parser_hints, ensure_ascii=False) |
| 40 | - return ( | ||
| 41 | - "You are building a structured intent profile for e-commerce relevance judging.\n" | ||
| 42 | - "Use the original user query as the source of truth. Parser hints may help, but if a hint conflicts with the original query, trust the original query.\n" | ||
| 43 | - "Be conservative: only mark an attribute as required if the user explicitly asked for it.\n\n" | ||
| 44 | - "Return JSON with this schema:\n" | ||
| 45 | - "{\n" | ||
| 46 | - ' "normalized_query_en": string,\n' | ||
| 47 | - ' "primary_category": string,\n' | ||
| 48 | - ' "allowed_categories": [string],\n' | ||
| 49 | - ' "required_attributes": [\n' | ||
| 50 | - ' {"name": string, "required_terms": [string], "conflicting_terms": [string], "match_mode": "explicit"}\n' | ||
| 51 | - " ],\n" | ||
| 52 | - ' "notes": [string]\n' | ||
| 53 | - "}\n\n" | ||
| 54 | - "Guidelines:\n" | ||
| 55 | - "- Exact later will require explicit evidence for all required attributes.\n" | ||
| 56 | - "- allowed_categories should contain only near-synonyms of the same product type, not substitutes. For example dress can allow midi dress/cocktail dress, but not skirt, top, jumpsuit, or outfit unless the query explicitly asks for them.\n" | ||
| 57 | - "- If the query asks for dress/skirt/jeans/t-shirt, near but different product types are not Exact.\n" | ||
| 58 | - "- If the query includes color, fit, silhouette, or length, include them as required_attributes.\n" | ||
| 59 | - "- For fit words, include conflicting terms when obvious, e.g. fitted conflicts with oversized/loose; oversized conflicts with fitted/tight.\n" | ||
| 60 | - "- For color, include conflicting colors only when clear from the query.\n\n" | ||
| 61 | - f"Original query: {query}\n" | ||
| 62 | - f"Parser hints JSON: {hints_json}\n" | ||
| 63 | - ) | 230 | + return _EXTRACT_QUERY_PROFILE_TEMPLATE.format(query=query, hints_json=hints_json) |
| 231 | + | ||
| 232 | + | ||
| 233 | +_CLASSIFY_BATCH_COMPLEX_TEMPLATE = """You are an e-commerce search relevance judge. | ||
| 234 | +Judge each product against the structured query profile below. | ||
| 235 | + | ||
| 236 | +Relevance rules: | ||
| 237 | +- Exact: product type matches the target intent, and every explicit required attribute is positively supported by the title/options/tags/category. If an attribute is missing or only guessed, it is NOT Exact. | ||
| 238 | +- Partial: main product type/use case matches, but some required attribute is missing, weaker, uncertain, or only approximately matched. | ||
| 239 | +- Irrelevant: product type/use case mismatched, or an explicit required attribute clearly conflicts. | ||
| 240 | +- Be conservative with Exact. | ||
| 241 | +- Graphic/holiday/message tees are not Exact for a plain color/style tee query unless that graphic/theme was requested. | ||
| 242 | +- Jumpsuit/romper/set is not Exact for dress/skirt/jeans queries. | ||
| 243 | + | ||
| 244 | +Original query: {query} | ||
| 245 | +Structured query profile JSON: {profile_json} | ||
| 246 | + | ||
| 247 | +Products: | ||
| 248 | +{lines} | ||
| 249 | + | ||
| 250 | +Return JSON only, with schema: | ||
| 251 | +{{"labels":[{{"index":1,"label":"Exact","reason":"short phrase"}}]}} | ||
| 252 | +""" | ||
| 64 | 253 | ||
| 65 | 254 | ||
| 66 | def classify_batch_complex_prompt( | 255 | def classify_batch_complex_prompt( |
| @@ -70,20 +259,8 @@ def classify_batch_complex_prompt( | @@ -70,20 +259,8 @@ def classify_batch_complex_prompt( | ||
| 70 | ) -> str: | 259 | ) -> str: |
| 71 | lines = "\n".join(numbered_doc_lines) | 260 | lines = "\n".join(numbered_doc_lines) |
| 72 | profile_json = json.dumps(query_profile, ensure_ascii=False) | 261 | profile_json = json.dumps(query_profile, ensure_ascii=False) |
| 73 | - return ( | ||
| 74 | - "You are an e-commerce search relevance judge.\n" | ||
| 75 | - "Judge each product against the structured query profile below.\n\n" | ||
| 76 | - "Relevance rules:\n" | ||
| 77 | - "- Exact: product type matches the target intent, and every explicit required attribute is positively supported by the title/options/tags/category. If an attribute is missing or only guessed, it is NOT Exact.\n" | ||
| 78 | - "- Partial: main product type/use case matches, but some required attribute is missing, weaker, uncertain, or only approximately matched.\n" | ||
| 79 | - "- Irrelevant: product type/use case mismatched, or an explicit required attribute clearly conflicts.\n" | ||
| 80 | - "- Be conservative with Exact.\n" | ||
| 81 | - "- Graphic/holiday/message tees are not Exact for a plain color/style tee query unless that graphic/theme was requested.\n" | ||
| 82 | - "- Jumpsuit/romper/set is not Exact for dress/skirt/jeans queries.\n\n" | ||
| 83 | - f"Original query: {query}\n" | ||
| 84 | - f"Structured query profile JSON: {profile_json}\n\n" | ||
| 85 | - "Products:\n" | ||
| 86 | - f"{lines}\n\n" | ||
| 87 | - "Return JSON only, with schema:\n" | ||
| 88 | - '{"labels":[{"index":1,"label":"Exact","reason":"short phrase"}]}\n' | 262 | + return _CLASSIFY_BATCH_COMPLEX_TEMPLATE.format( |
| 263 | + query=query, | ||
| 264 | + profile_json=profile_json, | ||
| 265 | + lines=lines, | ||
| 89 | ) | 266 | ) |