Blame view

scripts/evaluation/eval_framework/prompts.py 5.21 KB
c81b0fc1   tangwang   scripts/evaluatio...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
  """LLM prompt templates for relevance judging (keep wording changes here)."""
  
  from __future__ import annotations
  
  import json
  from typing import Any, Dict, Sequence
  
  
  def classify_batch_simple_prompt(query: str, numbered_doc_lines: Sequence[str]) -> str:
      """Build the one-shot relevance-labelling prompt for a query and a batch of products.

      Args:
          query: User search query; interpolated verbatim after ``Query:``.
          numbered_doc_lines: One text line per product (presumably already
              numbered by the caller). They are newline-joined under
              ``Products:`` and their count is the number of output lines the
              prompt demands.

      Returns:
          Prompt text asking for exactly one of Exact / Partial / Irrelevant per
          product, one label per line, in product order.
      """
      product_block = "\n".join(numbered_doc_lines)
      expected_count = len(numbered_doc_lines)

      # Each section below is one blank-line-separated paragraph of the prompt.
      role = (
          "You are an e-commerce search result relevance evaluation assistant. "
          "Based on the user query and each product's information, output the relevance level for each product."
      )
      criteria = "\n".join(
          (
              "## Relevance Level Criteria",
              "Exact — Fully matches the user's search intent.",
              (
                  "Partial — Primary intent satisfied (same category or similar use, basically aligns with search intent), "
                  "but secondary attributes such as color, style, size, fit, length, or material deviate from or cannot be confirmed."
              ),
              "Irrelevant — Category or use case mismatched, primary intent not satisfied.",
          )
      )
      guidance = "\n".join(
          (
              "Additional judging guidance:",
              (
                  "- If the query clearly names a product type, product type matching has the highest priority. "
                  "Dress vs skirt vs jumpsuit, jeans vs pants, T-shirt vs blouse, cardigan vs sweater, boots vs shoes, "
                  "bra vs top, backpack vs bag are not interchangeable."
              ),
              "- When the query clearly specifies a concrete product type, a different product type should usually be Irrelevant, not Partial.",
              "- If an attribute looks missing or uncertain, prefer Partial instead of Exact.",
              "- Do not guess missing attributes.",
              "- Graphic, slogan, holiday, memorial, or message tees are not Exact for a plain tee query unless that graphic/theme is requested.",
              "- Be conservative with Exact.",
          )
      )
      # The trailing newline belongs to the final paragraph, not to a separator.
      output_format = (
          "## Output Format\n"
          f"Strictly output {expected_count} lines, each line containing exactly one of Exact / Partial / Irrelevant. "
          "They must correspond sequentially to the products above. Do not output any other information.\n"
      )

      paragraphs = (
          role,
          criteria,
          guidance,
          f"Query: {query}",
          f"Products:\n{product_block}",
          output_format,
      )
      return "\n\n".join(paragraphs)
  
  
  def extract_query_profile_prompt(query: str, parser_hints: Dict[str, Any]) -> str:
      """Build the prompt that asks for a structured intent profile of a search query.

      Args:
          query: Original user query; the prompt tells the model to treat it as
              the source of truth.
          parser_hints: Auxiliary hints, embedded as JSON (non-ASCII characters
              are kept as-is rather than escaped).

      Returns:
          Prompt text containing the instructions, the expected JSON schema, the
          guidelines, and finally the query and serialized hints.
      """
      serialized_hints = json.dumps(parser_hints, ensure_ascii=False)

      intro_lines = [
          "You are building a structured intent profile for e-commerce relevance judging.",
          "Use the original user query as the source of truth. Parser hints may help, but if a hint conflicts with the original query, trust the original query.",
          "Be conservative: only mark an attribute as required if the user explicitly asked for it.",
      ]
      # Schema is shown to the model literally; it is illustrative, not valid JSON.
      schema_lines = [
          "{",
          '  "normalized_query_en": string,',
          '  "primary_category": string,',
          '  "allowed_categories": [string],',
          '  "required_attributes": [',
          '    {"name": string, "required_terms": [string], "conflicting_terms": [string], "match_mode": "explicit"}',
          "  ],",
          '  "notes": [string]',
          "}",
      ]
      guideline_lines = [
          "- Exact later will require explicit evidence for all required attributes.",
          "- allowed_categories should contain only near-synonyms of the same product type, not substitutes. For example dress can allow midi dress/cocktail dress, but not skirt, top, jumpsuit, or outfit unless the query explicitly asks for them.",
          "- If the query asks for dress/skirt/jeans/t-shirt, near but different product types are not Exact.",
          "- If the query includes color, fit, silhouette, or length, include them as required_attributes.",
          "- For fit words, include conflicting terms when obvious, e.g. fitted conflicts with oversized/loose; oversized conflicts with fitted/tight.",
          "- For color, include conflicting colors only when clear from the query.",
      ]

      # Paragraphs are separated by one blank line; only the last keeps a trailing newline.
      paragraphs = [
          "\n".join(intro_lines),
          "Return JSON with this schema:\n" + "\n".join(schema_lines),
          "Guidelines:\n" + "\n".join(guideline_lines),
          f"Original query: {query}\nParser hints JSON: {serialized_hints}\n",
      ]
      return "\n\n".join(paragraphs)
  
  
  def classify_batch_complex_prompt(
      query: str,
      query_profile: Dict[str, Any],
      numbered_doc_lines: Sequence[str],
  ) -> str:
      """Build the profile-aware relevance-judging prompt for a batch of products.

      Args:
          query: Original user query, shown to the model alongside the profile.
          query_profile: Structured query profile; embedded as JSON with
              non-ASCII characters left unescaped.
          numbered_doc_lines: One text line per product (presumably already
              numbered by the caller), newline-joined under ``Products:``.

      Returns:
          Prompt text that states the Exact / Partial / Irrelevant rules and asks
          for a JSON-only answer in the ``{"labels": [...]}`` shape.
      """
      product_block = "\n".join(numbered_doc_lines)
      serialized_profile = json.dumps(query_profile, ensure_ascii=False)

      rule_lines = [
          "- Exact: product type matches the target intent, and every explicit required attribute is positively supported by the title/options/tags/category. If an attribute is missing or only guessed, it is NOT Exact.",
          "- Partial: main product type/use case matches, but some required attribute is missing, weaker, uncertain, or only approximately matched.",
          "- Irrelevant: product type/use case mismatched, or an explicit required attribute clearly conflicts.",
          "- Be conservative with Exact.",
          "- Graphic/holiday/message tees are not Exact for a plain color/style tee query unless that graphic/theme was requested.",
          "- Jumpsuit/romper/set is not Exact for dress/skirt/jeans queries.",
      ]

      # Paragraphs are separated by one blank line; only the last keeps a trailing newline.
      paragraphs = [
          "You are an e-commerce search relevance judge.\n"
          "Judge each product against the structured query profile below.",
          "Relevance rules:\n" + "\n".join(rule_lines),
          f"Original query: {query}\nStructured query profile JSON: {serialized_profile}",
          f"Products:\n{product_block}",
          # Plain (non-f) string: the braces are part of the example schema.
          "Return JSON only, with schema:\n"
          '{"labels":[{"index":1,"label":"Exact","reason":"short phrase"}]}\n',
      ]
      return "\n\n".join(paragraphs)