# scripts/evaluation/eval_framework/prompts.py

"""LLM prompt templates for relevance judging (keep wording changes here)."""
from __future__ import annotations
import json
from typing import Any, Dict, Sequence
def classify_batch_simple_prompt(query: str, numbered_doc_lines: Sequence[str]) -> str:
    """Build the one-shot batch relevance-labelling prompt.

    The prompt introduces the judge role, defines the Exact / Partial /
    Irrelevant levels, adds extra judging guidance, embeds the query and the
    product lines, and finally asks for one label line per product.

    Args:
        query: User search query; interpolated into the prompt as-is.
        numbered_doc_lines: Pre-formatted product lines. They are joined with
            newlines, and their count is quoted in the output-format section.

    Returns:
        The complete prompt text.
    """
    product_block = "\n".join(numbered_doc_lines)
    product_count = len(numbered_doc_lines)

    role = (
        "You are an e-commerce search result relevance evaluation assistant. "
        "Based on the user query and each product's information, output the relevance level for each product.\n\n"
    )
    criteria = (
        "## Relevance Level Criteria\n"
        "Exact — Fully matches the user's search intent.\n"
        "Partial — Primary intent satisfied (same category or similar use, basically aligns with search intent), "
        "but secondary attributes such as color, style, size, fit, length, or material deviate from or cannot be confirmed.\n"
        "Irrelevant — Category or use case mismatched, primary intent not satisfied.\n\n"
    )
    guidance = (
        "Additional judging guidance:\n"
        "- If the query clearly names a product type, product type matching has the highest priority. "
        "Dress vs skirt vs jumpsuit, jeans vs pants, T-shirt vs blouse, cardigan vs sweater, boots vs shoes, "
        "bra vs top, backpack vs bag are not interchangeable.\n"
        "- When the query clearly specifies a concrete product type, a different product type should usually be Irrelevant, not Partial.\n"
        "- If an attribute looks missing or uncertain, prefer Partial instead of Exact.\n"
        "- Do not guess missing attributes.\n"
        "- Graphic, slogan, holiday, memorial, or message tees are not Exact for a plain tee query unless that graphic/theme is requested.\n"
        "- Be conservative with Exact.\n\n"
    )
    # The per-call data: the query followed by the product listing.
    payload = f"Query: {query}\n\n" "Products:\n" f"{product_block}\n\n"
    output_spec = (
        "## Output Format\n"
        f"Strictly output {product_count} lines, each line containing exactly one of Exact / Partial / Irrelevant. "
        "They must correspond sequentially to the products above. Do not output any other information.\n"
    )

    return "".join((role, criteria, guidance, payload, output_spec))
def extract_query_profile_prompt(query: str, parser_hints: Dict[str, Any]) -> str:
    """Build the prompt that requests a structured intent profile for a query.

    The prompt tells the model to treat the original query as the source of
    truth, shows the JSON schema to return, lists guidelines for filling it
    in, and ends with the query and the parser hints.

    Args:
        query: Original user query; interpolated into the prompt as-is.
        parser_hints: Parser hints, serialized with ``json.dumps`` using
            ``ensure_ascii=False`` so non-ASCII text is not escaped.

    Returns:
        The complete prompt text.
    """
    serialized_hints = json.dumps(parser_hints, ensure_ascii=False)

    preamble = (
        "You are building a structured intent profile for e-commerce relevance judging.\n"
        "Use the original user query as the source of truth. Parser hints may help, but if a hint conflicts with the original query, trust the original query.\n"
        "Be conservative: only mark an attribute as required if the user explicitly asked for it.\n\n"
    )
    schema = (
        "Return JSON with this schema:\n"
        "{\n"
        '  "normalized_query_en": string,\n'
        '  "primary_category": string,\n'
        '  "allowed_categories": [string],\n'
        '  "required_attributes": [\n'
        '    {"name": string, "required_terms": [string], "conflicting_terms": [string], "match_mode": "explicit"}\n'
        "  ],\n"
        '  "notes": [string]\n'
        "}\n\n"
    )
    guidelines = (
        "Guidelines:\n"
        "- Exact later will require explicit evidence for all required attributes.\n"
        "- allowed_categories should contain only near-synonyms of the same product type, not substitutes. For example dress can allow midi dress/cocktail dress, but not skirt, top, jumpsuit, or outfit unless the query explicitly asks for them.\n"
        "- If the query asks for dress/skirt/jeans/t-shirt, near but different product types are not Exact.\n"
        "- If the query includes color, fit, silhouette, or length, include them as required_attributes.\n"
        "- For fit words, include conflicting terms when obvious, e.g. fitted conflicts with oversized/loose; oversized conflicts with fitted/tight.\n"
        "- For color, include conflicting colors only when clear from the query.\n\n"
    )
    # The per-call data goes last, after all the fixed instructions.
    inputs = f"Original query: {query}\n" f"Parser hints JSON: {serialized_hints}\n"

    return preamble + schema + guidelines + inputs
def classify_batch_complex_prompt(
    query: str,
    query_profile: Dict[str, Any],
    numbered_doc_lines: Sequence[str],
) -> str:
    """Build the profile-aware batch relevance-judging prompt.

    The prompt lists the relevance rules, embeds the original query, the
    structured query profile (as JSON) and the product lines, and asks for a
    JSON-only answer in the ``{"labels": [...]}`` shape shown at the end.

    Args:
        query: Original user query; interpolated into the prompt as-is.
        query_profile: Structured query profile, serialized with
            ``json.dumps`` using ``ensure_ascii=False``.
        numbered_doc_lines: Pre-formatted product lines, joined with newlines.

    Returns:
        The complete prompt text.
    """
    product_block = "\n".join(numbered_doc_lines)
    serialized_profile = json.dumps(query_profile, ensure_ascii=False)

    segments = [
        # Role and task.
        "You are an e-commerce search relevance judge.\n",
        "Judge each product against the structured query profile below.\n\n",
        # Labelling rules.
        "Relevance rules:\n",
        "- Exact: product type matches the target intent, and every explicit required attribute is positively supported by the title/options/tags/category. If an attribute is missing or only guessed, it is NOT Exact.\n",
        "- Partial: main product type/use case matches, but some required attribute is missing, weaker, uncertain, or only approximately matched.\n",
        "- Irrelevant: product type/use case mismatched, or an explicit required attribute clearly conflicts.\n",
        "- Be conservative with Exact.\n",
        "- Graphic/holiday/message tees are not Exact for a plain color/style tee query unless that graphic/theme was requested.\n",
        "- Jumpsuit/romper/set is not Exact for dress/skirt/jeans queries.\n\n",
        # Per-call data.
        f"Original query: {query}\n",
        f"Structured query profile JSON: {serialized_profile}\n\n",
        "Products:\n",
        f"{product_block}\n\n",
        # Required response shape.
        "Return JSON only, with schema:\n",
        '{"labels":[{"index":1,"label":"Exact","reason":"short phrase"}]}\n',
    ]
    return "".join(segments)