Compare View

Commits (7)
  • `indexer/product_enrich.py`, rather than patching in yet another layer of checks.
    
    There are two root causes: the cache key is reused by content, but the
    cached value still carries the old product's `id/title_input`; and the
    internal analysis results have historically mixed `tags` and
    `enriched_tags`. So once a stale cache entry is hit,
    `build_index_content_fields()` discards the result because the `id`
    does not match, and the external output ends up completely empty.
    
    The fix:
    - Internal analysis results uniformly use `tags` as the LLM/cache-layer field.
    - Externally, the mapping to `enriched_tags` happens only in the
      `build_index_content_fields()` wrapper, and `enriched_attributes`
      likewise uniformly emits `name="enriched_tags"`.
    - Cache reads are normalized first: `enriched_tags` in old entries is
      accepted as the internal `tags`, and a cache hit is rebound to the
      current request product's `id/title_input`.
    - Cache writes use the normalized internal structure, and empty
      content is no longer written to the cache.
    tangwang
     
  • 1. Recall pool top-K: scripts/evaluation/eval_framework/constants.py, 500 → 200
    In rebuild, hits with rank <= recall_n still get rerank_score: 1.0, now at this K.
    2. LLM batch bounds
    Min batches: DEFAULT_REBUILD_MIN_LLM_BATCHES 20 → 10
    Max batches: still 40 (unchanged)
    3. Early-stop condition (_annotate_rebuild_batches)
    After min_batches have completed, for each batch:
    
    A batch with no Exact (exact_n == 0) that also meets either condition counts as a bad batch:
    irrelevant_ratio >= 0.94
    or (irrelevant + Low Relevant) / n >= 0.96 (the weak tier uses RELEVANCE_LOW)
    Early stop after 2 consecutive bad batches (previously 3 consecutive with irrelevant >
    0.92).
    
    Batch logs now include low_ratio and irrelevant_plus_low_ratio; the rebuild
    metadata now includes rebuild_irrel_low_combined_stop_ratio.
    
    4. CLI
    --search-recall-top-k help text now states default 200
    --rebuild-min-batches help text now states default 10
    --rebuild-irrelevant-stop-ratio / --rebuild-irrelevant-stop-streak
    help text matches the new logic
    New: --rebuild-irrel-low-combined-stop-ratio (default 0.96)
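    The revised early-stop rule can be sketched roughly as below. Helper names and counter fields are assumed for illustration, and this simplification scans the streak over all batches, whereas the real `_annotate_rebuild_batches` loop only evaluates batches after min_batches.

```python
def is_bad_batch(exact_n: int, irrelevant_n: int, low_n: int, n: int) -> bool:
    # A batch counts as "bad" only when it has no Exact Match at all and
    # is dominated by Irrelevant (>= 0.94) or Irrelevant+Low (>= 0.96).
    if n == 0 or exact_n > 0:
        return False
    return irrelevant_n / n >= 0.94 or (irrelevant_n + low_n) / n >= 0.96


def should_stop(batches: list[dict], min_batches: int = 10, streak_needed: int = 2) -> bool:
    # No early stop before min_batches; afterwards, stop once the count of
    # consecutive bad batches reaches streak_needed (2 by default).
    if len(batches) < min_batches:
        return False
    streak = 0
    for b in batches:
        if is_bad_batch(b["exact_n"], b["irrelevant_n"], b["low_n"], b["n"]):
            streak += 1
            if streak >= streak_needed:
                return True
        else:
            streak = 0  # any non-bad batch resets the streak
    return False
```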
    tangwang
     
docs/Usage-Guide.md
... ... @@ -202,7 +202,7 @@ python -m pytest -q tests/test_rerank_client.py tests/test_es_query_builder.py t
202 202 ./scripts/service_ctl.sh restart backend
203 203 sleep 3
204 204 ./scripts/service_ctl.sh status backend
205   -./scripts/evaluation/quick_start_eval.sh batch
  205 +./scripts/evaluation/start_eval.sh batch
206 206 ```
207 207  
208 208 Offline batch evaluation writes annotations and reports to `artifacts/search_evaluation/` (SQLite, JSON/Markdown under `batch_reports/`, etc.). Notes and commands are in [scripts/evaluation/README.md](../scripts/evaluation/README.md).
... ...
docs/issue-2026-03-31-评估框架-done-0331.md
... ... @@ -138,7 +138,7 @@ queries defaults to queries/queries.txt, filled into the left-hand list box; clicking any of them
138 138  
139 139  
140 140 @scripts/evaluation/README.md @scripts/evaluation/eval_framework/framework.py
141   -@quick_start_eval.sh (29-35)
  141 +@start_eval.sh (29-35)
142 142 Please follow this flow for the rework:
143 143 For a rebuild, for each query:
144 144 each search result should scan the full corpus,
... ...
docs/相关性检索优化说明.md
... ... @@ -240,7 +240,7 @@ python -m pytest -q tests/test_rerank_client.py tests/test_es_query_builder.py t
240 240 ./scripts/service_ctl.sh restart backend
241 241 sleep 3
242 242 ./scripts/service_ctl.sh status backend
243   -./scripts/evaluation/quick_start_eval.sh batch
  243 +./scripts/evaluation/start_eval.sh batch
244 244 ```
245 245  
246 246 Evaluation artifacts go to `artifacts/search_evaluation/` (e.g. `search_eval.sqlite3`, JSON/Markdown under `batch_reports/`). Flow and parameter notes are in [scripts/evaluation/README.md](../scripts/evaluation/README.md).
... ...
indexer/product_enrich.py
... ... @@ -147,15 +147,40 @@ if _missing_prompt_langs:
147 147 # Multi-value field separators: ASCII comma, full-width comma, ideographic comma, plus the legacy ; | / and whitespace
148 148 _MULTI_VALUE_FIELD_SPLIT_RE = re.compile(r"[,、,;|/\n\t]+")
149 149 _CORE_INDEX_LANGUAGES = ("zh", "en")
150   -_ENRICHED_ATTRIBUTE_DIMENSIONS = (
151   - "enriched_tags",
  150 +_ANALYSIS_ATTRIBUTE_FIELD_MAP = (
  151 + ("tags", "enriched_tags"),
  152 + ("target_audience", "target_audience"),
  153 + ("usage_scene", "usage_scene"),
  154 + ("season", "season"),
  155 + ("key_attributes", "key_attributes"),
  156 + ("material", "material"),
  157 + ("features", "features"),
  158 +)
  159 +_ANALYSIS_RESULT_FIELDS = (
  160 + "title",
  161 + "category_path",
  162 + "tags",
  163 + "target_audience",
  164 + "usage_scene",
  165 + "season",
  166 + "key_attributes",
  167 + "material",
  168 + "features",
  169 + "anchor_text",
  170 +)
  171 +_ANALYSIS_MEANINGFUL_FIELDS = (
  172 + "tags",
152 173 "target_audience",
153 174 "usage_scene",
154 175 "season",
155 176 "key_attributes",
156 177 "material",
157 178 "features",
  179 + "anchor_text",
158 180 )
  181 +_ANALYSIS_FIELD_ALIASES = {
  182 + "tags": ("tags", "enriched_tags"),
  183 +}
159 184  
160 185  
161 186 def split_multi_value_field(text: Optional[str]) -> List[str]:
... ... @@ -195,25 +220,104 @@ def _append_enriched_attribute(
195 220 target.append({"name": name, "value": {lang: value}})
196 221  
197 222  
  223 +def _get_product_id(product: Dict[str, Any]) -> str:
  224 + return str(product.get("id") or product.get("spu_id") or "").strip()
  225 +
  226 +
  227 +def _get_analysis_field_aliases(field_name: str) -> Tuple[str, ...]:
  228 + return _ANALYSIS_FIELD_ALIASES.get(field_name, (field_name,))
  229 +
  230 +
  231 +def _get_analysis_field_value(row: Dict[str, Any], field_name: str) -> Any:
  232 + for alias in _get_analysis_field_aliases(field_name):
  233 + if alias in row:
  234 + return row.get(alias)
  235 + return None
  236 +
  237 +
  238 +def _has_meaningful_value(value: Any) -> bool:
  239 + if value is None:
  240 + return False
  241 + if isinstance(value, str):
  242 + return bool(value.strip())
  243 + if isinstance(value, dict):
  244 + return any(_has_meaningful_value(v) for v in value.values())
  245 + if isinstance(value, list):
  246 + return any(_has_meaningful_value(v) for v in value)
  247 + return bool(value)
  248 +
  249 +
  250 +def _make_empty_analysis_result(
  251 + product: Dict[str, Any],
  252 + target_lang: str,
  253 + error: Optional[str] = None,
  254 +) -> Dict[str, Any]:
  255 + result = {
  256 + "id": _get_product_id(product),
  257 + "lang": target_lang,
  258 + "title_input": str(product.get("title") or "").strip(),
  259 + }
  260 + for field in _ANALYSIS_RESULT_FIELDS:
  261 + result[field] = ""
  262 + if error:
  263 + result["error"] = error
  264 + return result
  265 +
  266 +
  267 +def _normalize_analysis_result(
  268 + result: Dict[str, Any],
  269 + product: Dict[str, Any],
  270 + target_lang: str,
  271 +) -> Dict[str, Any]:
  272 + normalized = _make_empty_analysis_result(product, target_lang)
  273 + if not isinstance(result, dict):
  274 + return normalized
  275 +
  276 + normalized["lang"] = str(result.get("lang") or target_lang).strip() or target_lang
  277 + normalized["title"] = str(result.get("title") or "").strip()
  278 + normalized["category_path"] = str(result.get("category_path") or "").strip()
  279 + normalized["title_input"] = str(
  280 + product.get("title") or result.get("title_input") or ""
  281 + ).strip()
  282 +
  283 + for field in _ANALYSIS_RESULT_FIELDS:
  284 + if field in {"title", "category_path"}:
  285 + continue
  286 + normalized[field] = str(_get_analysis_field_value(result, field) or "").strip()
  287 +
  288 + if result.get("error"):
  289 + normalized["error"] = str(result.get("error"))
  290 + return normalized
  291 +
  292 +
  293 +def _has_meaningful_analysis_content(result: Dict[str, Any]) -> bool:
  294 + return any(_has_meaningful_value(result.get(field)) for field in _ANALYSIS_MEANINGFUL_FIELDS)
  295 +
  296 +
198 297 def _apply_index_content_row(result: Dict[str, Any], row: Dict[str, Any], lang: str) -> None:
199 298 if not row or row.get("error"):
200 299 return
201 300  
202   - anchor_text = str(row.get("anchor_text") or "").strip()
  301 + anchor_text = str(_get_analysis_field_value(row, "anchor_text") or "").strip()
203 302 if anchor_text:
204 303 _append_lang_phrase_map(result["qanchors"], lang=lang, raw_value=anchor_text)
205 304  
206   - for name in _ENRICHED_ATTRIBUTE_DIMENSIONS:
207   - raw = row.get(name)
  305 + for source_name, output_name in _ANALYSIS_ATTRIBUTE_FIELD_MAP:
  306 + raw = _get_analysis_field_value(row, source_name)
208 307 if not raw:
209 308 continue
210   - _append_enriched_attribute(result["enriched_attributes"], name=name, lang=lang, raw_value=raw)
211   - if name == "enriched_tags":
  309 + _append_enriched_attribute(
  310 + result["enriched_attributes"],
  311 + name=output_name,
  312 + lang=lang,
  313 + raw_value=raw,
  314 + )
  315 + if output_name == "enriched_tags":
212 316 _append_lang_phrase_map(result["enriched_tags"], lang=lang, raw_value=raw)
213 317  
214 318  
215 319 def _normalize_index_content_item(item: Dict[str, Any]) -> Dict[str, str]:
216   - item_id = str(item.get("id") or item.get("spu_id") or "").strip()
  320 + item_id = _get_product_id(item)
217 321 return {
218 322 "id": item_id,
219 323 "title": str(item.get("title") or "").strip(),
... ... @@ -369,7 +473,10 @@ def _get_cached_anchor_result(
369 473 raw = _anchor_redis.get(key)
370 474 if not raw:
371 475 return None
372   - return json.loads(raw)
  476 + result = _normalize_analysis_result(json.loads(raw), product=product, target_lang=target_lang)
  477 + if not _has_meaningful_analysis_content(result):
  478 + return None
  479 + return result
373 480 except Exception as e:
374 481 logger.warning(f"Failed to get anchor cache: {e}")
375 482 return None
... ... @@ -383,9 +490,12 @@ def _set_cached_anchor_result(
383 490 if not _anchor_redis:
384 491 return
385 492 try:
  493 + normalized = _normalize_analysis_result(result, product=product, target_lang=target_lang)
  494 + if not _has_meaningful_analysis_content(normalized):
  495 + return
386 496 key = _make_anchor_cache_key(product, target_lang)
387 497 ttl = ANCHOR_CACHE_EXPIRE_DAYS * 24 * 3600
388   - _anchor_redis.setex(key, ttl, json.dumps(result, ensure_ascii=False))
  498 + _anchor_redis.setex(key, ttl, json.dumps(normalized, ensure_ascii=False))
389 499 except Exception as e:
390 500 logger.warning(f"Failed to set anchor cache: {e}")
391 501  
... ... @@ -654,7 +764,7 @@ def parse_markdown_table(markdown_content: str) -> List[Dict[str, str]]:
654 764 "seq_no": parts[0],
655 765 "title": parts[1], # product title (in the target language)
656 766 "category_path": parts[2] if len(parts) > 2 else "", # category path
657   - "enriched_tags": parts[3] if len(parts) > 3 else "", # fine-grained tags
  767 + "tags": parts[3] if len(parts) > 3 else "", # fine-grained tags
658 768 "target_audience": parts[4] if len(parts) > 4 else "", # target audience
659 769 "usage_scene": parts[5] if len(parts) > 5 else "", # usage scene
660 770 "season": parts[6] if len(parts) > 6 else "", # applicable season
... ... @@ -705,7 +815,7 @@ def process_batch(
705 815 batch_data: List[Dict[str, str]],
706 816 batch_num: int,
707 817 target_lang: str = "zh",
708   -) -> List[Dict[str, str]]:
  818 +) -> List[Dict[str, Any]]:
709 819 """Process one batch of data."""
710 820 logger.info(f"\n{'#' * 80}")
711 821 logger.info(f"Processing Batch {batch_num} ({len(batch_data)} items)")
... ... @@ -725,22 +835,11 @@ def process_batch(
725 835 target_lang,
726 836 )
727 837 return [
728   - {
729   - "id": item["id"],
730   - "lang": target_lang,
731   - "title_input": item.get("title", ""),
732   - "title": "",
733   - "category_path": "",
734   - "enriched_tags": "",
735   - "target_audience": "",
736   - "usage_scene": "",
737   - "season": "",
738   - "key_attributes": "",
739   - "material": "",
740   - "features": "",
741   - "anchor_text": "",
742   - "error": f"prompt_creation_failed: unsupported target_lang={target_lang}",
743   - }
  838 + _make_empty_analysis_result(
  839 + item,
  840 + target_lang,
  841 + error=f"prompt_creation_failed: unsupported target_lang={target_lang}",
  842 + )
744 843 for item in batch_data
745 844 ]
746 845  
... ... @@ -764,24 +863,18 @@ def process_batch(
764 863 results_with_ids = []
765 864 for i, parsed_item in enumerate(parsed_results):
766 865 if i < len(batch_data):
767   - original_id = batch_data[i]["id"]
768   - result = {
769   - "id": original_id,
770   - "lang": target_lang,
771   - "title_input": batch_data[i]["title"], # original input title
772   - "title": parsed_item.get("title", ""), # model-generated title
773   - "category_path": parsed_item.get("category_path", ""), # category path
774   - "enriched_tags": parsed_item.get("enriched_tags", ""), # fine-grained tags
775   - "target_audience": parsed_item.get("target_audience", ""), # target audience
776   - "usage_scene": parsed_item.get("usage_scene", ""), # usage scene
777   - "season": parsed_item.get("season", ""), # applicable season
778   - "key_attributes": parsed_item.get("key_attributes", ""), # key attributes
779   - "material": parsed_item.get("material", ""), # material description
780   - "features": parsed_item.get("features", ""), # features
781   - "anchor_text": parsed_item.get("anchor_text", ""), # anchor text
782   - }
  866 + source_product = batch_data[i]
  867 + result = _normalize_analysis_result(
  868 + parsed_item,
  869 + product=source_product,
  870 + target_lang=target_lang,
  871 + )
783 872 results_with_ids.append(result)
784   - logger.info(f"Mapped: seq={parsed_item['seq_no']} -> original_id={original_id}")
  873 + logger.info(
  874 + "Mapped: seq=%s -> original_id=%s",
  875 + parsed_item.get("seq_no"),
  876 + source_product.get("id"),
  877 + )
785 878  
786 879 # Save the batch JSON log to a standalone file
787 880 batch_log = {
... ... @@ -808,22 +901,7 @@ def process_batch(
808 901 logger.error(f"Error processing batch {batch_num}: {str(e)}", exc_info=True)
809 902 # Return empty results, preserving the ID mapping
810 903 return [
811   - {
812   - "id": item["id"],
813   - "lang": target_lang,
814   - "title_input": item["title"],
815   - "title": "",
816   - "category_path": "",
817   - "enriched_tags": "",
818   - "target_audience": "",
819   - "usage_scene": "",
820   - "season": "",
821   - "key_attributes": "",
822   - "material": "",
823   - "features": "",
824   - "anchor_text": "",
825   - "error": str(e),
826   - }
  904 + _make_empty_analysis_result(item, target_lang, error=str(e))
827 905 for item in batch_data
828 906 ]
829 907  
... ...
scripts/evaluation/README.md
... ... @@ -23,7 +23,7 @@ This directory holds the offline annotation builder, the evaluation web UI/API,
23 23 | `fusion_experiments_round1.json` | Broader first-round experiments |
24 24 | `queries/queries.txt` | Canonical evaluation queries |
25 25 | `README_Requirement.md` | Product/requirements reference |
26   -| `quick_start_eval.sh` | Wrapper: `batch`, `batch-rebuild` (deep `build` + `--force-refresh-labels`), or `serve` |
  26 +| `start_eval.sh` | Wrapper: `batch`, `batch-rebuild` (deep `build` + `--force-refresh-labels`), or `serve` |
27 27 | `../start_eval_web.sh` | Same as `serve` with `activate.sh`; use `./scripts/service_ctl.sh start eval-web` (default port **6010**, override with `EVAL_WEB_PORT`). `./run.sh all` includes eval-web. |
28 28  
29 29 ## Quick start (repo root)
... ... @@ -32,13 +32,13 @@ Set tenant if needed (`export TENANT_ID=163`). You need a live search API, DashS
32 32  
33 33 ```bash
34 34 # Batch: live search for every query; only uncached (query, spu_id) pairs hit the LLM
35   -./scripts/evaluation/quick_start_eval.sh batch
  35 +./scripts/evaluation/start_eval.sh batch
36 36  
37   -# Deep rebuild: search recall top-500 (score 1) + full-corpus rerank outside pool + batched LLM (early stop; expensive)
38   -./scripts/evaluation/quick_start_eval.sh batch-rebuild
  37 +# Deep rebuild: per-query full corpus rerank (outside search recall pool) + LLM in batches along global sort order (early stop; expensive)
  38 +./scripts/evaluation/start_eval.sh batch-rebuild
39 39  
40 40 # UI: http://127.0.0.1:6010/
41   -./scripts/evaluation/quick_start_eval.sh serve
  41 +./scripts/evaluation/start_eval.sh serve
42 42 # or: ./scripts/service_ctl.sh start eval-web
43 43 ```
44 44  
... ... @@ -69,9 +69,36 @@ Explicit equivalents:
69 69 --port 6010
70 70 ```
71 71  
72   -Each `batch` run walks the full queries file. With `batch --force-refresh-labels`, every live top-`k` hit is re-judged by the LLM.
  72 +Each `batch` run walks the full queries file and writes a **batch report** under `batch_reports/`. With `batch --force-refresh-labels`, every live top-`k` hit is re-judged by the LLM (still only those hits—not the deep rebuild pipeline).
73 73  
74   -**Rebuild (`build --force-refresh-labels`):** For each query: take search top **500** as the recall pool (treated as rerank score **1**; those SKUs are not sent to the reranker). Rerank the rest of the tenant corpus; if more than **1000** non-pool docs have rerank score **> 0.5**, the query is **skipped** (logged as too easy / tail too relevant). Otherwise merge pool (search order) + non-pool (rerank score descending), then LLM-judge in batches of **50**, logging **exact_ratio** and **irrelevant_ratio** per batch. Stop after **3** consecutive batches with irrelevant_ratio **> 92%**, but only after at least **15** batches and at most **40** batches.
  74 +### `start_eval.sh batch-rebuild` (deep annotation rebuild)
  75 +
  76 +This runs `build_annotation_set.py build` with **`--force-refresh-labels`** and **`--force-refresh-rerank`** (see the explicit command block below). It does **not** run the `batch` subcommand: there is **no** aggregate batch report for this step; outputs are per-query JSON under `query_builds/` plus updates in `search_eval.sqlite3`.
  77 +
  78 +For **each** query in `queries.txt`, in order:
  79 +
  80 +1. **Search recall** — Call the live search API with `size = max(--search-depth, --search-recall-top-k)` (the wrapper uses `--search-depth 500`). The first **`--search-recall-top-k`** hits (default **200**, see `eval_framework.constants.DEFAULT_SEARCH_RECALL_TOP_K`) form the **recall pool**; they are treated as rerank score **1** and are **not** sent to the reranker.
  81 +2. **Full corpus** — Load the tenant’s product corpus from Elasticsearch (same tenant as `TENANT_ID` / `--tenant-id`, default **163**), via `corpus_docs()` (cached in SQLite after the first load).
  82 +3. **Rerank outside pool** — Every corpus document whose `spu_id` is **not** in the pool is scored by the reranker API, **80 documents per request**. With `--force-refresh-rerank`, all those scores are recomputed and written to the **`rerank_scores`** table in `search_eval.sqlite3`. Without that flag, existing `(tenant_id, query, spu_id)` scores are reused and only missing rows hit the API.
  83 +4. **Skip “too easy” queries** — If more than **1000** non-pool documents have rerank score **> 0.5**, that query is **skipped** (one log line: tail too relevant / easy to satisfy). No LLM calls for that query.
  84 +5. **Global sort** — Order to label: pool in **search rank order**, then all remaining corpus docs in **descending rerank score** (dedupe by `spu_id`, pool wins).
  85 +6. **LLM labeling** — Walk that list **from the head** in batches of **50** by default (`--rebuild-llm-batch-size`). Each batch log includes **exact_ratio**, **irrelevant_ratio**, **low_ratio**, and **irrelevant_plus_low_ratio**.
  86 +
  87 + **Early stop** (defaults in `eval_framework.constants`; overridable via CLI):
  88 +
  89 + - Run **at least** `--rebuild-min-batches` batches (**10** by default) before any early stop is allowed.
  90 + - After that, define a **bad batch** as one where the batch has **no** **Exact Match** label **and** either:
  91 + - **Irrelevant** proportion **≥ 0.94** (`--rebuild-irrelevant-stop-ratio`), or
  92 + - **(Irrelevant + Low Relevant)** proportion **≥ 0.96** (`--rebuild-irrel-low-combined-stop-ratio`).
  93 + (“Low Relevant” is the weak tier; **High Relevant** does not count toward this combined ratio.)
  94 + - Count **consecutive** bad batches. **Reset** the count to 0 on any batch that is not bad.
  95 + - **Stop** when the consecutive bad count reaches **`--rebuild-irrelevant-stop-streak`** (**2** by default), or when **`--rebuild-max-batches`** (**40**) is reached—whichever comes first (up to **2000** docs per query at default batch size).
  96 +
  97 + So labeling follows best-first order but **stops early** when the model sees two consecutive “dead” batches; the tail may never be judged.
  98 +
  99 +**Incremental pool (no full rebuild):** `build_annotation_set.py build` **without** `--force-refresh-labels` uses the older windowed pool (`--annotate-search-top-k`, `--annotate-rerank-top-k`) and fills missing labels in one pass—no rerank-skip rule and no LLM early-stop loop.
  100 +
  101 +**Tuning the rebuild path:** `--search-recall-top-k`, `--rerank-high-threshold`, `--rerank-high-skip-count`, `--rebuild-llm-batch-size`, `--rebuild-min-batches`, `--rebuild-max-batches`, `--rebuild-irrelevant-stop-ratio`, `--rebuild-irrel-low-combined-stop-ratio`, `--rebuild-irrelevant-stop-streak` on `build` (see `eval_framework/cli.py`). Rerank API chunk size is **80** docs per request in code (`full_corpus_rerank_outside_exclude`).
75 102  
76 103 ## Artifacts
77 104  
... ... @@ -95,7 +122,7 @@ Default root: `artifacts/search_evaluation/`
95 122  
96 123 **Standard:** Run `batch` without `--force-refresh-labels` to extend coverage, then use the UI or batch in cached mode. Single-query evaluation defaults to **no** auto-annotation: recall still hits the live API; scoring uses SQLite only, and unlabeled hits count as `Irrelevant`.
97 124  
98   -**Incremental pool (no full rebuild):** `build_annotation_set.py build` without `--force-refresh-labels` merges search and full-corpus rerank windows before labeling (CLI `--search-depth`, `--rerank-depth`, `--annotate-*-top-k`). **Full rebuild** uses the recall-pool + rerank-skip + batched early-stop flow above; tune thresholds via `--search-recall-top-k`, `--rerank-high-threshold`, `--rerank-high-skip-count`, `--rebuild-*` flags on `build`.
  125 +**Rebuild vs incremental `build`:** Deep rebuild is documented in the **`batch-rebuild`** subsection above. Incremental `build` (without `--force-refresh-labels`) uses `--annotate-search-top-k` / `--annotate-rerank-top-k` windows instead.
99 126  
100 127 **Fusion tuning:** `tune_fusion.py` writes experiment configs, restarts the backend, runs batch evaluation, and optionally applies the best variant (see `--experiments-file`, `--score-metric`, `--apply-best`).
101 128  
... ...
scripts/evaluation/eval_framework/__init__.py
... ... @@ -12,15 +12,15 @@ ensure_project_on_path()
12 12  
13 13 from .constants import ( # noqa: E402
14 14 DEFAULT_ARTIFACT_ROOT,
15   - DEFAULT_LABELER_MODE,
16 15 DEFAULT_QUERY_FILE,
17   - JUDGE_PROMPT_VERSION_COMPLEX,
18   - JUDGE_PROMPT_VERSION_SIMPLE,
19 16 PROJECT_ROOT,
20 17 RELEVANCE_EXACT,
  18 + RELEVANCE_HIGH,
21 19 RELEVANCE_IRRELEVANT,
22   - RELEVANCE_PARTIAL,
  20 + RELEVANCE_LOW,
  21 + RELEVANCE_NON_IRRELEVANT,
23 22 VALID_LABELS,
  23 + normalize_stored_label,
24 24 )
25 25 from .framework import SearchEvaluationFramework # noqa: E402
26 26 from .store import EvalStore, QueryBuildResult # noqa: E402
... ... @@ -36,22 +36,22 @@ from .utils import ( # noqa: E402
36 36  
37 37 __all__ = [
38 38 "DEFAULT_ARTIFACT_ROOT",
39   - "DEFAULT_LABELER_MODE",
40 39 "DEFAULT_QUERY_FILE",
41 40 "EvalStore",
42   - "JUDGE_PROMPT_VERSION_COMPLEX",
43   - "JUDGE_PROMPT_VERSION_SIMPLE",
44 41 "PROJECT_ROOT",
45 42 "QueryBuildResult",
46 43 "RELEVANCE_EXACT",
  44 + "RELEVANCE_HIGH",
47 45 "RELEVANCE_IRRELEVANT",
48   - "RELEVANCE_PARTIAL",
  46 + "RELEVANCE_LOW",
  47 + "RELEVANCE_NON_IRRELEVANT",
49 48 "SearchEvaluationFramework",
50 49 "VALID_LABELS",
51 50 "build_cli_parser",
52 51 "create_web_app",
53 52 "ensure_dir",
54 53 "main",
  54 + "normalize_stored_label",
55 55 "render_batch_report_markdown",
56 56 "sha1_text",
57 57 "utc_now_iso",
... ...
scripts/evaluation/eval_framework/cli.py
... ... @@ -5,10 +5,11 @@ from __future__ import annotations
5 5 import argparse
6 6 import json
7 7 from pathlib import Path
  8 +from typing import Any, Dict
8 9  
9 10 from .constants import (
10   - DEFAULT_LABELER_MODE,
11 11 DEFAULT_QUERY_FILE,
  12 + DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO,
12 13 DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO,
13 14 DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK,
14 15 DEFAULT_REBUILD_LLM_BATCH_SIZE,
... ... @@ -23,6 +24,38 @@ from .utils import ensure_dir, utc_now_iso, utc_timestamp
23 24 from .web_app import create_web_app
24 25  
25 26  
  27 +def add_judge_llm_args(p: argparse.ArgumentParser) -> None:
  28 + p.add_argument(
  29 + "--judge-model",
  30 + default=None,
  31 + metavar="MODEL",
  32 + help="Judge LLM model (default: eval_framework.constants.DEFAULT_JUDGE_MODEL).",
  33 + )
  34 + p.add_argument(
  35 + "--enable-thinking",
  36 + action=argparse.BooleanOptionalAction,
  37 + default=None,
  38 + help="enable_thinking for DashScope (default: DEFAULT_JUDGE_ENABLE_THINKING).",
  39 + )
  40 + p.add_argument(
  41 + "--dashscope-batch",
  42 + action=argparse.BooleanOptionalAction,
  43 + default=None,
  44 + help="DashScope Batch File API vs sync chat (default: DEFAULT_JUDGE_DASHSCOPE_BATCH).",
  45 + )
  46 +
  47 +
  48 +def framework_kwargs_from_args(args: argparse.Namespace) -> Dict[str, Any]:
  49 + kw: Dict[str, Any] = {}
  50 + if args.judge_model is not None:
  51 + kw["judge_model"] = args.judge_model
  52 + if args.enable_thinking is not None:
  53 + kw["enable_thinking"] = args.enable_thinking
  54 + if args.dashscope_batch is not None:
  55 + kw["use_dashscope_batch"] = args.dashscope_batch
  56 + return kw
  57 +
  58 +
26 59 def build_cli_parser() -> argparse.ArgumentParser:
27 60 parser = argparse.ArgumentParser(description="Search evaluation annotation builder and web UI")
28 61 sub = parser.add_subparsers(dest="command", required=True)
... ... @@ -38,7 +71,7 @@ def build_cli_parser() -> argparse.ArgumentParser:
38 71 "--search-recall-top-k",
39 72 type=int,
40 73 default=None,
41   - help="Rebuild mode only: top-K search hits enter recall pool with score 1 (default when --force-refresh-labels: 500).",
  74 + help="Rebuild mode only: top-K search hits enter recall pool with score 1 (default when --force-refresh-labels: 200).",
42 75 )
43 76 build.add_argument(
44 77 "--rerank-high-threshold",
... ... @@ -53,24 +86,30 @@ def build_cli_parser() -> argparse.ArgumentParser:
53 86 help="Rebuild only: skip query if more than this many non-pool docs have rerank score > threshold (default 1000).",
54 87 )
55 88 build.add_argument("--rebuild-llm-batch-size", type=int, default=None, help="Rebuild only: LLM batch size (default 50).")
56   - build.add_argument("--rebuild-min-batches", type=int, default=None, help="Rebuild only: min LLM batches before early stop (default 15).")
  89 + build.add_argument("--rebuild-min-batches", type=int, default=None, help="Rebuild only: min LLM batches before early stop (default 10).")
57 90 build.add_argument("--rebuild-max-batches", type=int, default=None, help="Rebuild only: max LLM batches (default 40).")
58 91 build.add_argument(
59 92 "--rebuild-irrelevant-stop-ratio",
60 93 type=float,
61 94 default=None,
62   - help="Rebuild only: irrelevant ratio above this counts toward early-stop streak (default 0.92).",
  95 + help="Rebuild only: irrelevant-only branch threshold (>=) for early-stop streak, requires no Exact (default 0.94).",
  96 + )
  97 + build.add_argument(
  98 + "--rebuild-irrel-low-combined-stop-ratio",
  99 + type=float,
  100 + default=None,
  101 + help="Rebuild only: (irrelevant+low)/n threshold (>=) for early-stop streak, requires no Exact (default 0.96).",
63 102 )
64 103 build.add_argument(
65 104 "--rebuild-irrelevant-stop-streak",
66 105 type=int,
67 106 default=None,
68   - help="Rebuild only: stop after this many consecutive batches above irrelevant ratio (default 3).",
  107 + help="Rebuild only: consecutive bad batches before early stop (default 2).",
69 108 )
70 109 build.add_argument("--language", default="en")
71 110 build.add_argument("--force-refresh-rerank", action="store_true")
72 111 build.add_argument("--force-refresh-labels", action="store_true")
73   - build.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"])
  112 + add_judge_llm_args(build)
74 113  
75 114 batch = sub.add_parser("batch", help="Run batch evaluation against live search")
76 115 batch.add_argument("--tenant-id", default="163")
... ... @@ -78,7 +117,7 @@ def build_cli_parser() -> argparse.ArgumentParser:
78 117 batch.add_argument("--top-k", type=int, default=100)
79 118 batch.add_argument("--language", default="en")
80 119 batch.add_argument("--force-refresh-labels", action="store_true")
81   - batch.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"])
  120 + add_judge_llm_args(batch)
82 121  
83 122 audit = sub.add_parser("audit", help="Audit annotation quality for queries")
84 123 audit.add_argument("--tenant-id", default="163")
... ... @@ -87,20 +126,20 @@ def build_cli_parser() -> argparse.ArgumentParser:
87 126 audit.add_argument("--language", default="en")
88 127 audit.add_argument("--limit-suspicious", type=int, default=5)
89 128 audit.add_argument("--force-refresh-labels", action="store_true")
90   - audit.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"])
  129 + add_judge_llm_args(audit)
91 130  
92 131 serve = sub.add_parser("serve", help="Serve evaluation web UI on port 6010")
93 132 serve.add_argument("--tenant-id", default="163")
94 133 serve.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE))
95 134 serve.add_argument("--host", default="0.0.0.0")
96 135 serve.add_argument("--port", type=int, default=6010)
97   - serve.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"])
  136 + add_judge_llm_args(serve)
98 137  
99 138 return parser
100 139  
101 140  
102 141 def run_build(args: argparse.Namespace) -> None:
103   - framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode)
  142 + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args))
104 143 queries = framework.queries_from_file(Path(args.queries_file))
105 144 summary = []
106 145 rebuild_kwargs = {}
... ... @@ -115,6 +154,9 @@ def run_build(args: argparse.Namespace) -> None:
115 154 "rebuild_irrelevant_stop_ratio": args.rebuild_irrelevant_stop_ratio
116 155 if args.rebuild_irrelevant_stop_ratio is not None
117 156 else DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO,
  157 + "rebuild_irrel_low_combined_stop_ratio": args.rebuild_irrel_low_combined_stop_ratio
  158 + if args.rebuild_irrel_low_combined_stop_ratio is not None
  159 + else DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO,
118 160 "rebuild_irrelevant_stop_streak": args.rebuild_irrelevant_stop_streak
119 161 if args.rebuild_irrelevant_stop_streak is not None
120 162 else DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK,
... ... @@ -152,7 +194,7 @@ def run_build(args: argparse.Namespace) -&gt; None:
152 194  
153 195  
154 196 def run_batch(args: argparse.Namespace) -> None:
155   - framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode)
  197 + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args))
156 198 queries = framework.queries_from_file(Path(args.queries_file))
157 199 payload = framework.batch_evaluate(
158 200 queries=queries,
... ... @@ -165,7 +207,7 @@ def run_batch(args: argparse.Namespace) -> None:
165 207  
166 208  
167 209 def run_audit(args: argparse.Namespace) -> None:
168   - framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode)
  210 + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args))
169 211 queries = framework.queries_from_file(Path(args.queries_file))
170 212 audit_items = []
171 213 for query in queries:
... ... @@ -215,7 +257,7 @@ def run_audit(args: argparse.Namespace) -> None:
215 257  
216 258  
217 259 def run_serve(args: argparse.Namespace) -> None:
218   - framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode)
  260 + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args))
219 261 app = create_web_app(framework, Path(args.queries_file))
220 262 import uvicorn
221 263  
... ...
scripts/evaluation/eval_framework/clients.py
... ... @@ -2,30 +2,49 @@
2 2  
3 3 from __future__ import annotations
4 4  
  5 +import io
  6 +import json
  7 +import time
  8 +import uuid
5 9 from typing import Any, Dict, List, Optional, Sequence, Tuple
6 10  
7 11 import requests
8 12  
9 13 from .constants import VALID_LABELS
10   -from .prompts import (
11   - classify_batch_complex_prompt,
12   - classify_batch_simple_prompt,
13   - extract_query_profile_prompt,
14   -)
  14 +from .prompts import classify_prompt
15 15 from .utils import build_label_doc_line, extract_json_blob, safe_json_dumps
16 16  
17 17  
  18 +def _canonicalize_judge_label(raw: str) -> str | None:
  19 + s = str(raw or "").strip().strip('"').strip("'")
  20 + if s in VALID_LABELS:
  21 + return s
  22 + low = s.lower()
  23 + for v in VALID_LABELS:
  24 + if v.lower() == low:
  25 + return v
  26 + return None
  27 +
  28 +
18 29 class SearchServiceClient:
19 30 def __init__(self, base_url: str, tenant_id: str):
20 31 self.base_url = base_url.rstrip("/")
21 32 self.tenant_id = str(tenant_id)
22 33 self.session = requests.Session()
23 34  
24   - def search(self, query: str, size: int, from_: int = 0, language: str = "en") -> Dict[str, Any]:
  35 + def search(self, query: str, size: int, from_: int = 0, language: str = "en", *, debug: bool = False) -> Dict[str, Any]:
  36 + payload: Dict[str, Any] = {
  37 + "query": query,
  38 + "size": size,
  39 + "from": from_,
  40 + "language": language,
  41 + }
  42 + if debug:
  43 + payload["debug"] = True
25 44 response = self.session.post(
26 45 f"{self.base_url}/search/",
27 46 headers={"Content-Type": "application/json", "X-Tenant-ID": self.tenant_id},
28   - json={"query": query, "size": size, "from": from_, "language": language},
  47 + json=payload,
29 48 timeout=120,
30 49 )
31 50 response.raise_for_status()
... ... @@ -52,26 +71,55 @@ class RerankServiceClient:
52 71  
53 72  
54 73 class DashScopeLabelClient:
55   - def __init__(self, model: str, base_url: str, api_key: str, batch_size: int = 40):
  74 + """DashScope OpenAI-compatible chat: synchronous or Batch File API (JSONL job).
  75 +
  76 + Batch flow: https://help.aliyun.com/zh/model-studio/batch-interfaces-compatible-with-openai/
  77 +
  78 + Some regional endpoints (e.g. ``dashscope-us`` compatible-mode) do not implement ``/batches``;
  79 + on HTTP 404 from batch calls we fall back to synchronous ``/chat/completions`` and stop using batch
  80 + for subsequent requests on this client.
  81 + """
  82 +
  83 + def __init__(
  84 + self,
  85 + model: str,
  86 + base_url: str,
  87 + api_key: str,
  88 + batch_size: int = 40,
  89 + *,
  90 + batch_completion_window: str = "24h",
  91 + batch_poll_interval_sec: float = 10.0,
  92 + enable_thinking: bool = True,
  93 + use_batch: bool = False,
  94 + ):
56 95 self.model = model
57 96 self.base_url = base_url.rstrip("/")
58 97 self.api_key = api_key
59 98 self.batch_size = int(batch_size)
  99 + self.batch_completion_window = str(batch_completion_window)
  100 + self.batch_poll_interval_sec = float(batch_poll_interval_sec)
  101 + self.enable_thinking = bool(enable_thinking)
  102 + self.use_batch = bool(use_batch)
60 103 self.session = requests.Session()
61 104  
62   - def _chat(self, prompt: str) -> Tuple[str, str]:
  105 + def _auth_headers(self) -> Dict[str, str]:
  106 + return {"Authorization": f"Bearer {self.api_key}"}
  107 +
  108 + def _completion_body(self, prompt: str) -> Dict[str, Any]:
  109 + body: Dict[str, Any] = {
  110 + "model": self.model,
  111 + "messages": [{"role": "user", "content": prompt}],
  112 + "temperature": 0,
  113 + "top_p": 0.1,
  114 + "enable_thinking": self.enable_thinking,
  115 + }
  116 + return body
  117 +
  118 + def _chat_sync(self, prompt: str) -> Tuple[str, str]:
63 119 response = self.session.post(
64 120 f"{self.base_url}/chat/completions",
65   - headers={
66   - "Authorization": f"Bearer {self.api_key}",
67   - "Content-Type": "application/json",
68   - },
69   - json={
70   - "model": self.model,
71   - "messages": [{"role": "user", "content": prompt}],
72   - "temperature": 0,
73   - "top_p": 0.1,
74   - },
  121 + headers={**self._auth_headers(), "Content-Type": "application/json"},
  122 + json=self._completion_body(prompt),
75 123 timeout=180,
76 124 )
77 125 response.raise_for_status()
... ... @@ -79,71 +127,146 @@ class DashScopeLabelClient:
79 127 content = str(((data.get("choices") or [{}])[0].get("message") or {}).get("content") or "").strip()
80 128 return content, safe_json_dumps(data)
81 129  
82   - def classify_batch_simple(
  130 + def _chat_batch(self, prompt: str) -> Tuple[str, str]:
  131 + """One chat completion via Batch File API (single-line JSONL job)."""
  132 + custom_id = uuid.uuid4().hex
  133 + body = self._completion_body(prompt)
  134 + line_obj = {
  135 + "custom_id": custom_id,
  136 + "method": "POST",
  137 + "url": "/v1/chat/completions",
  138 + "body": body,
  139 + }
  140 + jsonl = json.dumps(line_obj, ensure_ascii=False, separators=(",", ":")) + "\n"
  141 + auth = self._auth_headers()
  142 +
  143 + up = self.session.post(
  144 + f"{self.base_url}/files",
  145 + headers=auth,
  146 + files={
  147 + "file": (
  148 + "eval_batch_input.jsonl",
  149 + io.BytesIO(jsonl.encode("utf-8")),
  150 + "application/octet-stream",
  151 + )
  152 + },
  153 + data={"purpose": "batch"},
  154 + timeout=300,
  155 + )
  156 + up.raise_for_status()
  157 + file_id = (up.json() or {}).get("id")
  158 + if not file_id:
  159 + raise RuntimeError(f"DashScope file upload returned no id: {up.text!r}")
  160 +
  161 + cr = self.session.post(
  162 + f"{self.base_url}/batches",
  163 + headers={**auth, "Content-Type": "application/json"},
  164 + json={
  165 + "input_file_id": file_id,
  166 + "endpoint": "/v1/chat/completions",
  167 + "completion_window": self.batch_completion_window,
  168 + },
  169 + timeout=120,
  170 + )
  171 + cr.raise_for_status()
  172 + batch_payload = cr.json() or {}
  173 + batch_id = batch_payload.get("id")
  174 + if not batch_id:
  175 + raise RuntimeError(f"DashScope batches.create returned no id: {cr.text!r}")
  176 +
  177 + terminal = frozenset({"completed", "failed", "expired", "cancelled"})
  178 + batch: Dict[str, Any] = dict(batch_payload)
  179 + status = str(batch.get("status") or "")
  180 + while status not in terminal:
  181 + time.sleep(self.batch_poll_interval_sec)
  182 + br = self.session.get(f"{self.base_url}/batches/{batch_id}", headers=auth, timeout=120)
  183 + br.raise_for_status()
  184 + batch = br.json() or {}
  185 + status = str(batch.get("status") or "")
  186 +
  187 + if status != "completed":
  188 + raise RuntimeError(
  189 + f"DashScope batch {batch_id} ended with status={status!r} errors={batch.get('errors')!r}"
  190 + )
  191 +
  192 + out_id = batch.get("output_file_id")
  193 + err_id = batch.get("error_file_id")
  194 +
  195 + row = self._find_batch_line_for_custom_id(out_id, custom_id, auth)
  196 + if row is None:
  197 + err_row = self._find_batch_line_for_custom_id(err_id, custom_id, auth)
  198 + if err_row is not None:
  199 + raise RuntimeError(f"DashScope batch request failed: {err_row!r}")
  200 + raise RuntimeError(f"DashScope batch output missing custom_id={custom_id!r}")
  201 +
  202 + resp = row.get("response") or {}
  203 + sc = resp.get("status_code")
  204 + if sc is not None and int(sc) != 200:
  205 + raise RuntimeError(f"DashScope batch line error: {row!r}")
  206 +
  207 + data = resp.get("body") or {}
  208 + content = str(((data.get("choices") or [{}])[0].get("message") or {}).get("content") or "").strip()
  209 + return content, safe_json_dumps(row)
  210 +
  211 + def _chat(self, prompt: str) -> Tuple[str, str]:
  212 + if not self.use_batch:
  213 + return self._chat_sync(prompt)
  214 + try:
  215 + return self._chat_batch(prompt)
  216 + except requests.exceptions.HTTPError as e:
  217 + resp = getattr(e, "response", None)
  218 + if resp is not None and resp.status_code == 404:
  219 + self.use_batch = False
  220 + return self._chat_sync(prompt)
  221 + raise
  222 +
  223 + def _find_batch_line_for_custom_id(
  224 + self,
  225 + file_id: Optional[str],
  226 + custom_id: str,
  227 + auth: Dict[str, str],
  228 + ) -> Optional[Dict[str, Any]]:
  229 + if not file_id or str(file_id) in ("null", ""):
  230 + return None
  231 + r = self.session.get(f"{self.base_url}/files/{file_id}/content", headers=auth, timeout=300)
  232 + r.raise_for_status()
  233 + for raw in r.text.splitlines():
  234 + raw = raw.strip()
  235 + if not raw:
  236 + continue
  237 + try:
  238 + obj = json.loads(raw)
  239 + except json.JSONDecodeError:
  240 + continue
  241 + if str(obj.get("custom_id")) == custom_id:
  242 + return obj
  243 + return None
  244 +
  245 + def classify_batch(
83 246 self,
84 247 query: str,
85 248 docs: Sequence[Dict[str, Any]],
86 249 ) -> Tuple[List[str], str]:
87 250 numbered_docs = [build_label_doc_line(idx + 1, doc) for idx, doc in enumerate(docs)]
88   - prompt = classify_batch_simple_prompt(query, numbered_docs)
  251 + prompt = classify_prompt(query, numbered_docs)
89 252 content, raw_response = self._chat(prompt)
90   - labels = []
  253 + labels: List[str] = []
91 254 for line in str(content or "").splitlines():
92   - label = line.strip()
93   - if label in VALID_LABELS:
94   - labels.append(label)
  255 + canon = _canonicalize_judge_label(line)
  256 + if canon is not None:
  257 + labels.append(canon)
95 258 if len(labels) != len(docs):
96 259 payload = extract_json_blob(content)
97 260 if isinstance(payload, dict) and isinstance(payload.get("labels"), list):
98 261 labels = []
99 262 for item in payload["labels"][: len(docs)]:
100 263 if isinstance(item, dict):
101   - label = str(item.get("label") or "").strip()
  264 + raw_l = str(item.get("label") or "").strip()
102 265 else:
103   - label = str(item).strip()
104   - if label in VALID_LABELS:
105   - labels.append(label)
106   - if len(labels) != len(docs) or any(label not in VALID_LABELS for label in labels):
107   - raise ValueError(f"unexpected simple label output: {content!r}")
108   - return labels, raw_response
109   -
110   - def extract_query_profile(
111   - self,
112   - query: str,
113   - parser_hints: Dict[str, Any],
114   - ) -> Tuple[Dict[str, Any], str]:
115   - prompt = extract_query_profile_prompt(query, parser_hints)
116   - content, raw_response = self._chat(prompt)
117   - payload = extract_json_blob(content)
118   - if not isinstance(payload, dict):
119   - raise ValueError(f"unexpected query profile payload: {content!r}")
120   - payload.setdefault("normalized_query_en", query)
121   - payload.setdefault("primary_category", "")
122   - payload.setdefault("allowed_categories", [])
123   - payload.setdefault("required_attributes", [])
124   - payload.setdefault("notes", [])
125   - return payload, raw_response
126   -
127   - def classify_batch_complex(
128   - self,
129   - query: str,
130   - query_profile: Dict[str, Any],
131   - docs: Sequence[Dict[str, Any]],
132   - ) -> Tuple[List[str], str]:
133   - numbered_docs = [build_label_doc_line(idx + 1, doc) for idx, doc in enumerate(docs)]
134   - prompt = classify_batch_complex_prompt(query, query_profile, numbered_docs)
135   - content, raw_response = self._chat(prompt)
136   - payload = extract_json_blob(content)
137   - if not isinstance(payload, dict) or not isinstance(payload.get("labels"), list):
138   - raise ValueError(f"unexpected label payload: {content!r}")
139   - labels_payload = payload["labels"]
140   - labels: List[str] = []
141   - for item in labels_payload[: len(docs)]:
142   - if not isinstance(item, dict):
143   - continue
144   - label = str(item.get("label") or "").strip()
145   - if label in VALID_LABELS:
146   - labels.append(label)
  266 + raw_l = str(item).strip()
  267 + canon = _canonicalize_judge_label(raw_l)
  268 + if canon is not None:
  269 + labels.append(canon)
147 270 if len(labels) != len(docs) or any(label not in VALID_LABELS for label in labels):
148   - raise ValueError(f"unexpected label output: {content!r}")
  271 + raise ValueError(f"unexpected classify output: {content!r}")
149 272 return labels, raw_response
... ...
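The unified `classify_batch` above parses judge output in two stages: first one label per line (canonicalized case-insensitively), then a JSON `{"labels": [...]}` fallback. A standalone sketch of that flow (simplified: `json.loads` stands in for the real `extract_json_blob` helper, and `VALID_LABELS` is copied inline from `eval_framework.constants`):

```python
# Simplified re-implementation of the two-stage label parsing in
# DashScopeLabelClient.classify_batch, for illustration only.
import json
from typing import List, Optional

VALID_LABELS = frozenset({"Exact Match", "High Relevant", "Low Relevant", "Irrelevant"})


def canonicalize(raw: str) -> Optional[str]:
    # Mirror _canonicalize_judge_label: strip quotes, then match case-insensitively.
    s = str(raw or "").strip().strip('"').strip("'")
    if s in VALID_LABELS:
        return s
    low = s.lower()
    for v in VALID_LABELS:
        if v.lower() == low:
            return v
    return None


def parse_labels(content: str, n_docs: int) -> List[str]:
    # Stage 1: one label per output line.
    labels = [c for c in (canonicalize(line) for line in content.splitlines()) if c]
    if len(labels) == n_docs:
        return labels
    # Stage 2: JSON fallback, e.g. {"labels": [{"label": "Exact Match"}, ...]}.
    try:
        payload = json.loads(content)
    except json.JSONDecodeError:
        payload = None
    if isinstance(payload, dict) and isinstance(payload.get("labels"), list):
        labels = []
        for item in payload["labels"][:n_docs]:
            raw = item.get("label") if isinstance(item, dict) else item
            c = canonicalize(str(raw or ""))
            if c:
                labels.append(c)
    if len(labels) != n_docs:
        raise ValueError(f"unexpected classify output: {content!r}")
    return labels
```

The case-insensitive canonicalization is what lets lowercase judge output (e.g. `exact match`) still count, instead of failing the old strict `label in VALID_LABELS` check.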
scripts/evaluation/eval_framework/constants.py
... ... @@ -6,24 +6,60 @@ _PKG_DIR = Path(__file__).resolve().parent
6 6 _SCRIPTS_EVAL_DIR = _PKG_DIR.parent
7 7 PROJECT_ROOT = _SCRIPTS_EVAL_DIR.parents[1]
8 8  
9   -RELEVANCE_EXACT = "Exact"
10   -RELEVANCE_PARTIAL = "Partial"
  9 +# Canonical English labels (must match LLM prompt output in prompts._CLASSIFY_TEMPLATE_EN)
  10 +RELEVANCE_EXACT = "Exact Match"
  11 +RELEVANCE_HIGH = "High Relevant"
  12 +RELEVANCE_LOW = "Low Relevant"
11 13 RELEVANCE_IRRELEVANT = "Irrelevant"
12   -VALID_LABELS = {RELEVANCE_EXACT, RELEVANCE_PARTIAL, RELEVANCE_IRRELEVANT}
  14 +
  15 +VALID_LABELS = frozenset({RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_LOW, RELEVANCE_IRRELEVANT})
  16 +
  17 +# Precision / MAP "positive" set (all non-irrelevant tiers)
  18 +RELEVANCE_NON_IRRELEVANT = frozenset({RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_LOW})
  19 +
  20 +_LEGACY_LABEL_MAP = {
  21 + "Exact": RELEVANCE_EXACT,
  22 + "Partial": RELEVANCE_HIGH,
  23 +}
  24 +
  25 +
  26 +def normalize_stored_label(label: str) -> str:
  27 + """Map legacy 3-way SQLite labels to current 4-way strings; pass through canonical labels."""
  28 + s = str(label).strip()
  29 + if s in VALID_LABELS:
  30 + return s
  31 + return _LEGACY_LABEL_MAP.get(s, s)
  32 +
13 33  
14 34 DEFAULT_ARTIFACT_ROOT = PROJECT_ROOT / "artifacts" / "search_evaluation"
15 35 DEFAULT_QUERY_FILE = _SCRIPTS_EVAL_DIR / "queries" / "queries.txt"
16 36  
17   -JUDGE_PROMPT_VERSION_SIMPLE = "v3_simple_20260331"
18   -JUDGE_PROMPT_VERSION_COMPLEX = "v2_structured_20260331"
19   -DEFAULT_LABELER_MODE = "simple"
  37 +# Judge LLM (eval_framework only; override via CLI --judge-model / constructor kwargs)
  38 +DEFAULT_JUDGE_MODEL = "qwen3.5-flash"
  39 +DEFAULT_JUDGE_ENABLE_THINKING = True
  40 +DEFAULT_JUDGE_DASHSCOPE_BATCH = False
  41 +DEFAULT_JUDGE_BATCH_COMPLETION_WINDOW = "24h"
  42 +DEFAULT_JUDGE_BATCH_POLL_INTERVAL_SEC = 10.0
20 43  
21   -# Rebuild annotation pool (build --force-refresh-labels): search recall + full-corpus rerank + LLM batches
22   -DEFAULT_SEARCH_RECALL_TOP_K = 500
  44 +# --- Rebuild annotation pool (``build --force-refresh-labels``) ---
  45 +# Flow: search recall pool (rerank_score=1, no rerank API) + rerank rest of corpus +
  46 +# LLM labels in fixed-size batches along global order (see ``framework._annotate_rebuild_batches``).
  47 +DEFAULT_SEARCH_RECALL_TOP_K = 200
23 48 DEFAULT_RERANK_HIGH_THRESHOLD = 0.5
24 49 DEFAULT_RERANK_HIGH_SKIP_COUNT = 1000
25 50 DEFAULT_REBUILD_LLM_BATCH_SIZE = 50
26   -DEFAULT_REBUILD_MIN_LLM_BATCHES = 15
  51 +# At least this many LLM batches run before early-stop is considered.
  52 +DEFAULT_REBUILD_MIN_LLM_BATCHES = 10
  53 +# Hard cap on LLM batches per query (each batch labels up to ``DEFAULT_REBUILD_LLM_BATCH_SIZE`` docs).
27 54 DEFAULT_REBUILD_MAX_LLM_BATCHES = 40
28   -DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO = 0.92
29   -DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK = 3
  55 +
  56 +# LLM early-stop (only after ``DEFAULT_REBUILD_MIN_LLM_BATCHES`` completed):
  57 +# A batch is "bad" when it has **no** ``Exact Match`` label AND either:
  58 +# - irrelevant_ratio >= DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO, or
  59 +# - (Irrelevant + Low Relevant) / n >= DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO.
  60 +# ``irrelevant_ratio`` = Irrelevant count / n; weak relevance is ``RELEVANCE_LOW`` ("Low Relevant").
  61 +# If a batch is bad, increment a streak; otherwise reset streak to 0. Stop when streak reaches
  62 +# ``DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK`` (consecutive bad batches).
  63 +DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO = 0.94
  64 +DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO = 0.96
  65 +DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK = 2
... ...
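The early-stop rule described in the constants comments can be sketched as a small standalone function (threshold values copied from the defaults above so the snippet is self-contained; the real logic lives in `framework._annotate_rebuild_batches`):

```python
# Sketch of the rebuild early-stop rule, using the new default thresholds.
from typing import List

MIN_BATCHES = 10        # DEFAULT_REBUILD_MIN_LLM_BATCHES
IRREL_STOP = 0.94       # DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO
IRREL_LOW_STOP = 0.96   # DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO
STOP_STREAK = 2         # DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK


def is_bad_batch(labels: List[str]) -> bool:
    """Bad iff the batch has no 'Exact Match' AND one ratio branch fires."""
    n = len(labels)
    if n == 0 or any(l == "Exact Match" for l in labels):
        return False
    irrel = sum(1 for l in labels if l == "Irrelevant")
    low = sum(1 for l in labels if l == "Low Relevant")
    return irrel / n >= IRREL_STOP or (irrel + low) / n >= IRREL_LOW_STOP


def should_stop(batch_labels: List[List[str]]) -> bool:
    """True once STOP_STREAK consecutive bad batches occur after warm-up."""
    streak = 0
    for idx, labels in enumerate(batch_labels):
        if idx + 1 < MIN_BATCHES:
            continue  # warm-up: streak is not evaluated before min_batches
        if is_bad_batch(labels):
            streak += 1
            if streak >= STOP_STREAK:
                return True
        else:
            streak = 0  # any non-bad batch resets the streak
    return False
```

Note the combined branch is strictly looser on mixed tails: a batch of 48 Low Relevant + 2 Irrelevant has `irrelevant_ratio = 0.04` (branch 1 misses) but a combined ratio of 1.0, so it still counts as bad as long as it contains no Exact Match.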
scripts/evaluation/eval_framework/framework.py
... ... @@ -10,13 +10,18 @@ from typing import Any, Dict, List, Sequence, Tuple
10 10 import requests
11 11 from elasticsearch.helpers import scan
12 12  
13   -from api.app import get_app_config, get_es_client, get_query_parser, init_service
  13 +from api.app import get_app_config, get_es_client, init_service
14 14 from indexer.mapping_generator import get_tenant_index_name
15 15  
16 16 from .clients import DashScopeLabelClient, RerankServiceClient, SearchServiceClient
17 17 from .constants import (
18 18 DEFAULT_ARTIFACT_ROOT,
19   - DEFAULT_LABELER_MODE,
  19 + DEFAULT_JUDGE_BATCH_COMPLETION_WINDOW,
  20 + DEFAULT_JUDGE_BATCH_POLL_INTERVAL_SEC,
  21 + DEFAULT_JUDGE_DASHSCOPE_BATCH,
  22 + DEFAULT_JUDGE_ENABLE_THINKING,
  23 + DEFAULT_JUDGE_MODEL,
  24 + DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO,
20 25 DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO,
21 26 DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK,
22 27 DEFAULT_REBUILD_LLM_BATCH_SIZE,
... ... @@ -25,10 +30,11 @@ from .constants import (
25 30 DEFAULT_RERANK_HIGH_SKIP_COUNT,
26 31 DEFAULT_RERANK_HIGH_THRESHOLD,
27 32 DEFAULT_SEARCH_RECALL_TOP_K,
28   - JUDGE_PROMPT_VERSION_COMPLEX,
29 33 RELEVANCE_EXACT,
  34 + RELEVANCE_HIGH,
30 35 RELEVANCE_IRRELEVANT,
31   - RELEVANCE_PARTIAL,
  36 + RELEVANCE_LOW,
  37 + RELEVANCE_NON_IRRELEVANT,
32 38 VALID_LABELS,
33 39 )
34 40 from .metrics import aggregate_metrics, compute_query_metrics, label_distribution
... ... @@ -40,26 +46,44 @@ from .utils import (
40 46 compact_option_values,
41 47 compact_product_payload,
42 48 ensure_dir,
43   - normalize_text,
44   - pick_text,
45 49 sha1_text,
46 50 utc_now_iso,
47 51 utc_timestamp,
  52 + zh_title_from_multilingual,
48 53 )
49 54  
50 55  
  56 +def _zh_titles_from_debug_per_result(debug_info: Any) -> Dict[str, str]:
  57 + """Map ``spu_id`` -> Chinese title from ``debug_info.per_result[].title_multilingual``."""
  58 + out: Dict[str, str] = {}
  59 + if not isinstance(debug_info, dict):
  60 + return out
  61 + for entry in debug_info.get("per_result") or []:
  62 + if not isinstance(entry, dict):
  63 + continue
  64 + spu_id = str(entry.get("spu_id") or "").strip()
  65 + if not spu_id:
  66 + continue
  67 + zh = zh_title_from_multilingual(entry.get("title_multilingual"))
  68 + if zh:
  69 + out[spu_id] = zh
  70 + return out
  71 +
  72 +
51 73 class SearchEvaluationFramework:
52 74 def __init__(
53 75 self,
54 76 tenant_id: str,
55 77 artifact_root: Path = DEFAULT_ARTIFACT_ROOT,
56 78 search_base_url: str = "http://localhost:6002",
57   - labeler_mode: str = DEFAULT_LABELER_MODE,
  79 + *,
  80 + judge_model: str | None = None,
  81 + enable_thinking: bool | None = None,
  82 + use_dashscope_batch: bool | None = None,
58 83 ):
59 84 init_service(get_app_config().infrastructure.elasticsearch.host)
60 85 self.tenant_id = str(tenant_id)
61 86 self.artifact_root = ensure_dir(artifact_root)
62   - self.labeler_mode = str(labeler_mode).strip().lower() or DEFAULT_LABELER_MODE
63 87 self.store = EvalStore(self.artifact_root / "search_eval.sqlite3")
64 88 self.search_client = SearchServiceClient(search_base_url, self.tenant_id)
65 89 app_cfg = get_app_config()
... ... @@ -71,183 +95,20 @@ class SearchEvaluationFramework:
71 95 api_key = app_cfg.infrastructure.secrets.dashscope_api_key
72 96 if not api_key:
73 97 raise RuntimeError("dashscope_api_key is required for search evaluation annotation")
  98 + model = str(judge_model or DEFAULT_JUDGE_MODEL)
  99 + et = DEFAULT_JUDGE_ENABLE_THINKING if enable_thinking is None else enable_thinking
  100 + use_batch = DEFAULT_JUDGE_DASHSCOPE_BATCH if use_dashscope_batch is None else use_dashscope_batch
  101 + batch_window = DEFAULT_JUDGE_BATCH_COMPLETION_WINDOW
  102 + batch_poll = float(DEFAULT_JUDGE_BATCH_POLL_INTERVAL_SEC)
74 103 self.label_client = DashScopeLabelClient(
75   - model=str(llm_cfg["model"]),
  104 + model=model,
76 105 base_url=str(llm_cfg["base_url"]),
77 106 api_key=str(api_key),
  107 + batch_completion_window=batch_window,
  108 + batch_poll_interval_sec=batch_poll,
  109 + enable_thinking=et,
  110 + use_batch=use_batch,
78 111 )
79   - self.query_parser = None
80   -
81   - def _get_query_parser(self):
82   - if self.query_parser is None:
83   - self.query_parser = get_query_parser()
84   - return self.query_parser
85   -
86   - def build_query_parser_hints(self, query: str) -> Dict[str, Any]:
87   - parsed = self._get_query_parser().parse(query, generate_vector=False, target_languages=["en", "zh"])
88   - payload = parsed.to_dict()
89   - payload["text_for_rerank"] = parsed.text_for_rerank()
90   - return payload
91   -
92   - def get_query_profile(self, query: str, force_refresh: bool = False) -> Dict[str, Any]:
93   - if self.labeler_mode != "complex":
94   - raise RuntimeError("query profiles are only used in complex labeler mode")
95   - if not force_refresh:
96   - cached = self.store.get_query_profile(self.tenant_id, query, JUDGE_PROMPT_VERSION_COMPLEX)
97   - if cached is not None:
98   - return cached
99   - parser_hints = self.build_query_parser_hints(query)
100   - profile, raw_response = self.label_client.extract_query_profile(query, parser_hints)
101   - profile["parser_hints"] = parser_hints
102   - self.store.upsert_query_profile(
103   - self.tenant_id,
104   - query,
105   - JUDGE_PROMPT_VERSION_COMPLEX,
106   - self.label_client.model,
107   - profile,
108   - raw_response,
109   - )
110   - return profile
111   -
112   - @staticmethod
113   - def _doc_evidence_text(doc: Dict[str, Any]) -> str:
114   - pieces: List[str] = [
115   - build_display_title(doc),
116   - pick_text(doc.get("vendor"), "en"),
117   - pick_text(doc.get("category_path"), "en"),
118   - pick_text(doc.get("category_name"), "en"),
119   - ]
120   - for sku in doc.get("skus") or []:
121   - pieces.extend(
122   - [
123   - str(sku.get("option1_value") or ""),
124   - str(sku.get("option2_value") or ""),
125   - str(sku.get("option3_value") or ""),
126   - ]
127   - )
128   - for tag in doc.get("tags") or []:
129   - pieces.append(str(tag))
130   - return normalize_text(" | ".join(piece for piece in pieces if piece))
131   -
132   - def _apply_rule_based_label_guardrails(
133   - self,
134   - label: str,
135   - query_profile: Dict[str, Any],
136   - doc: Dict[str, Any],
137   - ) -> str:
138   - if label not in VALID_LABELS:
139   - return label
140   - evidence = self._doc_evidence_text(doc)
141   - category = normalize_text(query_profile.get("primary_category"))
142   - allowed_categories = [normalize_text(item) for item in query_profile.get("allowed_categories") or [] if str(item).strip()]
143   -
144   - primary_category_match = True
145   - if category:
146   - primary_category_match = category in evidence
147   - allowed_category_match = True
148   - if allowed_categories:
149   - allowed_category_match = any(signal in evidence for signal in allowed_categories)
150   -
151   - if label == RELEVANCE_EXACT and not primary_category_match:
152   - if allowed_category_match:
153   - label = RELEVANCE_PARTIAL
154   - else:
155   - return RELEVANCE_IRRELEVANT
156   -
157   - for attr in query_profile.get("required_attributes") or []:
158   - if not isinstance(attr, dict):
159   - continue
160   - attr_name = normalize_text(attr.get("name"))
161   - if attr_name not in {"color", "fit", "length", "type", "product_type", "material", "size", "gender", "style", "waist_style", "rise"}:
162   - continue
163   - required_terms = [normalize_text(item) for item in attr.get("required_terms") or [] if normalize_text(item)]
164   - conflicting_terms = [normalize_text(item) for item in attr.get("conflicting_terms") or [] if normalize_text(item)]
165   - if attr_name == "fit":
166   - if any(term in {"oversized", "oversize"} for term in required_terms):
167   - conflicting_terms.extend(["slim", "slimming", "fitted", "tight", "close-fitting"])
168   - if any(term in {"fitted", "slim fit", "tight"} for term in required_terms):
169   - conflicting_terms.extend(["oversized", "oversize", "loose", "relaxed"])
170   - has_required = any(term in evidence for term in required_terms) if required_terms else True
171   - has_conflict = any(term in evidence for term in conflicting_terms)
172   -
173   - if has_conflict:
174   - return RELEVANCE_IRRELEVANT
175   - if label == RELEVANCE_EXACT and not has_required:
176   - label = RELEVANCE_PARTIAL
177   -
178   - if label == RELEVANCE_PARTIAL and not primary_category_match and not allowed_category_match:
179   - return RELEVANCE_IRRELEVANT
180   -
181   - return label
182   -
183   - @staticmethod
184   - def _result_item_to_doc(item: Dict[str, Any]) -> Dict[str, Any]:
185   - option_values = list(item.get("option_values") or [])
186   - while len(option_values) < 3:
187   - option_values.append("")
188   - product = dict(item.get("product") or {})
189   - return {
190   - "spu_id": item.get("spu_id"),
191   - "title": product.get("title") or item.get("title"),
192   - "vendor": product.get("vendor"),
193   - "category_path": product.get("category"),
194   - "category_name": product.get("category"),
195   - "image_url": item.get("image_url") or product.get("image_url"),
196   - "tags": product.get("tags") or [],
197   - "skus": [
198   - {
199   - "option1_value": option_values[0],
200   - "option2_value": option_values[1],
201   - "option3_value": option_values[2],
202   - }
203   - ],
204   - }
205   -
206   - def _collect_label_issues(
207   - self,
208   - label: str,
209   - query_profile: Dict[str, Any],
210   - doc: Dict[str, Any],
211   - ) -> List[str]:
212   - evidence = self._doc_evidence_text(doc)
213   - issues: List[str] = []
214   - category = normalize_text(query_profile.get("primary_category"))
215   - allowed_categories = [
216   - normalize_text(item)
217   - for item in query_profile.get("allowed_categories") or []
218   - if str(item).strip()
219   - ]
220   -
221   - primary_category_match = True if not category else category in evidence
222   - allowed_category_match = False if allowed_categories else primary_category_match
223   - if allowed_categories:
224   - allowed_category_match = any(signal in evidence for signal in allowed_categories)
225   -
226   - if label == RELEVANCE_EXACT and not primary_category_match:
227   - if allowed_category_match:
228   - issues.append("Exact missing primary category evidence")
229   - else:
230   - issues.append("Exact has category mismatch")
231   -
232   - if label == RELEVANCE_PARTIAL and not primary_category_match and not allowed_category_match:
233   - issues.append("Partial has category mismatch")
234   -
235   - for attr in query_profile.get("required_attributes") or []:
236   - if not isinstance(attr, dict):
237   - continue
238   - attr_name = normalize_text(attr.get("name"))
239   - if attr_name not in {"color", "fit", "length", "type", "product_type", "material", "size", "gender", "style"}:
240   - continue
241   - required_terms = [normalize_text(item) for item in attr.get("required_terms") or [] if normalize_text(item)]
242   - conflicting_terms = [normalize_text(item) for item in attr.get("conflicting_terms") or [] if normalize_text(item)]
243   - has_required = any(term in evidence for term in required_terms) if required_terms else True
244   - has_conflict = any(term in evidence for term in conflicting_terms)
245   -
246   - if has_conflict and label != RELEVANCE_IRRELEVANT:
247   - issues.append(f"{label} conflicts on {attr_name}")
248   - if label == RELEVANCE_EXACT and not has_required:
249   - issues.append(f"Exact missing {attr_name}")
250   - return issues
251 112  
252 113 def audit_live_query(
253 114 self,
... ... @@ -258,42 +119,6 @@ class SearchEvaluationFramework:
258 119 auto_annotate: bool = False,
259 120 ) -> Dict[str, Any]:
260 121 live = self.evaluate_live_query(query=query, top_k=top_k, auto_annotate=auto_annotate, language=language)
261   - if self.labeler_mode != "complex":
262   - labels = [
263   - item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
264   - for item in live["results"]
265   - ]
266   - return {
267   - "query": query,
268   - "tenant_id": self.tenant_id,
269   - "top_k": top_k,
270   - "metrics": live["metrics"],
271   - "distribution": label_distribution(labels),
272   - "query_profile": None,
273   - "suspicious": [],
274   - "results": live["results"],
275   - }
276   - query_profile = self.get_query_profile(query, force_refresh=False)
277   - suspicious: List[Dict[str, Any]] = []
278   -
279   - for item in live["results"]:
280   - doc = self._result_item_to_doc(item)
281   - issues = self._collect_label_issues(item["label"] or "", query_profile, doc)
282   - suggested_label = self._apply_rule_based_label_guardrails(item["label"] or "", query_profile, doc)
283   - if suggested_label != (item["label"] or ""):
284   - issues = list(issues) + [f"Suggested relabel: {item['label']} -> {suggested_label}"]
285   - if issues:
286   - suspicious.append(
287   - {
288   - "rank": item["rank"],
289   - "spu_id": item["spu_id"],
290   - "title": item["title"],
291   - "label": item["label"],
292   - "suggested_label": suggested_label,
293   - "issues": issues,
294   - }
295   - )
296   -
297 122 labels = [
298 123 item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
299 124 for item in live["results"]
... ... @@ -304,8 +129,8 @@ class SearchEvaluationFramework:
304 129 "top_k": top_k,
305 130 "metrics": live["metrics"],
306 131 "distribution": label_distribution(labels),
307   - "query_profile": query_profile,
308   - "suspicious": suspicious,
  132 + "query_profile": None,
  133 + "suspicious": [],
309 134 "results": live["results"],
310 135 }
311 136  
... ... @@ -485,15 +310,7 @@ class SearchEvaluationFramework:
485 310 if not docs:
486 311 return []
487 312 try:
488   - if self.labeler_mode == "complex":
489   - query_profile = self.get_query_profile(query, force_refresh=force_refresh)
490   - labels, raw_response = self.label_client.classify_batch_complex(query, query_profile, docs)
491   - labels = [
492   - self._apply_rule_based_label_guardrails(label, query_profile, doc)
493   - for doc, label in zip(docs, labels)
494   - ]
495   - else:
496   - labels, raw_response = self.label_client.classify_batch_simple(query, docs)
  313 + labels, raw_response = self.label_client.classify_batch(query, docs)
497 314 return [(labels, raw_response, docs)]
498 315 except Exception:
499 316 if len(docs) == 1:
... ... @@ -510,10 +327,28 @@ class SearchEvaluationFramework:
510 327 min_batches: int = DEFAULT_REBUILD_MIN_LLM_BATCHES,
511 328 max_batches: int = DEFAULT_REBUILD_MAX_LLM_BATCHES,
512 329 irrelevant_stop_ratio: float = DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO,
  330 + irrelevant_low_combined_stop_ratio: float = DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO,
513 331 stop_streak: int = DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK,
514 332 force_refresh: bool = True,
515 333 ) -> Tuple[Dict[str, str], List[Dict[str, Any]]]:
516   - """LLM-label ``ordered_docs`` in fixed-size batches with early stop after enough irrelevant-heavy batches."""
  334 + """LLM-label ``ordered_docs`` in fixed-size batches along list order.
  335 +
  336 + **Early stop** (only after ``min_batches`` full batches have completed):
  337 +
  338 + Per batch, let *n* = batch size, and count labels among docs in that batch only.
  339 +
  340 + - *bad batch* iff there is **no** ``Exact Match`` in the batch **and** at least one of:
  341 +
  342 + - ``irrelevant_ratio = #(Irrelevant)/n >= irrelevant_stop_ratio`` (default 0.94), or
  343 + - ``( #(Irrelevant) + #(Low Relevant) ) / n >= irrelevant_low_combined_stop_ratio``
  344 + (default 0.96; weak relevance = ``RELEVANCE_LOW``).
  345 +
  346 + Maintain a streak of consecutive *bad* batches; any non-bad batch resets the streak to 0.
  347 + Stop labeling when ``streak >= stop_streak`` (default 2) or when ``max_batches`` is reached
  348 + or the ordered list is exhausted.
  349 +
  350 + Constants for defaults: ``eval_framework.constants`` (``DEFAULT_REBUILD_*``).
  351 + """
517 352 batch_logs: List[Dict[str, Any]] = []
518 353 streak = 0
519 354 labels: Dict[str, str] = dict(self.store.get_labels(self.tenant_id, query))
... ... @@ -541,32 +376,46 @@ class SearchEvaluationFramework:
541 376 n = len(batch_docs)
542 377 exact_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_EXACT)
543 378 irrel_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_IRRELEVANT)
  379 + low_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_LOW)
544 380 exact_ratio = exact_n / n if n else 0.0
545 381 irrelevant_ratio = irrel_n / n if n else 0.0
  382 + low_ratio = low_n / n if n else 0.0
  383 + irrel_low_ratio = (irrel_n + low_n) / n if n else 0.0
546 384 log_entry = {
547 385 "batch_index": batch_idx + 1,
548 386 "size": n,
549 387 "exact_ratio": round(exact_ratio, 6),
550 388 "irrelevant_ratio": round(irrelevant_ratio, 6),
  389 + "low_ratio": round(low_ratio, 6),
  390 + "irrelevant_plus_low_ratio": round(irrel_low_ratio, 6),
551 391 "offset_start": start,
552 392 "offset_end": min(start + n, total_ordered),
553 393 }
554 394 batch_logs.append(log_entry)
555 395 print(
556 396 f"[eval-rebuild] query={query!r} llm_batch={batch_idx + 1}/{max_batches} "
557   - f"size={n} exact_ratio={exact_ratio:.4f} irrelevant_ratio={irrelevant_ratio:.4f}",
  397 + f"size={n} exact_ratio={exact_ratio:.4f} irrelevant_ratio={irrelevant_ratio:.4f} "
  398 + f"irrel_plus_low_ratio={irrel_low_ratio:.4f}",
558 399 flush=True,
559 400 )
560 401  
  402 + # Early-stop streak: only evaluated after min_batches (warm-up before trusting tail quality).
561 403 if batch_idx + 1 >= min_batches:
562   - if irrelevant_ratio > irrelevant_stop_ratio:
  404 + no_exact = exact_n == 0
  405 + # Branch 1: high Irrelevant share, no Exact in this batch.
  406 + heavy_irrel = irrelevant_ratio >= irrelevant_stop_ratio
  407 + # Branch 2: Irrelevant + Low Relevant combined share, still no Exact.
  408 + heavy_irrel_low = irrel_low_ratio >= irrelevant_low_combined_stop_ratio
  409 + bad_batch = no_exact and (heavy_irrel or heavy_irrel_low)
  410 + if bad_batch:
563 411 streak += 1
564 412 else:
565 413 streak = 0
566 414 if streak >= stop_streak:
567 415 print(
568 416 f"[eval-rebuild] query={query!r} early_stop after {batch_idx + 1} batches "
569   - f"({stop_streak} consecutive batches with irrelevant_ratio > {irrelevant_stop_ratio})",
  417 + f"({stop_streak} consecutive batches: no Exact and "
  418 + f"(irrelevant>={irrelevant_stop_ratio} or irrel+low>={irrelevant_low_combined_stop_ratio}))",
570 419 flush=True,
571 420 )
572 421 break
... ... @@ -591,8 +440,19 @@ class SearchEvaluationFramework:
591 440 rebuild_min_batches: int = DEFAULT_REBUILD_MIN_LLM_BATCHES,
592 441 rebuild_max_batches: int = DEFAULT_REBUILD_MAX_LLM_BATCHES,
593 442 rebuild_irrelevant_stop_ratio: float = DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO,
  443 + rebuild_irrel_low_combined_stop_ratio: float = DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO,
594 444 rebuild_irrelevant_stop_streak: int = DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK,
595 445 ) -> QueryBuildResult:
  446 + """Build per-query annotation pool and write ``query_builds/*.json``.
  447 +
  448 + Normal mode unions search + rerank windows and fills missing labels once.
  449 +
  450 + **Rebuild mode** (``force_refresh_labels=True``): full recall pool + corpus rerank outside
  451 + pool, optional skip for "easy" queries, then batched LLM labeling with **early stop**;
  452 + see ``_build_query_annotation_set_rebuild`` and ``_annotate_rebuild_batches`` (docstring
  453 + spells out the bad-batch / streak rule). Rebuild tuning knobs: ``rebuild_*`` and
  454 + ``search_recall_top_k`` parameters below; CLI mirrors them under ``build --force-refresh-labels``.
  455 + """
596 456 if force_refresh_labels:
597 457 return self._build_query_annotation_set_rebuild(
598 458 query=query,
... ... @@ -607,6 +467,7 @@ class SearchEvaluationFramework:
607 467 rebuild_min_batches=rebuild_min_batches,
608 468 rebuild_max_batches=rebuild_max_batches,
609 469 rebuild_irrelevant_stop_ratio=rebuild_irrelevant_stop_ratio,
  470 + rebuild_irrel_low_combined_stop_ratio=rebuild_irrel_low_combined_stop_ratio,
610 471 rebuild_irrelevant_stop_streak=rebuild_irrelevant_stop_streak,
611 472 )
612 473  
... ... @@ -691,8 +552,6 @@ class SearchEvaluationFramework:
691 552 "annotate_rerank_top_k": annotate_rerank_top_k,
692 553 "pool_size": len(pool_docs),
693 554 },
694   - "labeler_mode": self.labeler_mode,
695   - "query_profile": self.get_query_profile(query, force_refresh=force_refresh_labels) if self.labeler_mode == "complex" else None,
696 555 "metrics_top100": metrics,
697 556 "search_results": search_labeled_results,
698 557 "full_rerank_top": rerank_top_results,
... ... @@ -724,6 +583,7 @@ class SearchEvaluationFramework:
724 583 rebuild_min_batches: int,
725 584 rebuild_max_batches: int,
726 585 rebuild_irrelevant_stop_ratio: float,
  586 + rebuild_irrel_low_combined_stop_ratio: float,
727 587 rebuild_irrelevant_stop_streak: int,
728 588 ) -> QueryBuildResult:
729 589 search_size = max(int(search_depth), int(search_recall_top_k))
... ... @@ -756,6 +616,7 @@ class SearchEvaluationFramework:
756 616 "rebuild_min_batches": rebuild_min_batches,
757 617 "rebuild_max_batches": rebuild_max_batches,
758 618 "rebuild_irrelevant_stop_ratio": rebuild_irrelevant_stop_ratio,
  619 + "rebuild_irrel_low_combined_stop_ratio": rebuild_irrel_low_combined_stop_ratio,
759 620 "rebuild_irrelevant_stop_streak": rebuild_irrelevant_stop_streak,
760 621 }
761 622  
... ... @@ -797,6 +658,7 @@ class SearchEvaluationFramework:
797 658 min_batches=rebuild_min_batches,
798 659 max_batches=rebuild_max_batches,
799 660 irrelevant_stop_ratio=rebuild_irrelevant_stop_ratio,
  661 + irrelevant_low_combined_stop_ratio=rebuild_irrel_low_combined_stop_ratio,
800 662 stop_streak=rebuild_irrelevant_stop_streak,
801 663 force_refresh=True,
802 664 )
... ... @@ -867,8 +729,6 @@ class SearchEvaluationFramework:
867 729 "rebuild": rebuild_meta,
868 730 "ordered_union_size": pool_docs_count,
869 731 },
870   - "labeler_mode": self.labeler_mode,
871   - "query_profile": self.get_query_profile(query, force_refresh=False) if self.labeler_mode == "complex" else None,
872 732 "metrics_top100": metrics,
873 733 "search_results": search_labeled_results,
874 734 "full_rerank_top": rerank_top_results,
... ... @@ -893,7 +753,10 @@ class SearchEvaluationFramework:
893 753 language: str = "en",
894 754 force_refresh_labels: bool = False,
895 755 ) -> Dict[str, Any]:
896   - search_payload = self.search_client.search(query=query, size=max(top_k, 100), from_=0, language=language)
  756 + search_payload = self.search_client.search(
  757 + query=query, size=max(top_k, 100), from_=0, language=language, debug=True
  758 + )
  759 + zh_by_spu = _zh_titles_from_debug_per_result(search_payload.get("debug_info"))
897 760 results = list(search_payload.get("results") or [])
898 761 if auto_annotate:
899 762 self.annotate_missing_labels(query=query, docs=results[:top_k], force_refresh=force_refresh_labels)
... ... @@ -906,11 +769,16 @@ class SearchEvaluationFramework:
906 769 label = labels.get(spu_id)
907 770 if label not in VALID_LABELS:
908 771 unlabeled_hits += 1
  772 + primary_title = build_display_title(doc)
  773 + title_zh = zh_by_spu.get(spu_id) or ""
  774 + if not title_zh and isinstance(doc.get("title"), dict):
  775 + title_zh = zh_title_from_multilingual(doc.get("title"))
909 776 labeled.append(
910 777 {
911 778 "rank": rank,
912 779 "spu_id": spu_id,
913   - "title": build_display_title(doc),
  780 + "title": primary_title,
  781 + "title_zh": title_zh if title_zh and title_zh != primary_title else "",
914 782 "image_url": doc.get("image_url"),
915 783 "label": label,
916 784 "option_values": list(compact_option_values(doc.get("skus") or [])),
... ... @@ -926,7 +794,7 @@ class SearchEvaluationFramework:
926 794 relevant_missing_ids = [
927 795 spu_id
928 796 for spu_id, label in labels.items()
929   - if label in {RELEVANCE_EXACT, RELEVANCE_PARTIAL} and spu_id not in recalled_spu_ids
  797 + if label in RELEVANCE_NON_IRRELEVANT and spu_id not in recalled_spu_ids
930 798 ]
931 799 missing_docs_map = self.store.get_corpus_docs_by_spu_ids(self.tenant_id, relevant_missing_ids)
932 800 missing_relevant = []
... ... @@ -934,18 +802,26 @@ class SearchEvaluationFramework:
934 802 doc = missing_docs_map.get(spu_id)
935 803 if not doc:
936 804 continue
  805 + miss_title = build_display_title(doc)
  806 + miss_zh = zh_title_from_multilingual(doc.get("title")) if isinstance(doc.get("title"), dict) else ""
937 807 missing_relevant.append(
938 808 {
939 809 "spu_id": spu_id,
940 810 "label": labels[spu_id],
941 811 "rerank_score": rerank_scores.get(spu_id),
942   - "title": build_display_title(doc),
  812 + "title": miss_title,
  813 + "title_zh": miss_zh if miss_zh and miss_zh != miss_title else "",
943 814 "image_url": doc.get("image_url"),
944 815 "option_values": list(compact_option_values(doc.get("skus") or [])),
945 816 "product": compact_product_payload(doc),
946 817 }
947 818 )
948   - label_order = {RELEVANCE_EXACT: 0, RELEVANCE_PARTIAL: 1, RELEVANCE_IRRELEVANT: 2}
  819 + label_order = {
  820 + RELEVANCE_EXACT: 0,
  821 + RELEVANCE_HIGH: 1,
  822 + RELEVANCE_LOW: 2,
  823 + RELEVANCE_IRRELEVANT: 3,
  824 + }
949 825 missing_relevant.sort(
950 826 key=lambda item: (
951 827 label_order.get(str(item.get("label")), 9),
... ... @@ -963,7 +839,7 @@ class SearchEvaluationFramework:
963 839 if unlabeled_hits:
964 840 tips.append(f"{unlabeled_hits} recalled results were not in the annotation set and were counted as Irrelevant.")
965 841 if not missing_relevant:
966   - tips.append("No cached Exact/Partial products were missed by this recall set.")
  842 + tips.append("No cached non-irrelevant products were missed by this recall set.")
967 843 return {
968 844 "query": query,
969 845 "tenant_id": self.tenant_id,
... ... @@ -977,7 +853,8 @@ class SearchEvaluationFramework:
977 853 "recalled_hits": len(labeled),
978 854 "missing_relevant_count": len(missing_relevant),
979 855 "missing_exact_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_EXACT),
980   - "missing_partial_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_PARTIAL),
  856 + "missing_high_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_HIGH),
  857 + "missing_low_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_LOW),
981 858 },
982 859 "tips": tips,
983 860 "total": int(search_payload.get("total") or 0),
... ... @@ -1018,7 +895,8 @@ class SearchEvaluationFramework:
1018 895 aggregate = aggregate_metrics([item["metrics"] for item in per_query])
1019 896 aggregate_distribution = {
1020 897 RELEVANCE_EXACT: sum(item["distribution"][RELEVANCE_EXACT] for item in per_query),
1021   - RELEVANCE_PARTIAL: sum(item["distribution"][RELEVANCE_PARTIAL] for item in per_query),
  898 + RELEVANCE_HIGH: sum(item["distribution"][RELEVANCE_HIGH] for item in per_query),
  899 + RELEVANCE_LOW: sum(item["distribution"][RELEVANCE_LOW] for item in per_query),
1022 900 RELEVANCE_IRRELEVANT: sum(item["distribution"][RELEVANCE_IRRELEVANT] for item in per_query),
1023 901 }
1024 902 batch_id = f"batch_{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + '|'.join(queries))[:10]}"
... ...
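The bad-batch / streak rule added in `_annotate_rebuild_batches` above can be sketched as a standalone helper (hypothetical function and constant values mirroring the diff's `DEFAULT_REBUILD_*` defaults; per-batch counts are assumed inputs, not the framework's actual API):

```python
from typing import List, Dict

# Assumed default thresholds, mirroring the DEFAULT_REBUILD_* constants in the diff.
IRRELEVANT_STOP_RATIO = 0.94
IRREL_LOW_COMBINED_STOP_RATIO = 0.96
STOP_STREAK = 2
MIN_BATCHES = 10


def should_stop(batches: List[Dict[str, int]]) -> bool:
    """Return True once STOP_STREAK consecutive bad batches occur after warm-up.

    Each batch dict carries per-batch label counts: n, exact, irrel, low.
    A batch is "bad" iff it has no Exact Match AND either the Irrelevant
    share or the combined Irrelevant + Low Relevant share crosses its threshold.
    """
    streak = 0
    for idx, b in enumerate(batches):
        if idx + 1 < MIN_BATCHES:
            continue  # warm-up: never stop before MIN_BATCHES full batches
        n = b["n"] or 1
        heavy_irrel = b["irrel"] / n >= IRRELEVANT_STOP_RATIO
        heavy_combined = (b["irrel"] + b["low"]) / n >= IRREL_LOW_COMBINED_STOP_RATIO
        bad = b["exact"] == 0 and (heavy_irrel or heavy_combined)
        streak = streak + 1 if bad else 0  # any non-bad batch resets the streak
        if streak >= STOP_STREAK:
            return True
    return False
```

Note the reset-on-good-batch behavior: one recovering batch in the tail restarts the count, so the stop only fires on genuinely consecutive low-quality batches.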
scripts/evaluation/eval_framework/metrics.py
... ... @@ -4,7 +4,7 @@ from __future__ import annotations
4 4  
5 5 from typing import Dict, Sequence
6 6  
7   -from .constants import RELEVANCE_EXACT, RELEVANCE_IRRELEVANT, RELEVANCE_PARTIAL
  7 +from .constants import RELEVANCE_EXACT, RELEVANCE_IRRELEVANT, RELEVANCE_HIGH, RELEVANCE_LOW, RELEVANCE_NON_IRRELEVANT
8 8  
9 9  
10 10 def precision_at_k(labels: Sequence[str], k: int, relevant: Sequence[str]) -> float:
... ... @@ -13,15 +13,17 @@ def precision_at_k(labels: Sequence[str], k: int, relevant: Sequence[str]) -> fl
13 13 sliced = list(labels[:k])
14 14 if not sliced:
15 15 return 0.0
16   - hits = sum(1 for label in sliced if label in relevant)
  16 + rel = set(relevant)
  17 + hits = sum(1 for label in sliced if label in rel)
17 18 return hits / float(min(k, len(sliced)))
18 19  
19 20  
20 21 def average_precision(labels: Sequence[str], relevant: Sequence[str]) -> float:
  22 + rel = set(relevant)
21 23 hit_count = 0
22 24 precision_sum = 0.0
23 25 for idx, label in enumerate(labels, start=1):
24   - if label not in relevant:
  26 + if label not in rel:
25 27 continue
26 28 hit_count += 1
27 29 precision_sum += hit_count / idx
... ... @@ -31,12 +33,14 @@ def average_precision(labels: Sequence[str], relevant: Sequence[str]) -> float:
31 33  
32 34  
33 35 def compute_query_metrics(labels: Sequence[str]) -> Dict[str, float]:
  36 + """P@k / MAP_3: Exact Match only. P@k_2_3 / MAP_2_3: any non-irrelevant tier (legacy metric names)."""
34 37 metrics: Dict[str, float] = {}
  38 + non_irrel = list(RELEVANCE_NON_IRRELEVANT)
35 39 for k in (5, 10, 20, 50):
36 40 metrics[f"P@{k}"] = round(precision_at_k(labels, k, [RELEVANCE_EXACT]), 6)
37   - metrics[f"P@{k}_2_3"] = round(precision_at_k(labels, k, [RELEVANCE_EXACT, RELEVANCE_PARTIAL]), 6)
  41 + metrics[f"P@{k}_2_3"] = round(precision_at_k(labels, k, non_irrel), 6)
38 42 metrics["MAP_3"] = round(average_precision(labels, [RELEVANCE_EXACT]), 6)
39   - metrics["MAP_2_3"] = round(average_precision(labels, [RELEVANCE_EXACT, RELEVANCE_PARTIAL]), 6)
  43 + metrics["MAP_2_3"] = round(average_precision(labels, non_irrel), 6)
40 44 return metrics
41 45  
42 46  
... ... @@ -53,6 +57,7 @@ def aggregate_metrics(metric_items: Sequence[Dict[str, float]]) -> Dict[str, flo
53 57 def label_distribution(labels: Sequence[str]) -> Dict[str, int]:
54 58 return {
55 59 RELEVANCE_EXACT: sum(1 for label in labels if label == RELEVANCE_EXACT),
56   - RELEVANCE_PARTIAL: sum(1 for label in labels if label == RELEVANCE_PARTIAL),
  60 + RELEVANCE_HIGH: sum(1 for label in labels if label == RELEVANCE_HIGH),
  61 + RELEVANCE_LOW: sum(1 for label in labels if label == RELEVANCE_LOW),
57 62 RELEVANCE_IRRELEVANT: sum(1 for label in labels if label == RELEVANCE_IRRELEVANT),
58 63 }
... ...
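With the two-tier Exact/Partial scheme replaced by four tiers, the metric semantics in the `metrics.py` hunks can be checked with a small sketch (the label strings below are assumptions standing in for the `RELEVANCE_*` constants):

```python
from typing import Sequence

# Assumed label strings standing in for the RELEVANCE_* constants.
EXACT, HIGH, LOW, IRREL = "Exact Match", "High Relevant", "Low Relevant", "Irrelevant"
NON_IRRELEVANT = (EXACT, HIGH, LOW)


def precision_at_k(labels: Sequence[str], k: int, relevant: Sequence[str]) -> float:
    sliced = list(labels[:k])
    if not sliced:
        return 0.0
    rel = set(relevant)  # set membership, as in the diff's change
    return sum(1 for lab in sliced if lab in rel) / float(min(k, len(sliced)))


# P@k counts only Exact Match; the legacy-named P@k_2_3 counts any non-irrelevant tier.
labels = [EXACT, HIGH, IRREL, LOW, EXACT]
print(precision_at_k(labels, 5, [EXACT]))         # 2 of 5 → 0.4
print(precision_at_k(labels, 5, NON_IRRELEVANT))  # 4 of 5 → 0.8
```

The `set(relevant)` conversion is a micro-optimization the diff applies: membership tests go from O(len(relevant)) list scans to O(1) hash lookups without changing any result.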
scripts/evaluation/eval_framework/prompts.py
... ... @@ -2,84 +2,139 @@
2 2  
3 3 from __future__ import annotations
4 4  
5   -import json
6   -from typing import Any, Dict, Sequence
  5 +from typing import Sequence
7 6  
8   -_CLASSIFY_BATCH_SIMPLE_TEMPLATE = """You are a relevance evaluation assistant for an apparel e-commerce search system.
9   -Given the user query and each product's information, assign one relevance label to each product.
  7 +_CLASSIFY_TEMPLATE_EN = """You are a relevance judgment assistant for a fashion e-commerce search system.
  8 +Given a user query and the information for each product, assign a relevance label to each product.
  9 +
  10 +Your goal is to judge relevance from the perspective of e-commerce search ranking.
  11 +The key question is whether the user would view the product as the intended item, or as an acceptable substitute.
10 12  
11 13 ## Relevance Labels
12 14  
13   -### Exact
14   -The product fully satisfies the user’s search intent: the core product type matches, all explicitly stated key attributes are supported by the product information.
  15 +### Exact Match
  16 +The product satisfies the user’s core shopping intent: the core product type matches, and all explicitly stated key attributes in the query are supported by the product information, with no obvious conflict.
15 17  
16 18 Typical use cases:
17 19 - The query contains only a product type, and the product is exactly that type.
18   -- The query contains product type + attributes, and the product matches both the type and all explicitly stated attributes.
  20 +- The query contains “product type + attributes”, and the product matches both the type and all explicitly stated attributes.
19 21  
20   -### Partial
21   -The product satisfies the user's primary intent because the core product type matches, but some explicit requirements in the query are missing, cannot be confirmed, or deviate from the query. Despite the mismatch, the product can still be considered a non-target but acceptable substitute.
  22 +### High Relevant
  23 +The product satisfies the user’s main intent: the core product type matches, but some explicitly requested attributes are missing from the product information, cannot be confirmed, or show minor / non-critical deviations. The product is still a good substitute for the user’s core need.
22 24  
23   -Use Partial when:
24   -- The core product type matches, but some requested attributes cannot be confirmed.
25   -- The core product type matches, but some secondary requirements deviate or are inconsistent.
26   -- The product is not the ideal target, but it is still a plausible and acceptable substitute for the shopper.
  25 +Use “High Relevant” in the following cases:
  26 +- The core product type matches, but some requested attributes are missing, not mentioned, or cannot be verified.
  27 +- The core product type matches, but attributes such as color, material, style, fit, or length have minor deviations, as long as the deviation does not materially undermine the user’s main shopping intent.
  28 +- The product is not the user’s ideal target, but in an e-commerce shopping context, it would still be considered an acceptable and strong substitute.
27 29  
28   -Typical cases:
29   -- Query: "red fitted t-shirt", product: "Women's T-Shirt" → color/fit cannot be confirmed.
30   -- Query: "red fitted t-shirt", product: "Blue Fitted T-Shirt" → product type and fit match, but color differs.
  30 +Typical examples:
  31 +- Query: “red slim-fit T-shirt”
  32 + Product: “women’s T-shirt”
  33 + → Color and fit cannot be confirmed.
  34 +- Query: “red slim-fit T-shirt”
  35 + Product: “blue slim-fit T-shirt”
  36 + → Product type and fit match, but the color is different.
31 37  
32   -Detailed example:
33   -- Query: "cotton long sleeve shirt"
34   -- Product: "J.VER Men's Linen Shirts Casual Button Down Long Sleeve Shirt Solid Spread Collar Summer Beach Shirts with Pocket"
  38 +Detailed case:
  39 +- Query: “cotton long-sleeve shirt”
  40 +- Product: “J.VER Men's Linen Shirt Casual Button Down Long Sleeve Solid Plain Collar Summer Beach Shirt with Pocket”
35 41  
36 42 Analysis:
37   -- Material mismatch: the query explicitly requires cotton, while the product is linen, so it cannot be Exact.
38   -- However, the core product type still matches: both are long sleeve shirts.
39   -- In an e-commerce setting, the shopper may still consider clicking this item because the style and use case are similar.
40   -- Therefore, it should be labeled Partial as a non-target but acceptable substitute.
  43 +- Material mismatch: the query explicitly requires “cotton”, while the product is “linen”, so it cannot be labeled as “Exact Match”.
  44 +- However, the core category still matches: both are long-sleeve shirts.
  45 +- In e-commerce search, users may still click this item because the style and wearing scenario are similar.
  46 +- Therefore, it should be labeled as “High Relevant”: not the exact target, but a good substitute.
41 47  
42   -### Irrelevant
43   -The product does not satisfy the user's main shopping intent.
  48 +Detailed case:
  49 +- Query: “black mid-length skirt”
  50 +- Product: “New spring autumn loose slimming full long floral skirt pleated skirt”
44 51  
45   -Use Irrelevant when:
46   -- The core product type does not match the query.
47   -- The product belongs to a broadly related category, but not the specific product subtype requested, and shoppers would not consider them interchangeable.
48   -- The core product type matches, but the product clearly contradicts an explicit and important requirement in the query.
  52 +Analysis:
  53 +- Category match: the product is a skirt, so the category matches.
  54 +- Color mismatch: the product description does not indicate black and explicitly mentions “floral”, which is substantially different from plain black.
  55 +- Length deviation: the user asks for “mid-length”, while the product title emphasizes “long skirt”, which is somewhat longer.
  56 +- However, the core category “skirt” still matches, and style features such as “slimming” and “full skirt” may still fit some preferences of users searching for a mid-length skirt. Also, “long” versus “mid-length” is a deviation, but not a severe contradiction.
  57 +- Therefore, this should be labeled as “High Relevant”: the core type matches, but there are several non-fatal attribute deviations.
49 58  
50   -Typical cases:
51   -- Query: "pants", product: "shoes" → wrong product type.
52   -- Query: "dress", product: "skirt" → different product type.
53   -- Query: "fitted pants", product: "loose wide-leg pants" → explicit contradiction on fit.
54   -- Query: "sleeveless dress", product: "long sleeve dress" → explicit contradiction on sleeve style.
  59 +### Low Relevant
  60 +The product has a noticeable gap from the user’s core target, but still shares some similarity with the query in style, scenario, function, or broader category. A small portion of users may still view it as a barely acceptable substitute. It is not the intended item, but still has some relevance.
55 61  
56   -This label emphasizes clarity of user intent. When the query specifies a concrete product type or an important attribute, products that conflict with that intent should be judged Irrelevant even if they are related at a higher category level.
  62 +Use “Low Relevant” in the following cases:
  63 +- The core product type does not match, but the two types are still very close in style, wearing scenario, or function, so there is still some substitutability.
  64 +- The core product type matches, but the product differs from the user’s ideal target on multiple attributes; it still has some relevance, but is no longer a strong substitute.
  65 +- An important query requirement is clearly violated, but the product still retains a limited reason to be clicked.
57 66  
58   -## Decision Principles
  67 +Typical cases:
  68 +- Query: “black mid-length skirt”
  69 + Product: “New high-waisted V-neck mid-length dress elegant printed black sexy dress”
  70 + → The core product type differs (“skirt” vs “dress”), but both belong to closely related apparel types and share a similar mid-length style, so it is “Low Relevant”.
59 71  
60   -1. Product type is the highest-priority factor.
61   - If the query clearly specifies a concrete product type, the result must match that product type to be Exact or Partial.
62   - A different product type is usually Irrelevant, not Partial.
  72 +- Query: “jeans”
  73 + Product: “casual pants”
  74 + → The core product type is different, but both belong to the broader pants category, and the style / wearing scenario may still be close enough to be a weak substitute.
63 75  
64   -2. Similar or related product types are not interchangeable when the query is specific.
  76 +### Irrelevant
  77 +The product does not satisfy the user’s main shopping intent, and the likelihood of user engagement is very low.
  78 +
  79 +Typical situations:
  80 +- The core product type does not match the query and is not a close substitute in style, scenario, or function.
  81 +- The product belongs to a roughly related broader category, but not to an interchangeable subtype explicitly requested in the query, and the style or usage scenario differs significantly.
  82 +- The core product type matches, but the product clearly violates an explicit and important requirement in the query, with little or no acceptable substitutability.
  83 +
  84 +Typical examples:
  85 +- Query: “pants”
  86 + Product: “shoes”
  87 + → Wrong product type.
  88 +- Query: “slim-fit pants”
  89 + Product: “loose wide-leg pants”
  90 + → Clear contradiction in fit, with extremely low substitutability.
  91 +- Query: “sleeveless dress”
  92 + Product: “long-sleeve dress”
  93 + → Clear contradiction in sleeve type.
  94 +- Query: “jeans”
  95 + Product: “sweatpants”
  96 + → Different core category, with significantly different style and wearing scenario.
  97 +- Query: “boots”
  98 + Product: “sneakers”
  99 + → Different core category, different function, and different usage scenario.
  100 +
  101 +## Judgment Principles
  102 +
  103 +1. **Product type is the highest-priority factor.**
  104 + If the query explicitly specifies a concrete product type, the result must match that product type in order to be labeled as “Exact Match” or “High Relevant”.
  105 + Different product types should usually be labeled as “Low Relevant” or “Irrelevant”.
  106 +
  107 + - **Low Relevant**: use only when the two product types are very close in style, scenario, or function, and the user may still treat one as a barely acceptable substitute for the other.
  108 + - **Irrelevant**: all other product type mismatch cases.
  109 +
  110 +2. **Similar or related product types are usually not directly interchangeable when the query is explicit, but their closeness should determine whether the label is “Low Relevant” or “Irrelevant”.**
65 111 For example:
66   - - dress vs skirt vs jumpsuit
67   - - jeans vs pants
68   - - t-shirt vs blouse
69   - - cardigan vs sweater
70   - - boots vs shoes
71   - - bra vs top
72   - - backpack vs bag
73   - If the user explicitly searched for one of these, the others should usually be judged Irrelevant.
74   -
75   -3. If the core product type matches, then evaluate attributes.
76   - - If all explicit attributes match → Exact
77   - - If some attributes are missing, uncertain, or partially mismatched, but the item is still an acceptable substitute → Partial
78   - - If an explicit and important attribute is clearly contradicted, and the item is not a reasonable substitute → Irrelevant
79   -
80   -4. Distinguish carefully between "not mentioned" and "contradicted".
81   - - If an attribute is not mentioned or cannot be verified, prefer Partial.
82   - - If an attribute is explicitly opposite to the query, use Irrelevant unless the item is still reasonably acceptable as a substitute under the shopping context.
  112 + - **May be Low Relevant due to strong similarity in style / scenario**: dress vs skirt, long skirt vs mid-length skirt, jeans vs casual pants, sneakers vs skate shoes.
  113 + - **Should be Irrelevant due to substantial difference in style / scenario**: pants vs shoes, T-shirt vs hat, boots vs sneakers, jeans vs suit pants, backpack vs handbag.
  114 +
  115 +3. **Once the core product type matches, evaluate attributes.**
  116 + - All explicit attributes match → **Exact Match**
  117 + - Some attributes are missing, not mentioned, cannot be verified, or show only minor deviations → **High Relevant**
  118 + - There are multiple attribute deviations, or an important attribute is clearly violated, but the product still retains some substitutability → **Low Relevant**
  119 + - There is a clear and important hard conflict, and substitutability is extremely low → **Irrelevant**
  120 +
  121 +4. **Strictly distinguish among “not mentioned / cannot confirm”, “minor deviation”, and “explicit contradiction”.**
  122 + - If an attribute is not mentioned or cannot be verified, prefer **High Relevant**.
  123 + - If an attribute shows a minor deviation, such as different color, different material, or slightly different length, it should usually be labeled **High Relevant**.
  124 + - If an attribute is explicitly opposite to the query requirement, such as sleeveless vs long-sleeve or slim-fit vs loose wide-leg, decide between **Low Relevant** and **Irrelevant** based on the severity of the conflict and practical substitutability.
  125 + - If the conflict directly breaks the user’s main shopping goal, it should usually be labeled **Irrelevant**.
  126 +
  127 +5. **Substitutability should be judged from real shopping intent, not just surface-level textual similarity.**
  128 + The question is whether the user would realistically accept the product in a shopping scenario.
  129 + - Good substitute → **High Relevant**
  130 + - Barely acceptable substitute → **Low Relevant**
  131 + - Hardly substitutable at all → **Irrelevant**
  132 +
  133 +6. **When product information is insufficient, do not treat “cannot confirm” as “conflict”.**
  134 + If a product does not mention an attribute, that does not mean the attribute is definitely violated.
  135 + Therefore:
  136 + - If the attribute is not mentioned or cannot be confirmed, prefer **High Relevant**;
  137 + - Only treat it as a conflict when the product information clearly shows the opposite of the query requirement.
83 138  
84 139 Query: {query}
85 140  
... ... @@ -87,88 +142,139 @@ Products:
87 142 {lines}
88 143  
89 144 ## Output Format
90   -Strictly output {n} lines, each line containing exactly one of:
91   -Exact
92   -Partial
  145 +Output exactly {n} lines.
  146 +Each line must be exactly one of the following:
  147 +Exact Match
  148 +High Relevant
  149 +Low Relevant
93 150 Irrelevant
94 151  
95   -The lines must correspond sequentially to the products above.
96   -Do not output any other information.
  152 +The output lines must correspond to the products above in the same order.
  153 +Do not output anything else.
97 154 """
98 155  
99   -_CLASSIFY_BATCH_SIMPLE_TEMPLATE__zh = _CLASSIFY_BATCH_SIMPLE_TEMPLATE_ZH = """你是一个服饰电商搜索系统中的相关性判断助手。
  156 +_CLASSIFY_TEMPLATE_ZH = """你是一个服饰电商搜索系统中的相关性判断助手。
100 157 给定用户查询词以及每个商品的信息,请为每个商品分配一个相关性标签。
101 158  
  159 +你的目标是从电商搜索排序的角度,判断商品是否满足用户的购物意图。
  160 +判断时应优先考虑“用户是否会把该商品视为目标商品,或可接受的替代品”。
  161 +
102 162 ## 相关性标签
103 163  
104 164 ### 完全相关
105   -核心产品类型匹配,所有明确提及的关键属性均有产品信息支撑。
  165 +商品满足用户的核心购物意图:核心商品类型匹配,且查询中所有明确提及的关键属性均有商品信息支持。
106 166  
107 167 典型适用场景:
108   - 查询仅包含产品类型,产品即为该类型。
109   - 查询包含“产品类型 + 属性”,产品在类型及所有明确属性上均符合。
  168 +- 查询仅包含商品类型,商品即为该类型。
  169 +- 查询包含“商品类型 + 属性”,商品在类型及所有明确属性上均符合。
110 170  
111   -### 部分相关
112   -产品满足用户的主要意图(核心产品类型匹配),但查询中明确的部分要求未体现,或存在偏差。虽然有不一致,但仍属于“非目标但可接受”的替代品。
  171 +### 基本相关
  172 +商品满足用户的主要意图:核心商品类型匹配,但查询中明确提出的部分要求未在商品信息中体现、无法确认,或存在轻微偏差 / 非关键偏差。该商品仍是满足用户核心需求的良好替代品。
113 173  
114   -在以下情况使用部分相关:
115   - 核心产品类型匹配,但部分请求的属性在商品信息中缺失、未提及或无法确认。
116   - 核心产品类型匹配,但材质、版型、风格等次要要求存在偏差或不一致。
117   - 商品不是用户最理想的目标,但从电商购物角度看,仍可能被用户视为可接受的替代品。
  174 +在以下情况使用“基本相关”:
  175 +- 核心商品类型匹配,但部分属性缺失、未提及或无法确认。
  176 +- 核心商品类型匹配,但颜色、材质、风格、版型、长度等属性存在轻微偏差,只要这种偏差不会明显破坏用户的主要购买意图。
  177 +- 商品不是用户最理想的目标,但在电商购物场景下仍可能被视为可接受、且较优的替代品。
118 178  
119 179 典型情况:
120   - 查询:“红色修身T恤”,产品:“女士T恤” → 颜色/版型无法确认。
121   - 查询:“红色修身T恤”,产品:“蓝色修身T恤” → 产品类型和版型匹配,但颜色不同。
  180 +- 查询:“红色修身T恤”,商品:“女士T恤”
  181 + → 颜色、版型无法确认。
  182 +- 查询:“红色修身T恤”,商品:“蓝色修身T恤”
  183 + → 商品类型和版型匹配,但颜色不同。
122 184  
123 185 详细案例:
124 186 - 查询:“棉质长袖衬衫”
125 187 - 商品:“J.VER男式亚麻衬衫休闲纽扣长袖衬衫纯色平领夏季沙滩衬衫带口袋”
126 188  
127 189 分析:
128   -- 材质不符:Query 明确指定“棉质”,而商品为“亚麻”,因此不能判为完全相关。
  190 +- 材质不符:Query 明确指定“棉质”,而商品为“亚麻”,因此不能判为“完全相关”。
129 191 - 但核心品类仍然匹配:两者都是“长袖衬衫”。
130 192 - 在电商搜索中,用户仍可能因为款式、穿着场景相近而点击该商品。
131   -- 因此应判为部分相关,即“非目标但可接受”的替代品。
  193 +- 因此应判为“基本相关”,即“非精确目标,但属于良好替代品”。
132 194  
133   -### 不相关
134   -产品未满足用户的主要购物意图,主要表现为以下情形之一:
135   -- 核心产品类型与查询不匹配。
136   -- 产品虽属大致相关的大类,但与查询指定的具体子类不可互换。
137   -- 核心产品类型匹配,但产品明显违背了查询中一个明确且重要的属性要求。
  195 +详细案例:
  196 +- 查询:“黑色中长半身裙”
  197 +- 商品:“春秋季新款宽松显瘦大摆长裙碎花半身裙褶皱设计裙”
  198 +
  199 +分析:
  200 +- 品类匹配:商品是“半身裙”,品类符合。
  201 +- 颜色不匹配:商品描述未提及黑色,且明确包含“碎花”,与纯黑差异较大。
  202 +- 长度存在偏差:用户要求“中长”,而商品标题强调“长裙”,长度偏长。
  203 +- 但核心品类“半身裙”匹配,“显瘦”“大摆”等风格特征仍可能符合部分搜索“中长半身裙”用户的潜在偏好;同时“长裙”和“中长”虽有偏差,但不构成严重对立。
  204 +- 因此应判为“基本相关”:核心品类匹配,但存在若干非致命属性偏差。
  205 +
  206 +### 弱相关
  207 +商品与用户的核心目标存在明显差距,但仍与查询在风格、场景、功能或大类上具有一定相似性,可能被少量用户视为勉强可接受的替代品。属于“非目标,但仍有一定关联”。
  208 +
  209 +在以下情况使用“弱相关”:
  210 +- 核心商品类型不一致,但两者在风格、穿着场景或功能上非常接近,仍具有一定替代性。
  211 +- 核心商品类型匹配,但在多个属性上与用户理想目标差距较大,虽仍有一定关联性,但已不是高质量替代品。
  212 +- 查询要求中的某个重要属性被明显违背,但商品仍保留少量被点击的理由。
138 213  
139 214 典型情况:
140   -- 查询:“裤子”,产品:“鞋子” → 产品类型错误。
141   -- 查询:“连衣裙”,产品:“半身裙” → 具体产品类型不同。
142   -- 查询:“修身裤”,产品:“宽松阔腿裤” → 与版型要求明显冲突。
143   -- 查询:“无袖连衣裙”,产品:“长袖连衣裙” → 与袖型要求明显冲突。
  215 +- 查询:“黑色中长半身裙”,商品:“新款高腰V领中长款连衣裙 优雅小花黑色性感连衣裙”
  216 + → 核心商品类型“半身裙”与“连衣裙”不同,但两者同属裙装,且款式上均为“中长款”,在穿搭场景上接近,因此属于“弱相关”。
  217 +
  218 +- 查询:“牛仔裤”,商品:“休闲裤”
  219 + → 核心商品类型不同,但同属裤装大类,风格和穿着场景可能接近,可作为较弱替代品。
  220 +
  221 +### 不相关
  222 +商品未满足用户的主要购物意图,用户点击动机极低。
  223 +
  224 +主要表现为以下情形之一:
  225 +- 核心商品类型与查询不匹配,且不属于风格 / 场景 / 功能接近的可替代品。
  226 +- 商品虽属于大致相关的大类,但与查询明确指定的具体子类不可互换,且风格或场景差异大。
  227 +- 核心商品类型匹配,但商品明显违背了查询中一个明确且重要的要求,且几乎不具备可接受的替代性。
144 228  
145   -该标签强调用户意图的明确性。当查询指向具体类型或关键属性时,即使产品在更高层级类别上相关,也应按不相关处理。
  229 +典型情况:
  230 +- 查询:“裤子”,商品:“鞋子”
  231 + → 商品类型错误。
  232 +- 查询:“修身裤”,商品:“宽松阔腿裤”
  233 + → 与版型要求明显冲突,替代性极低。
  234 +- 查询:“无袖连衣裙”,商品:“长袖连衣裙”
  235 + → 与袖型要求明显冲突。
  236 +- 查询:“牛仔裤”,商品:“运动裤”
  237 + → 核心品类不同(牛仔裤 vs 运动裤),风格和场景差异大。
  238 +- 查询:“靴子”,商品:“运动鞋”
  239 + → 核心品类不同,功能和适用场景差异大。
146 240  
147 241 ## 判断原则
148 242  
149   -1. 产品类型是最高优先级因素。
150   - 如果查询明确指定了具体产品类型,那么结果必须匹配该产品类型,才可能判为“完全相关”或“部分相关”。
151   - 不同产品类型通常应判为“不相关”,而不是“部分相关”。
  243 +1. **商品类型是最高优先级因素。**
  244 + 如果查询明确指定了具体商品类型,那么结果必须匹配该商品类型,才可能判为“完全相关”或“基本相关”。
  245 + 不同商品类型通常应判为“弱相关”或“不相关”。
  246 +
  247 + - **弱相关**:仅当两种商品类型在风格、场景、功能上非常接近,用户有一定概率将其视为勉强可接受的替代品时使用。
  248 + - **不相关**:其他所有商品类型不匹配的情况。
152 249  
153   -2. 相似或相关的产品类型,在查询明确时通常不可互换。
  250 +2. **相似或相关的商品类型,在查询明确时通常不可直接互换,但要根据接近程度区分“弱相关”与“不相关”。**
154 251 例如:
155   - - 连衣裙 vs 半身裙 vs 连体裤
156   - - 牛仔裤 vs 裤子
157   - - T恤 vs 衬衫/上衣
158   - - 开衫 vs 毛衣
159   - - 靴子 vs 鞋子
160   - - 文胸 vs 上衣
161   - - 双肩包 vs 包
162   - 如果用户明确搜索其中一种,其他类型通常应判为“不相关”。
163   -
164   -3. 当核心产品类型匹配后,再评估属性。
165   - - 所有明确属性都匹配 → 完全相关
166   - - 部分属性缺失、无法确认,或存在一定偏差,但仍是可接受替代品 → 部分相关
167   - - 明确且重要的属性被明显违背,且不能作为合理替代品 → 不相关
168   -
169   -4. 要严格区分“未提及/无法确认”和“明确冲突”。
170   - - 如果某属性没有提及,或无法验证,优先判为“部分相关”。
171   - - 如果某属性与查询要求明确相反,则判为“不相关”;除非在购物语境下它仍明显属于可接受替代品。
  252 + - **风格 / 场景高度接近,可判为弱相关**:连衣裙 vs 半身裙、长裙 vs 中长裙、牛仔裤 vs 休闲裤、运动鞋 vs 板鞋。
  253 + - **风格 / 场景差异大,应判为不相关**:裤子 vs 鞋子、T恤 vs 帽子、靴子 vs 运动鞋、牛仔裤 vs 西装裤、双肩包 vs 手提包。
  254 +
  255 +3. **当核心商品类型匹配后,再评估属性。**
  256 + - 所有明确属性都匹配 → **完全相关**
  257 + - 部分属性缺失、未提及、无法确认,或存在轻微偏差 → **基本相关**
  258 + - 存在多个属性偏差,或某个重要属性被明显违背,但商品仍保留一定替代性 → **弱相关**
  259 + - 存在明确且重要的强冲突,且替代性极低 → **不相关**
  260 +
  261 +4. **要严格区分“未提及 / 无法确认”“轻微偏差”“明确冲突”。**
  262 + - 如果某属性没有提及,或无法验证,优先判为“基本相关”。
  263 + - 如果某属性存在轻微偏差,如颜色不同、材质不同、长度略有差异,通常判为“基本相关”。
  264 + - 如果某属性与查询要求明确相反,如无袖 vs 长袖、修身 vs 宽松阔腿,则要根据冲突严重性与替代性,在“弱相关”与“不相关”之间判断。
  265 + - 若该冲突会直接破坏用户的主要购买目标,通常判为“不相关”。
  266 +
  267 +5. **“是否可替代”应从真实电商购物意图出发判断。**
  268 + 不是只看字面相似,而要看用户在购物场景下是否可能接受该商品。
  269 + - 良好替代品 → **基本相关**
  270 + - 勉强替代品 → **弱相关**
  271 + - 几乎不可替代 → **不相关**
  272 +
  273 +6. **若商品信息不足,不要把“无法确认”误判为“冲突”。**
  274 + 商品未写明某属性,不等于该属性一定不符合。
  275 + 因此:
  276 + - 未提及 / 无法确认,优先按“基本相关”处理;
  277 + - 只有当商品信息明确显示与查询要求相反时,才视为属性冲突。
172 278  
173 279 查询:{query}
174 280  
... ... @@ -176,9 +282,10 @@ _CLASSIFY_BATCH_SIMPLE_TEMPLATE__zh = _CLASSIFY_BATCH_SIMPLE_TEMPLATE_ZH = """你
176 282 {lines}
177 283  
178 284 ## 输出格式
179   -严格输出 {n} 行,每行只能是以下三者之一:
  285 +严格输出 {n} 行,每行只能是以下四者之一:
180 286 完全相关
181   -部分相关
  287 +基本相关
  288 +弱相关
182 289 不相关
183 290  
184 291 输出行必须与上方商品顺序一一对应。
... ... @@ -186,77 +293,7 @@ _CLASSIFY_BATCH_SIMPLE_TEMPLATE__zh = _CLASSIFY_BATCH_SIMPLE_TEMPLATE_ZH = """你
186 293 """
187 294  
188 295  
189   -
190   -def classify_batch_simple_prompt(query: str, numbered_doc_lines: Sequence[str]) -> str:
  296 +def classify_prompt(query: str, numbered_doc_lines: Sequence[str]) -> str:
191 297 lines = "\n".join(numbered_doc_lines)
192 298 n = len(numbered_doc_lines)
193   - return _CLASSIFY_BATCH_SIMPLE_TEMPLATE.format(query=query, lines=lines, n=n)
194   -
195   -
196   -_EXTRACT_QUERY_PROFILE_TEMPLATE = """You are building a structured intent profile for e-commerce relevance judging.
197   -Use the original user query as the source of truth. Parser hints may help, but if a hint conflicts with the original query, trust the original query.
198   -Be conservative: only mark an attribute as required if the user explicitly asked for it.
199   -
200   -Return JSON with this schema:
201   -{{
202   - "normalized_query_en": string,
203   - "primary_category": string,
204   - "allowed_categories": [string],
205   - "required_attributes": [
206   - {{"name": string, "required_terms": [string], "conflicting_terms": [string], "match_mode": "explicit"}}
207   - ],
208   - "notes": [string]
209   -}}
210   -
211   -Guidelines:
212   -- Exact later will require explicit evidence for all required attributes.
213   -- allowed_categories should contain only near-synonyms of the same product type, not substitutes. For example dress can allow midi dress/cocktail dress, but not skirt, top, jumpsuit, or outfit unless the query explicitly asks for them.
214   -- If the query asks for dress/skirt/jeans/t-shirt, near but different product types are not Exact.
215   -- If the query includes color, fit, silhouette, or length, include them as required_attributes.
216   -- For fit words, include conflicting terms when obvious, e.g. fitted conflicts with oversized/loose; oversized conflicts with fitted/tight.
217   -- For color, include conflicting colors only when clear from the query.
218   -
219   -Original query: {query}
220   -Parser hints JSON: {hints_json}
221   -"""
222   -
223   -
224   -def extract_query_profile_prompt(query: str, parser_hints: Dict[str, Any]) -> str:
225   - hints_json = json.dumps(parser_hints, ensure_ascii=False)
226   - return _EXTRACT_QUERY_PROFILE_TEMPLATE.format(query=query, hints_json=hints_json)
227   -
228   -
229   -_CLASSIFY_BATCH_COMPLEX_TEMPLATE = """You are an e-commerce search relevance judge.
230   -Judge each product against the structured query profile below.
231   -
232   -Relevance rules:
233   -- Exact: product type matches the target intent, and every explicit required attribute is positively supported by the title/options/tags/category. If an attribute is missing or only guessed, it is NOT Exact.
234   -- Partial: main product type/use case matches, but some required attribute is missing, weaker, uncertain, or only approximately matched.
235   -- Irrelevant: product type/use case mismatched, or an explicit required attribute clearly conflicts.
236   -- Be conservative with Exact.
237   -- Graphic/holiday/message tees are not Exact for a plain color/style tee query unless that graphic/theme was requested.
238   -- Jumpsuit/romper/set is not Exact for dress/skirt/jeans queries.
239   -
240   -Original query: {query}
241   -Structured query profile JSON: {profile_json}
242   -
243   -Products:
244   -{lines}
245   -
246   -Return JSON only, with schema:
247   -{{"labels":[{{"index":1,"label":"Exact","reason":"short phrase"}}]}}
248   -"""
249   -
250   -
251   -def classify_batch_complex_prompt(
252   - query: str,
253   - query_profile: Dict[str, Any],
254   - numbered_doc_lines: Sequence[str],
255   -) -> str:
256   - lines = "\n".join(numbered_doc_lines)
257   - profile_json = json.dumps(query_profile, ensure_ascii=False)
258   - return _CLASSIFY_BATCH_COMPLEX_TEMPLATE.format(
259   - query=query,
260   - profile_json=profile_json,
261   - lines=lines,
262   - )
  299 + return _CLASSIFY_TEMPLATE_EN.format(query=query, lines=lines, n=n)
... ...
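The zh template now emits four Chinese labels per line, while the rest of the framework (reports, store, web UI) works with the English names Exact Match / High Relevant / Low Relevant / Irrelevant. A minimal sketch of the line-to-label mapping, assuming this correspondence between the Chinese and English names; the actual parsing helper is not shown in this diff, so `parse_zh_label_line` is hypothetical:

```python
# Illustrative only: maps one Chinese output line of the zh prompt to the
# framework's internal English label names. The correspondence below is an
# assumption inferred from the diff, not the repo's actual parser.
ZH_LABEL_TO_INTERNAL = {
    "完全相关": "Exact Match",
    "基本相关": "High Relevant",
    "弱相关": "Low Relevant",
    "不相关": "Irrelevant",
}


def parse_zh_label_line(line: str) -> str:
    """Map one LLM output line to an internal label; default to Irrelevant."""
    return ZH_LABEL_TO_INTERNAL.get(line.strip(), "Irrelevant")
```

Defaulting unknown lines to Irrelevant keeps a malformed LLM response from inflating the relevant counts.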
scripts/evaluation/eval_framework/reports.py
... ... @@ -4,7 +4,7 @@ from __future__ import annotations
4 4  
5 5 from typing import Any, Dict
6 6  
7   -from .constants import RELEVANCE_EXACT, RELEVANCE_IRRELEVANT, RELEVANCE_PARTIAL
  7 +from .constants import RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_IRRELEVANT, RELEVANCE_LOW
8 8  
9 9  
10 10 def render_batch_report_markdown(payload: Dict[str, Any]) -> str:
... ... @@ -29,8 +29,9 @@ def render_batch_report_markdown(payload: Dict[str, Any]) -> str:
29 29 "",
30 30 "## Label Distribution",
31 31 "",
32   - f"- Exact: {distribution.get(RELEVANCE_EXACT, 0)}",
33   - f"- Partial: {distribution.get(RELEVANCE_PARTIAL, 0)}",
  32 + f"- Exact Match: {distribution.get(RELEVANCE_EXACT, 0)}",
  33 + f"- High Relevant: {distribution.get(RELEVANCE_HIGH, 0)}",
  34 + f"- Low Relevant: {distribution.get(RELEVANCE_LOW, 0)}",
34 35 f"- Irrelevant: {distribution.get(RELEVANCE_IRRELEVANT, 0)}",
35 36 ]
36 37 )
... ... @@ -41,8 +42,9 @@ def render_batch_report_markdown(payload: Dict[str, Any]) -> str:
41 42 for key, value in sorted((item.get("metrics") or {}).items()):
42 43 lines.append(f"- {key}: {value}")
43 44 distribution = item.get("distribution") or {}
44   - lines.append(f"- Exact: {distribution.get(RELEVANCE_EXACT, 0)}")
45   - lines.append(f"- Partial: {distribution.get(RELEVANCE_PARTIAL, 0)}")
  45 + lines.append(f"- Exact Match: {distribution.get(RELEVANCE_EXACT, 0)}")
  46 + lines.append(f"- High Relevant: {distribution.get(RELEVANCE_HIGH, 0)}")
  47 + lines.append(f"- Low Relevant: {distribution.get(RELEVANCE_LOW, 0)}")
46 48 lines.append(f"- Irrelevant: {distribution.get(RELEVANCE_IRRELEVANT, 0)}")
47 49 lines.append("")
48 50 return "\n".join(lines)
... ...
scripts/evaluation/eval_framework/static/eval_web.css
... ... @@ -35,12 +35,14 @@
35 35 .results { display: grid; gap: 10px; }
36 36 .result { display: grid; grid-template-columns: 110px 100px 1fr; gap: 14px; align-items: center; background: var(--panel); border: 1px solid var(--line); border-radius: 18px; padding: 12px; }
37 37 .badge { display: inline-block; padding: 8px 10px; border-radius: 999px; color: white; font-weight: 700; text-align: center; }
38   - .Exact { background: var(--exact); }
39   - .Partial { background: var(--partial); }
40   - .Irrelevant { background: var(--irrelevant); }
41   - .Unknown { background: #637381; }
  38 + .label-exact-match { background: var(--exact); }
  39 + .label-high-relevant { background: var(--partial); }
  40 + .label-low-relevant { background: #6b5b95; }
  41 + .label-irrelevant { background: var(--irrelevant); }
  42 + .badge-unknown { background: #637381; }
42 43 .thumb { width: 100px; height: 100px; object-fit: cover; border-radius: 14px; background: #e7e1d4; }
43   - .title { font-size: 16px; font-weight: 700; margin-bottom: 8px; }
  44 + .title { font-size: 16px; font-weight: 700; margin-bottom: 4px; }
  45 + .title-zh { font-size: 14px; font-weight: 500; color: var(--muted); margin-bottom: 8px; line-height: 1.4; }
44 46 .options { color: var(--muted); line-height: 1.5; font-size: 14px; }
45 47 .section { margin-bottom: 28px; }
46 48 .history { font-size: 13px; line-height: 1.5; }
... ...
scripts/evaluation/eval_framework/static/eval_web.js
... ... @@ -13,6 +13,10 @@
13 13 root.appendChild(card);
14 14 });
15 15 }
  16 + function labelBadgeClass(label) {
  17 + if (!label || label === 'Unknown') return 'badge-unknown';
  18 + return 'label-' + String(label).toLowerCase().replace(/\s+/g, '-');
  19 + }
16 20 function renderResults(results, rootId='results', showRank=true) {
17 21 const mount = document.getElementById(rootId);
18 22 mount.innerHTML = '';
... ... @@ -21,10 +25,11 @@
21 25 const box = document.createElement('div');
22 26 box.className = 'result';
23 27 box.innerHTML = `
24   - <div><span class="badge ${label}">${label}</span><div class="muted" style="margin-top:8px">${showRank ? `#${item.rank || '-'}` : (item.rerank_score != null ? `rerank=${item.rerank_score.toFixed ? item.rerank_score.toFixed(4) : item.rerank_score}` : 'not recalled')}</div></div>
  28 + <div><span class="badge ${labelBadgeClass(label)}">${label}</span><div class="muted" style="margin-top:8px">${showRank ? `#${item.rank || '-'}` : (item.rerank_score != null ? `rerank=${item.rerank_score.toFixed ? item.rerank_score.toFixed(4) : item.rerank_score}` : 'not recalled')}</div></div>
25 29 <img class="thumb" src="${item.image_url || ''}" alt="" />
26 30 <div>
27 31 <div class="title">${item.title || ''}</div>
  32 + ${item.title_zh ? `<div class="title-zh">${item.title_zh}</div>` : ''}
28 33 <div class="options">
29 34 <div>${(item.option_values || [])[0] || ''}</div>
30 35 <div>${(item.option_values || [])[1] || ''}</div>
... ... @@ -41,7 +46,7 @@
41 46 const root = document.getElementById('tips');
42 47 const tips = [...(data.tips || [])];
43 48 const stats = data.label_stats || {};
44   - tips.unshift(`Cached labels for query: ${stats.total || 0}. Recalled hits: ${stats.recalled_hits || 0}. Missed Exact: ${stats.missing_exact_count || 0}. Missed Partial: ${stats.missing_partial_count || 0}.`);
  49 + tips.unshift(`Cached labels for query: ${stats.total || 0}. Recalled hits: ${stats.recalled_hits || 0}. Missed (non-irrelevant): ${stats.missing_relevant_count || 0} — Exact: ${stats.missing_exact_count || 0}, High: ${stats.missing_high_count || 0}, Low: ${stats.missing_low_count || 0}.`);
45 50 root.innerHTML = tips.map(text => `<div class="tip">${text}</div>`).join('');
46 51 }
47 52 async function loadQueries() {
... ...
scripts/evaluation/eval_framework/static/index.html
... ... @@ -37,7 +37,7 @@
37 37 <div id="results" class="results"></div>
38 38 </section>
39 39 <section class="section">
40   - <h2>Missed Exact / Partial</h2>
  40 + <h2>Missed non-irrelevant (cached)</h2>
41 41 <div id="missingRelevant" class="results"></div>
42 42 </section>
43 43 <section class="section">
... ...
scripts/evaluation/eval_framework/store.py
... ... @@ -8,7 +8,7 @@ from dataclasses import dataclass
8 8 from pathlib import Path
9 9 from typing import Any, Dict, List, Optional, Sequence
10 10  
11   -from .constants import VALID_LABELS
  11 +from .constants import VALID_LABELS, normalize_stored_label
12 12 from .utils import ensure_dir, safe_json_dumps, utc_now_iso
13 13  
14 14  
... ... @@ -220,7 +220,7 @@ class EvalStore:
220 220 """,
221 221 (tenant_id, query_text),
222 222 ).fetchall()
223   - return {str(row["spu_id"]): str(row["label"]) for row in rows}
  223 + return {str(row["spu_id"]): normalize_stored_label(str(row["label"])) for row in rows}
224 224  
225 225 def upsert_labels(
226 226 self,
... ... @@ -379,8 +379,9 @@ class EvalStore:
379 379 SELECT
380 380 query_text,
381 381 COUNT(*) AS total,
382   - SUM(CASE WHEN label='Exact' THEN 1 ELSE 0 END) AS exact_count,
383   - SUM(CASE WHEN label='Partial' THEN 1 ELSE 0 END) AS partial_count,
  382 + SUM(CASE WHEN label IN ('Exact Match','Exact') THEN 1 ELSE 0 END) AS exact_count,
  383 + SUM(CASE WHEN label IN ('High Relevant','Partial') THEN 1 ELSE 0 END) AS high_relevant_count,
  384 + SUM(CASE WHEN label='Low Relevant' THEN 1 ELSE 0 END) AS low_relevant_count,
384 385 SUM(CASE WHEN label='Irrelevant' THEN 1 ELSE 0 END) AS irrelevant_count,
385 386 MAX(updated_at) AS updated_at
386 387 FROM relevance_labels
... ... @@ -395,7 +396,8 @@ class EvalStore:
395 396 "query": str(row["query_text"]),
396 397 "total": int(row["total"]),
397 398 "exact_count": int(row["exact_count"] or 0),
398   - "partial_count": int(row["partial_count"] or 0),
  399 + "high_relevant_count": int(row["high_relevant_count"] or 0),
  400 + "low_relevant_count": int(row["low_relevant_count"] or 0),
399 401 "irrelevant_count": int(row["irrelevant_count"] or 0),
400 402 "updated_at": row["updated_at"],
401 403 }
... ... @@ -407,8 +409,9 @@ class EvalStore:
407 409 """
408 410 SELECT
409 411 COUNT(*) AS total,
410   - SUM(CASE WHEN label='Exact' THEN 1 ELSE 0 END) AS exact_count,
411   - SUM(CASE WHEN label='Partial' THEN 1 ELSE 0 END) AS partial_count,
  412 + SUM(CASE WHEN label IN ('Exact Match','Exact') THEN 1 ELSE 0 END) AS exact_count,
  413 + SUM(CASE WHEN label IN ('High Relevant','Partial') THEN 1 ELSE 0 END) AS high_relevant_count,
  414 + SUM(CASE WHEN label='Low Relevant' THEN 1 ELSE 0 END) AS low_relevant_count,
412 415 SUM(CASE WHEN label='Irrelevant' THEN 1 ELSE 0 END) AS irrelevant_count,
413 416 MAX(updated_at) AS updated_at
414 417 FROM relevance_labels
... ... @@ -420,7 +423,8 @@ class EvalStore:
420 423 "query": query_text,
421 424 "total": int((row["total"] or 0) if row else 0),
422 425 "exact_count": int((row["exact_count"] or 0) if row else 0),
423   - "partial_count": int((row["partial_count"] or 0) if row else 0),
  426 + "high_relevant_count": int((row["high_relevant_count"] or 0) if row else 0),
  427 + "low_relevant_count": int((row["low_relevant_count"] or 0) if row else 0),
424 428 "irrelevant_count": int((row["irrelevant_count"] or 0) if row else 0),
425 429 "updated_at": row["updated_at"] if row else None,
426 430 }
... ...
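`store.py` now routes every label read through `normalize_stored_label` from `.constants`, whose body is not part of this diff. Based on the SQL above, which counts both legacy and new spellings in the same buckets, a plausible sketch (assumptions: legacy 'Exact' folds into 'Exact Match', 'Partial' into 'High Relevant', anything else passes through unchanged; the real implementation may differ):

```python
# Hypothetical sketch of normalize_stored_label as imported in the diff.
# Legacy three-level labels are folded into the new four-level names so
# rows written before the migration keep counting correctly.
_LEGACY_TO_CURRENT = {
    "Exact": "Exact Match",
    "Partial": "High Relevant",
}


def normalize_stored_label(label: str) -> str:
    """Return the current label name for a possibly-legacy stored label."""
    return _LEGACY_TO_CURRENT.get(label, label)
```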
scripts/evaluation/eval_framework/utils.py
... ... @@ -42,6 +42,14 @@ def pick_text(value: Any, preferred_lang: str = "en") -> str:
42 42 return str(value).strip()
43 43  
44 44  
  45 +def zh_title_from_multilingual(title_multilingual: Any) -> str:
  46 + """Chinese title string from API debug ``title_multilingual`` (ES-style dict)."""
  47 + if not isinstance(title_multilingual, dict):
  48 + return ""
  49 + zh = str(title_multilingual.get("zh") or "").strip()
  50 + return zh
  51 +
  52 +
45 53 def safe_json_dumps(data: Any) -> str:
46 54 return json.dumps(data, ensure_ascii=False, separators=(",", ":"))
47 55  
... ...
scripts/evaluation/quick_start_eval.sh renamed to scripts/evaluation/start_eval.sh
... ... @@ -10,7 +10,7 @@ QUERIES="${REPO_EVAL_QUERIES:-scripts/evaluation/queries/queries.txt}"
10 10  
11 11 usage() {
12 12 echo "Usage: $0 batch|batch-rebuild|serve"
13   - echo " batch — batch eval: live search every query, LLM only for missing labels (top_k=50, simple)"
  13 + echo " batch — batch eval: live search every query, LLM only for missing labels (top_k=50)"
14 14 echo " batch-rebuild — deep rebuild: build --force-refresh-labels (search recall pool + full-corpus rerank + batched LLM; expensive)"
15 15 echo " serve — eval UI (default http://0.0.0.0:\${EVAL_WEB_PORT:-6010}/; also: ./scripts/start_eval_web.sh)"
16 16 echo "Env: TENANT_ID (default 163), REPO_EVAL_QUERIES, EVAL_WEB_HOST, EVAL_WEB_PORT (default 6010)"
... ... @@ -22,8 +22,7 @@ case "${1:-}" in
22 22 --tenant-id "$TENANT_ID" \
23 23 --queries-file "$QUERIES" \
24 24 --top-k 50 \
25   - --language en \
26   - --labeler-mode simple
  25 + --language en
27 26 ;;
28 27 batch-rebuild)
29 28 exec "$PY" scripts/evaluation/build_annotation_set.py build \
... ... @@ -33,8 +32,7 @@ case "${1:-}" in
33 32 --rerank-depth 10000 \
34 33 --force-refresh-rerank \
35 34 --force-refresh-labels \
36   - --language en \
37   - --labeler-mode simple
  35 + --language en
38 36 ;;
39 37 serve)
40 38 EVAL_WEB_PORT="${EVAL_WEB_PORT:-6010}"
... ...
tests/test_product_enrich_partial_mode.py
... ... @@ -322,6 +322,109 @@ def test_analyze_products_uses_product_level_cache_across_batch_requests():
322 322 assert third[1]["anchor_text"] == "anchor:shirt"
323 323  
324 324  
  325 +def test_analyze_products_reuses_cached_content_with_current_product_identity():
  326 + cached_result = {
  327 + "id": "1165",
  328 + "lang": "zh",
  329 + "title_input": "old-title",
  330 + "title": "法式连衣裙",
  331 + "category_path": "女装>连衣裙",
  332 + "enriched_tags": "法式,收腰",
  333 + "target_audience": "年轻女性",
  334 + "usage_scene": "通勤,约会",
  335 + "season": "春季,夏季",
  336 + "key_attributes": "中长款",
  337 + "material": "聚酯纤维",
  338 + "features": "透气",
  339 + "anchor_text": "法式收腰连衣裙",
  340 + }
  341 + products = [{"id": "69960", "title": "dress"}]
  342 +
  343 + with mock.patch.object(product_enrich, "API_KEY", "fake-key"), mock.patch.object(
  344 + product_enrich,
  345 + "_get_cached_anchor_result",
  346 + wraps=lambda product, target_lang: product_enrich._normalize_analysis_result(
  347 + cached_result,
  348 + product=product,
  349 + target_lang=target_lang,
  350 + ),
  351 + ), mock.patch.object(
  352 + product_enrich,
  353 + "process_batch",
  354 + side_effect=AssertionError("process_batch should not be called on cache hit"),
  355 + ):
  356 + result = product_enrich.analyze_products(
  357 + products,
  358 + target_lang="zh",
  359 + tenant_id="170",
  360 + )
  361 +
  362 + assert result == [
  363 + {
  364 + "id": "69960",
  365 + "lang": "zh",
  366 + "title_input": "dress",
  367 + "title": "法式连衣裙",
  368 + "category_path": "女装>连衣裙",
  369 + "tags": "法式,收腰",
  370 + "target_audience": "年轻女性",
  371 + "usage_scene": "通勤,约会",
  372 + "season": "春季,夏季",
  373 + "key_attributes": "中长款",
  374 + "material": "聚酯纤维",
  375 + "features": "透气",
  376 + "anchor_text": "法式收腰连衣裙",
  377 + }
  378 + ]
  379 +
  380 +
  381 +def test_build_index_content_fields_maps_internal_tags_to_enriched_tags_output():
  382 + def fake_analyze_products(products, target_lang="zh", batch_size=None, tenant_id=None):
  383 + return [
  384 + {
  385 + "id": products[0]["id"],
  386 + "lang": target_lang,
  387 + "title_input": products[0]["title"],
  388 + "title": products[0]["title"],
  389 + "category_path": "玩具>滑行玩具",
  390 + "tags": f"{target_lang}-tag1,{target_lang}-tag2",
  391 + "target_audience": f"{target_lang}-audience",
  392 + "usage_scene": "",
  393 + "season": "",
  394 + "key_attributes": "",
  395 + "material": "",
  396 + "features": "",
  397 + "anchor_text": f"{target_lang}-anchor",
  398 + }
  399 + ]
  400 +
  401 + with mock.patch.object(
  402 + product_enrich,
  403 + "analyze_products",
  404 + side_effect=fake_analyze_products,
  405 + ):
  406 + result = product_enrich.build_index_content_fields(
  407 + items=[{"spu_id": "69960", "title": "dress"}],
  408 + tenant_id="170",
  409 + )
  410 +
  411 + assert result == [
  412 + {
  413 + "id": "69960",
  414 + "qanchors": {"zh": ["zh-anchor"], "en": ["en-anchor"]},
  415 + "enriched_tags": {"zh": ["zh-tag1", "zh-tag2"], "en": ["en-tag1", "en-tag2"]},
  416 + "enriched_attributes": [
  417 + {"name": "enriched_tags", "value": {"zh": "zh-tag1"}},
  418 + {"name": "enriched_tags", "value": {"zh": "zh-tag2"}},
  419 + {"name": "target_audience", "value": {"zh": "zh-audience"}},
  420 + {"name": "enriched_tags", "value": {"en": "en-tag1"}},
  421 + {"name": "enriched_tags", "value": {"en": "en-tag2"}},
  422 + {"name": "target_audience", "value": {"en": "en-audience"}},
  423 + ],
  424 + }
  425 + ]
  426 +
  427 +
325 428 def test_anchor_cache_key_depends_on_product_input_not_identifiers():
326 429 product_a = {
327 430 "id": "1",
... ...