Compare View
Commits (7)
-
从根因上修复 `indexer/product_enrich.py` 的缓存复用问题,不是再补一层判断。 根因有两个:缓存 key 按内容复用,但缓存值里还带着旧商品的 `id/title_input`;同时内部分析结果在历史上混用了 `tags` 和 `enriched_tags`。这样一旦命中旧缓存,`build_index_content_fields()` 会因为 `id` 对不上把结果丢掉,最后对外就变成全空。 现在的处理是: - 内部分析结果统一用 `tags` 作为 LLM/缓存层字段。 - 对外只在 `build_index_content_fields()` 封装时映射成 `enriched_tags`,`enriched_attributes` 里也统一产出 `name="enriched_tags"`。 - 读取缓存时会先做归一化:把旧缓存里的 `enriched_tags` 兼容成内部 `tags`,并把命中的缓存结果重绑到当前请求商品的 `id/title_input`。 - 写缓存时也统一写成归一化后的内部结构,并且空内容不再写入缓存。
-
1. 召回池 Top-K(DEFAULT_SEARCH_RECALL_TOP_K) scripts/evaluation/eval_framework/constants.py:500 → 200 Rebuild 里 rank <= recall_n 的 rerank_score: 1.0 仍按该 K 生效。 2. LLM 批次上下限 最少批次:DEFAULT_REBUILD_MIN_LLM_BATCHES 20 → 10 最多批次:仍为 40(未改) 3. 提前结束条件(_annotate_rebuild_batches) 在已跑满 min_batches 之后,对每个批次: 本批无 Exact(exact_n == 0),且满足其一即视为 bad batch: irrelevant_ratio >= 0.94 或 (irrelevant + Low Relevant) / n >= 0.96(弱相关用 RELEVANCE_LOW) 连续 2 个 bad batch 则 early stop(原先是连续 3 次、irrelevant > 0.92)。 批次日志里增加了 low_ratio、irrelevant_plus_low_ratio;rebuild 元数据里增加了 rebuild_irrel_low_combined_stop_ratio。 4. CLI --search-recall-top-k 说明改为默认 200 --rebuild-min-batches 说明改为默认 10 --rebuild-irrelevant-stop-ratio / --rebuild-irrelevant-stop-streak 说明与新逻辑一致 新增 --rebuild-irrel-low-combined-stop-ratio(默认 0.96)
Showing
20 changed files
Show diff stats
docs/Usage-Guide.md
| ... | ... | @@ -202,7 +202,7 @@ python -m pytest -q tests/test_rerank_client.py tests/test_es_query_builder.py t |
| 202 | 202 | ./scripts/service_ctl.sh restart backend |
| 203 | 203 | sleep 3 |
| 204 | 204 | ./scripts/service_ctl.sh status backend |
| 205 | -./scripts/evaluation/quick_start_eval.sh batch | |
| 205 | +./scripts/evaluation/start_eval.sh batch | |
| 206 | 206 | ``` |
| 207 | 207 | |
| 208 | 208 | 离线批量评估会把标注与报表写到 `artifacts/search_evaluation/`(SQLite、`batch_reports/` 下的 JSON/Markdown 等)。说明与命令见 [scripts/evaluation/README.md](../scripts/evaluation/README.md)。 | ... | ... |
docs/issue-2026-03-31-评估框架-done-0331.md
| ... | ... | @@ -138,7 +138,7 @@ queries默认是queries/queries.txt,填入左侧列表框,点击其中任何 |
| 138 | 138 | |
| 139 | 139 | |
| 140 | 140 | @scripts/evaluation/README.md @scripts/evaluation/eval_framework/framework.py |
| 141 | -@quick_start_eval.sh (29-35) | |
| 141 | +@start_eval.sh (29-35) | |
| 142 | 142 | 请以如下流程为准,进行改造: |
| 143 | 143 | 如果重建的话,对每个query: |
| 144 | 144 | 每个搜索结果应该会扫描全库, | ... | ... |
docs/相关性检索优化说明.md
| ... | ... | @@ -240,7 +240,7 @@ python -m pytest -q tests/test_rerank_client.py tests/test_es_query_builder.py t |
| 240 | 240 | ./scripts/service_ctl.sh restart backend |
| 241 | 241 | sleep 3 |
| 242 | 242 | ./scripts/service_ctl.sh status backend |
| 243 | -./scripts/evaluation/quick_start_eval.sh batch | |
| 243 | +./scripts/evaluation/start_eval.sh batch | |
| 244 | 244 | ``` |
| 245 | 245 | |
| 246 | 246 | 评估产物在 `artifacts/search_evaluation/`(如 `search_eval.sqlite3`、`batch_reports/` 下的 JSON/Markdown)。流程与参数说明见 [scripts/evaluation/README.md](../scripts/evaluation/README.md)。 | ... | ... |
indexer/product_enrich.py
| ... | ... | @@ -147,15 +147,40 @@ if _missing_prompt_langs: |
| 147 | 147 | # 多值字段分隔:英文逗号、中文逗号、顿号,及历史约定的 ; | / 与空白 |
| 148 | 148 | _MULTI_VALUE_FIELD_SPLIT_RE = re.compile(r"[,、,;|/\n\t]+") |
| 149 | 149 | _CORE_INDEX_LANGUAGES = ("zh", "en") |
| 150 | -_ENRICHED_ATTRIBUTE_DIMENSIONS = ( | |
| 151 | - "enriched_tags", | |
| 150 | +_ANALYSIS_ATTRIBUTE_FIELD_MAP = ( | |
| 151 | + ("tags", "enriched_tags"), | |
| 152 | + ("target_audience", "target_audience"), | |
| 153 | + ("usage_scene", "usage_scene"), | |
| 154 | + ("season", "season"), | |
| 155 | + ("key_attributes", "key_attributes"), | |
| 156 | + ("material", "material"), | |
| 157 | + ("features", "features"), | |
| 158 | +) | |
| 159 | +_ANALYSIS_RESULT_FIELDS = ( | |
| 160 | + "title", | |
| 161 | + "category_path", | |
| 162 | + "tags", | |
| 163 | + "target_audience", | |
| 164 | + "usage_scene", | |
| 165 | + "season", | |
| 166 | + "key_attributes", | |
| 167 | + "material", | |
| 168 | + "features", | |
| 169 | + "anchor_text", | |
| 170 | +) | |
| 171 | +_ANALYSIS_MEANINGFUL_FIELDS = ( | |
| 172 | + "tags", | |
| 152 | 173 | "target_audience", |
| 153 | 174 | "usage_scene", |
| 154 | 175 | "season", |
| 155 | 176 | "key_attributes", |
| 156 | 177 | "material", |
| 157 | 178 | "features", |
| 179 | + "anchor_text", | |
| 158 | 180 | ) |
| 181 | +_ANALYSIS_FIELD_ALIASES = { | |
| 182 | + "tags": ("tags", "enriched_tags"), | |
| 183 | +} | |
| 159 | 184 | |
| 160 | 185 | |
| 161 | 186 | def split_multi_value_field(text: Optional[str]) -> List[str]: |
| ... | ... | @@ -195,25 +220,104 @@ def _append_enriched_attribute( |
| 195 | 220 | target.append({"name": name, "value": {lang: value}}) |
| 196 | 221 | |
| 197 | 222 | |
| 223 | +def _get_product_id(product: Dict[str, Any]) -> str: | |
| 224 | + return str(product.get("id") or product.get("spu_id") or "").strip() | |
| 225 | + | |
| 226 | + | |
| 227 | +def _get_analysis_field_aliases(field_name: str) -> Tuple[str, ...]: | |
| 228 | + return _ANALYSIS_FIELD_ALIASES.get(field_name, (field_name,)) | |
| 229 | + | |
| 230 | + | |
| 231 | +def _get_analysis_field_value(row: Dict[str, Any], field_name: str) -> Any: | |
| 232 | + for alias in _get_analysis_field_aliases(field_name): | |
| 233 | + if alias in row: | |
| 234 | + return row.get(alias) | |
| 235 | + return None | |
| 236 | + | |
| 237 | + | |
| 238 | +def _has_meaningful_value(value: Any) -> bool: | |
| 239 | + if value is None: | |
| 240 | + return False | |
| 241 | + if isinstance(value, str): | |
| 242 | + return bool(value.strip()) | |
| 243 | + if isinstance(value, dict): | |
| 244 | + return any(_has_meaningful_value(v) for v in value.values()) | |
| 245 | + if isinstance(value, list): | |
| 246 | + return any(_has_meaningful_value(v) for v in value) | |
| 247 | + return bool(value) | |
| 248 | + | |
| 249 | + | |
| 250 | +def _make_empty_analysis_result( | |
| 251 | + product: Dict[str, Any], | |
| 252 | + target_lang: str, | |
| 253 | + error: Optional[str] = None, | |
| 254 | +) -> Dict[str, Any]: | |
| 255 | + result = { | |
| 256 | + "id": _get_product_id(product), | |
| 257 | + "lang": target_lang, | |
| 258 | + "title_input": str(product.get("title") or "").strip(), | |
| 259 | + } | |
| 260 | + for field in _ANALYSIS_RESULT_FIELDS: | |
| 261 | + result[field] = "" | |
| 262 | + if error: | |
| 263 | + result["error"] = error | |
| 264 | + return result | |
| 265 | + | |
| 266 | + | |
| 267 | +def _normalize_analysis_result( | |
| 268 | + result: Dict[str, Any], | |
| 269 | + product: Dict[str, Any], | |
| 270 | + target_lang: str, | |
| 271 | +) -> Dict[str, Any]: | |
| 272 | + normalized = _make_empty_analysis_result(product, target_lang) | |
| 273 | + if not isinstance(result, dict): | |
| 274 | + return normalized | |
| 275 | + | |
| 276 | + normalized["lang"] = str(result.get("lang") or target_lang).strip() or target_lang | |
| 277 | + normalized["title"] = str(result.get("title") or "").strip() | |
| 278 | + normalized["category_path"] = str(result.get("category_path") or "").strip() | |
| 279 | + normalized["title_input"] = str( | |
| 280 | + product.get("title") or result.get("title_input") or "" | |
| 281 | + ).strip() | |
| 282 | + | |
| 283 | + for field in _ANALYSIS_RESULT_FIELDS: | |
| 284 | + if field in {"title", "category_path"}: | |
| 285 | + continue | |
| 286 | + normalized[field] = str(_get_analysis_field_value(result, field) or "").strip() | |
| 287 | + | |
| 288 | + if result.get("error"): | |
| 289 | + normalized["error"] = str(result.get("error")) | |
| 290 | + return normalized | |
| 291 | + | |
| 292 | + | |
| 293 | +def _has_meaningful_analysis_content(result: Dict[str, Any]) -> bool: | |
| 294 | + return any(_has_meaningful_value(result.get(field)) for field in _ANALYSIS_MEANINGFUL_FIELDS) | |
| 295 | + | |
| 296 | + | |
| 198 | 297 | def _apply_index_content_row(result: Dict[str, Any], row: Dict[str, Any], lang: str) -> None: |
| 199 | 298 | if not row or row.get("error"): |
| 200 | 299 | return |
| 201 | 300 | |
| 202 | - anchor_text = str(row.get("anchor_text") or "").strip() | |
| 301 | + anchor_text = str(_get_analysis_field_value(row, "anchor_text") or "").strip() | |
| 203 | 302 | if anchor_text: |
| 204 | 303 | _append_lang_phrase_map(result["qanchors"], lang=lang, raw_value=anchor_text) |
| 205 | 304 | |
| 206 | - for name in _ENRICHED_ATTRIBUTE_DIMENSIONS: | |
| 207 | - raw = row.get(name) | |
| 305 | + for source_name, output_name in _ANALYSIS_ATTRIBUTE_FIELD_MAP: | |
| 306 | + raw = _get_analysis_field_value(row, source_name) | |
| 208 | 307 | if not raw: |
| 209 | 308 | continue |
| 210 | - _append_enriched_attribute(result["enriched_attributes"], name=name, lang=lang, raw_value=raw) | |
| 211 | - if name == "enriched_tags": | |
| 309 | + _append_enriched_attribute( | |
| 310 | + result["enriched_attributes"], | |
| 311 | + name=output_name, | |
| 312 | + lang=lang, | |
| 313 | + raw_value=raw, | |
| 314 | + ) | |
| 315 | + if output_name == "enriched_tags": | |
| 212 | 316 | _append_lang_phrase_map(result["enriched_tags"], lang=lang, raw_value=raw) |
| 213 | 317 | |
| 214 | 318 | |
| 215 | 319 | def _normalize_index_content_item(item: Dict[str, Any]) -> Dict[str, str]: |
| 216 | - item_id = str(item.get("id") or item.get("spu_id") or "").strip() | |
| 320 | + item_id = _get_product_id(item) | |
| 217 | 321 | return { |
| 218 | 322 | "id": item_id, |
| 219 | 323 | "title": str(item.get("title") or "").strip(), |
| ... | ... | @@ -369,7 +473,10 @@ def _get_cached_anchor_result( |
| 369 | 473 | raw = _anchor_redis.get(key) |
| 370 | 474 | if not raw: |
| 371 | 475 | return None |
| 372 | - return json.loads(raw) | |
| 476 | + result = _normalize_analysis_result(json.loads(raw), product=product, target_lang=target_lang) | |
| 477 | + if not _has_meaningful_analysis_content(result): | |
| 478 | + return None | |
| 479 | + return result | |
| 373 | 480 | except Exception as e: |
| 374 | 481 | logger.warning(f"Failed to get anchor cache: {e}") |
| 375 | 482 | return None |
| ... | ... | @@ -383,9 +490,12 @@ def _set_cached_anchor_result( |
| 383 | 490 | if not _anchor_redis: |
| 384 | 491 | return |
| 385 | 492 | try: |
| 493 | + normalized = _normalize_analysis_result(result, product=product, target_lang=target_lang) | |
| 494 | + if not _has_meaningful_analysis_content(normalized): | |
| 495 | + return | |
| 386 | 496 | key = _make_anchor_cache_key(product, target_lang) |
| 387 | 497 | ttl = ANCHOR_CACHE_EXPIRE_DAYS * 24 * 3600 |
| 388 | - _anchor_redis.setex(key, ttl, json.dumps(result, ensure_ascii=False)) | |
| 498 | + _anchor_redis.setex(key, ttl, json.dumps(normalized, ensure_ascii=False)) | |
| 389 | 499 | except Exception as e: |
| 390 | 500 | logger.warning(f"Failed to set anchor cache: {e}") |
| 391 | 501 | |
| ... | ... | @@ -654,7 +764,7 @@ def parse_markdown_table(markdown_content: str) -> List[Dict[str, str]]: |
| 654 | 764 | "seq_no": parts[0], |
| 655 | 765 | "title": parts[1], # 商品标题(按目标语言) |
| 656 | 766 | "category_path": parts[2] if len(parts) > 2 else "", # 品类路径 |
| 657 | - "enriched_tags": parts[3] if len(parts) > 3 else "", # 细分标签 | |
| 767 | + "tags": parts[3] if len(parts) > 3 else "", # 细分标签 | |
| 658 | 768 | "target_audience": parts[4] if len(parts) > 4 else "", # 适用人群 |
| 659 | 769 | "usage_scene": parts[5] if len(parts) > 5 else "", # 使用场景 |
| 660 | 770 | "season": parts[6] if len(parts) > 6 else "", # 适用季节 |
| ... | ... | @@ -705,7 +815,7 @@ def process_batch( |
| 705 | 815 | batch_data: List[Dict[str, str]], |
| 706 | 816 | batch_num: int, |
| 707 | 817 | target_lang: str = "zh", |
| 708 | -) -> List[Dict[str, str]]: | |
| 818 | +) -> List[Dict[str, Any]]: | |
| 709 | 819 | """处理一个批次的数据""" |
| 710 | 820 | logger.info(f"\n{'#' * 80}") |
| 711 | 821 | logger.info(f"Processing Batch {batch_num} ({len(batch_data)} items)") |
| ... | ... | @@ -725,22 +835,11 @@ def process_batch( |
| 725 | 835 | target_lang, |
| 726 | 836 | ) |
| 727 | 837 | return [ |
| 728 | - { | |
| 729 | - "id": item["id"], | |
| 730 | - "lang": target_lang, | |
| 731 | - "title_input": item.get("title", ""), | |
| 732 | - "title": "", | |
| 733 | - "category_path": "", | |
| 734 | - "enriched_tags": "", | |
| 735 | - "target_audience": "", | |
| 736 | - "usage_scene": "", | |
| 737 | - "season": "", | |
| 738 | - "key_attributes": "", | |
| 739 | - "material": "", | |
| 740 | - "features": "", | |
| 741 | - "anchor_text": "", | |
| 742 | - "error": f"prompt_creation_failed: unsupported target_lang={target_lang}", | |
| 743 | - } | |
| 838 | + _make_empty_analysis_result( | |
| 839 | + item, | |
| 840 | + target_lang, | |
| 841 | + error=f"prompt_creation_failed: unsupported target_lang={target_lang}", | |
| 842 | + ) | |
| 744 | 843 | for item in batch_data |
| 745 | 844 | ] |
| 746 | 845 | |
| ... | ... | @@ -764,24 +863,18 @@ def process_batch( |
| 764 | 863 | results_with_ids = [] |
| 765 | 864 | for i, parsed_item in enumerate(parsed_results): |
| 766 | 865 | if i < len(batch_data): |
| 767 | - original_id = batch_data[i]["id"] | |
| 768 | - result = { | |
| 769 | - "id": original_id, | |
| 770 | - "lang": target_lang, | |
| 771 | - "title_input": batch_data[i]["title"], # 原始输入标题 | |
| 772 | - "title": parsed_item.get("title", ""), # 模型生成的标题 | |
| 773 | - "category_path": parsed_item.get("category_path", ""), # 品类路径 | |
| 774 | - "enriched_tags": parsed_item.get("enriched_tags", ""), # 细分标签 | |
| 775 | - "target_audience": parsed_item.get("target_audience", ""), # 适用人群 | |
| 776 | - "usage_scene": parsed_item.get("usage_scene", ""), # 使用场景 | |
| 777 | - "season": parsed_item.get("season", ""), # 适用季节 | |
| 778 | - "key_attributes": parsed_item.get("key_attributes", ""), # 关键属性 | |
| 779 | - "material": parsed_item.get("material", ""), # 材质说明 | |
| 780 | - "features": parsed_item.get("features", ""), # 功能特点 | |
| 781 | - "anchor_text": parsed_item.get("anchor_text", ""), # 锚文本 | |
| 782 | - } | |
| 866 | + source_product = batch_data[i] | |
| 867 | + result = _normalize_analysis_result( | |
| 868 | + parsed_item, | |
| 869 | + product=source_product, | |
| 870 | + target_lang=target_lang, | |
| 871 | + ) | |
| 783 | 872 | results_with_ids.append(result) |
| 784 | - logger.info(f"Mapped: seq={parsed_item['seq_no']} -> original_id={original_id}") | |
| 873 | + logger.info( | |
| 874 | + "Mapped: seq=%s -> original_id=%s", | |
| 875 | + parsed_item.get("seq_no"), | |
| 876 | + source_product.get("id"), | |
| 877 | + ) | |
| 785 | 878 | |
| 786 | 879 | # 保存批次 JSON 日志到独立文件 |
| 787 | 880 | batch_log = { |
| ... | ... | @@ -808,22 +901,7 @@ def process_batch( |
| 808 | 901 | logger.error(f"Error processing batch {batch_num}: {str(e)}", exc_info=True) |
| 809 | 902 | # 返回空结果,保持ID映射 |
| 810 | 903 | return [ |
| 811 | - { | |
| 812 | - "id": item["id"], | |
| 813 | - "lang": target_lang, | |
| 814 | - "title_input": item["title"], | |
| 815 | - "title": "", | |
| 816 | - "category_path": "", | |
| 817 | - "enriched_tags": "", | |
| 818 | - "target_audience": "", | |
| 819 | - "usage_scene": "", | |
| 820 | - "season": "", | |
| 821 | - "key_attributes": "", | |
| 822 | - "material": "", | |
| 823 | - "features": "", | |
| 824 | - "anchor_text": "", | |
| 825 | - "error": str(e), | |
| 826 | - } | |
| 904 | + _make_empty_analysis_result(item, target_lang, error=str(e)) | |
| 827 | 905 | for item in batch_data |
| 828 | 906 | ] |
| 829 | 907 | ... | ... |
scripts/evaluation/README.md
| ... | ... | @@ -23,7 +23,7 @@ This directory holds the offline annotation builder, the evaluation web UI/API, |
| 23 | 23 | | `fusion_experiments_round1.json` | Broader first-round experiments | |
| 24 | 24 | | `queries/queries.txt` | Canonical evaluation queries | |
| 25 | 25 | | `README_Requirement.md` | Product/requirements reference | |
| 26 | -| `quick_start_eval.sh` | Wrapper: `batch`, `batch-rebuild` (deep `build` + `--force-refresh-labels`), or `serve` | | |
| 26 | +| `start_eval.sh` | Wrapper: `batch`, `batch-rebuild` (deep `build` + `--force-refresh-labels`), or `serve` | | |
| 27 | 27 | | `../start_eval_web.sh` | Same as `serve` with `activate.sh`; use `./scripts/service_ctl.sh start eval-web` (default port **6010**, override with `EVAL_WEB_PORT`). `./run.sh all` includes eval-web. | |
| 28 | 28 | |
| 29 | 29 | ## Quick start (repo root) |
| ... | ... | @@ -32,13 +32,13 @@ Set tenant if needed (`export TENANT_ID=163`). You need a live search API, DashS |
| 32 | 32 | |
| 33 | 33 | ```bash |
| 34 | 34 | # Batch: live search for every query; only uncached (query, spu_id) pairs hit the LLM |
| 35 | -./scripts/evaluation/quick_start_eval.sh batch | |
| 35 | +./scripts/evaluation/start_eval.sh batch | |
| 36 | 36 | |
| 37 | -# Deep rebuild: search recall top-500 (score 1) + full-corpus rerank outside pool + batched LLM (early stop; expensive) | |
| 38 | -./scripts/evaluation/quick_start_eval.sh batch-rebuild | |
| 37 | +# Deep rebuild: per-query full corpus rerank (outside search recall pool) + LLM in batches along global sort order (early stop; expensive) | |
| 38 | +./scripts/evaluation/start_eval.sh batch-rebuild | |
| 39 | 39 | |
| 40 | 40 | # UI: http://127.0.0.1:6010/ |
| 41 | -./scripts/evaluation/quick_start_eval.sh serve | |
| 41 | +./scripts/evaluation/start_eval.sh serve | |
| 42 | 42 | # or: ./scripts/service_ctl.sh start eval-web |
| 43 | 43 | ``` |
| 44 | 44 | |
| ... | ... | @@ -69,9 +69,36 @@ Explicit equivalents: |
| 69 | 69 | --port 6010 |
| 70 | 70 | ``` |
| 71 | 71 | |
| 72 | -Each `batch` run walks the full queries file. With `batch --force-refresh-labels`, every live top-`k` hit is re-judged by the LLM. | |
| 72 | +Each `batch` run walks the full queries file and writes a **batch report** under `batch_reports/`. With `batch --force-refresh-labels`, every live top-`k` hit is re-judged by the LLM (still only those hits—not the deep rebuild pipeline). | |
| 73 | 73 | |
| 74 | -**Rebuild (`build --force-refresh-labels`):** For each query: take search top **500** as the recall pool (treated as rerank score **1**; those SKUs are not sent to the reranker). Rerank the rest of the tenant corpus; if more than **1000** non-pool docs have rerank score **> 0.5**, the query is **skipped** (logged as too easy / tail too relevant). Otherwise merge pool (search order) + non-pool (rerank score descending), then LLM-judge in batches of **50**, logging **exact_ratio** and **irrelevant_ratio** per batch. Stop after **3** consecutive batches with irrelevant_ratio **> 92%**, but only after at least **15** batches and at most **40** batches. | |
| 74 | +### `start_eval.sh batch-rebuild` (deep annotation rebuild) | |
| 75 | + | |
| 76 | +This runs `build_annotation_set.py build` with **`--force-refresh-labels`** and **`--force-refresh-rerank`** (see the explicit command block below). It does **not** run the `batch` subcommand: there is **no** aggregate batch report for this step; outputs are per-query JSON under `query_builds/` plus updates in `search_eval.sqlite3`. | |
| 77 | + | |
| 78 | +For **each** query in `queries.txt`, in order: | |
| 79 | + | |
| 80 | +1. **Search recall** — Call the live search API with `size = max(--search-depth, --search-recall-top-k)` (the wrapper uses `--search-depth 500`). The first **`--search-recall-top-k`** hits (default **200**, see `eval_framework.constants.DEFAULT_SEARCH_RECALL_TOP_K`) form the **recall pool**; they are treated as rerank score **1** and are **not** sent to the reranker. | |
| 81 | +2. **Full corpus** — Load the tenant’s product corpus from Elasticsearch (same tenant as `TENANT_ID` / `--tenant-id`, default **163**), via `corpus_docs()` (cached in SQLite after the first load). | |
| 82 | +3. **Rerank outside pool** — Every corpus document whose `spu_id` is **not** in the pool is scored by the reranker API, **80 documents per request**. With `--force-refresh-rerank`, all those scores are recomputed and written to the **`rerank_scores`** table in `search_eval.sqlite3`. Without that flag, existing `(tenant_id, query, spu_id)` scores are reused and only missing rows hit the API. | |
| 83 | +4. **Skip “too easy” queries** — If more than **1000** non-pool documents have rerank score **> 0.5**, that query is **skipped** (one log line: tail too relevant / easy to satisfy). No LLM calls for that query. | |
| 84 | +5. **Global sort** — Order to label: pool in **search rank order**, then all remaining corpus docs in **descending rerank score** (dedupe by `spu_id`, pool wins). | |
| 85 | +6. **LLM labeling** — Walk that list **from the head** in batches of **50** by default (`--rebuild-llm-batch-size`). Each batch log includes **exact_ratio**, **irrelevant_ratio**, **low_ratio**, and **irrelevant_plus_low_ratio**. | |
| 86 | + | |
| 87 | + **Early stop** (defaults in `eval_framework.constants`; overridable via CLI): | |
| 88 | + | |
| 89 | + - Run **at least** `--rebuild-min-batches` batches (**10** by default) before any early stop is allowed. | |
| 90 | + - After that, define a **bad batch** as one where the batch has **no** **Exact Match** label **and** either: | |
| 91 | + - **Irrelevant** proportion **≥ 0.94** (`--rebuild-irrelevant-stop-ratio`), or | |
| 92 | + - **(Irrelevant + Low Relevant)** proportion **≥ 0.96** (`--rebuild-irrel-low-combined-stop-ratio`). | |
| 93 | + (“Low Relevant” is the weak tier; **High Relevant** does not count toward this combined ratio.) | |
| 94 | + - Count **consecutive** bad batches. **Reset** the count to 0 on any batch that is not bad. | |
| 95 | + - **Stop** when the consecutive bad count reaches **`--rebuild-irrelevant-stop-streak`** (**2** by default), or when **`--rebuild-max-batches`** (**40**) is reached—whichever comes first (up to **2000** docs per query at default batch size). | |
| 96 | + | |
| 97 | + So labeling follows best-first order but **stops early** when the model sees two consecutive “dead” batches; the tail may never be judged. | |
| 98 | + | |
| 99 | +**Incremental pool (no full rebuild):** `build_annotation_set.py build` **without** `--force-refresh-labels` uses the older windowed pool (`--annotate-search-top-k`, `--annotate-rerank-top-k`) and fills missing labels in one pass—no rerank-skip rule and no LLM early-stop loop. | |
| 100 | + | |
| 101 | +**Tuning the rebuild path:** `--search-recall-top-k`, `--rerank-high-threshold`, `--rerank-high-skip-count`, `--rebuild-llm-batch-size`, `--rebuild-min-batches`, `--rebuild-max-batches`, `--rebuild-irrelevant-stop-ratio`, `--rebuild-irrel-low-combined-stop-ratio`, `--rebuild-irrelevant-stop-streak` on `build` (see `eval_framework/cli.py`). Rerank API chunk size is **80** docs per request in code (`full_corpus_rerank_outside_exclude`). | |
| 75 | 102 | |
| 76 | 103 | ## Artifacts |
| 77 | 104 | |
| ... | ... | @@ -95,7 +122,7 @@ Default root: `artifacts/search_evaluation/` |
| 95 | 122 | |
| 96 | 123 | **Standard:** Run `batch` without `--force-refresh-labels` to extend coverage, then use the UI or batch in cached mode. Single-query evaluation defaults to **no** auto-annotation: recall still hits the live API; scoring uses SQLite only, and unlabeled hits count as `Irrelevant`. |
| 97 | 124 | |
| 98 | -**Incremental pool (no full rebuild):** `build_annotation_set.py build` without `--force-refresh-labels` merges search and full-corpus rerank windows before labeling (CLI `--search-depth`, `--rerank-depth`, `--annotate-*-top-k`). **Full rebuild** uses the recall-pool + rerank-skip + batched early-stop flow above; tune thresholds via `--search-recall-top-k`, `--rerank-high-threshold`, `--rerank-high-skip-count`, `--rebuild-*` flags on `build`. | |
| 125 | +**Rebuild vs incremental `build`:** Deep rebuild is documented in the **`batch-rebuild`** subsection above. Incremental `build` (without `--force-refresh-labels`) uses `--annotate-search-top-k` / `--annotate-rerank-top-k` windows instead. | |
| 99 | 126 | |
| 100 | 127 | **Fusion tuning:** `tune_fusion.py` writes experiment configs, restarts the backend, runs batch evaluation, and optionally applies the best variant (see `--experiments-file`, `--score-metric`, `--apply-best`). |
| 101 | 128 | ... | ... |
scripts/evaluation/eval_framework/__init__.py
| ... | ... | @@ -12,15 +12,15 @@ ensure_project_on_path() |
| 12 | 12 | |
| 13 | 13 | from .constants import ( # noqa: E402 |
| 14 | 14 | DEFAULT_ARTIFACT_ROOT, |
| 15 | - DEFAULT_LABELER_MODE, | |
| 16 | 15 | DEFAULT_QUERY_FILE, |
| 17 | - JUDGE_PROMPT_VERSION_COMPLEX, | |
| 18 | - JUDGE_PROMPT_VERSION_SIMPLE, | |
| 19 | 16 | PROJECT_ROOT, |
| 20 | 17 | RELEVANCE_EXACT, |
| 18 | + RELEVANCE_HIGH, | |
| 21 | 19 | RELEVANCE_IRRELEVANT, |
| 22 | - RELEVANCE_PARTIAL, | |
| 20 | + RELEVANCE_LOW, | |
| 21 | + RELEVANCE_NON_IRRELEVANT, | |
| 23 | 22 | VALID_LABELS, |
| 23 | + normalize_stored_label, | |
| 24 | 24 | ) |
| 25 | 25 | from .framework import SearchEvaluationFramework # noqa: E402 |
| 26 | 26 | from .store import EvalStore, QueryBuildResult # noqa: E402 |
| ... | ... | @@ -36,22 +36,22 @@ from .utils import ( # noqa: E402 |
| 36 | 36 | |
| 37 | 37 | __all__ = [ |
| 38 | 38 | "DEFAULT_ARTIFACT_ROOT", |
| 39 | - "DEFAULT_LABELER_MODE", | |
| 40 | 39 | "DEFAULT_QUERY_FILE", |
| 41 | 40 | "EvalStore", |
| 42 | - "JUDGE_PROMPT_VERSION_COMPLEX", | |
| 43 | - "JUDGE_PROMPT_VERSION_SIMPLE", | |
| 44 | 41 | "PROJECT_ROOT", |
| 45 | 42 | "QueryBuildResult", |
| 46 | 43 | "RELEVANCE_EXACT", |
| 44 | + "RELEVANCE_HIGH", | |
| 47 | 45 | "RELEVANCE_IRRELEVANT", |
| 48 | - "RELEVANCE_PARTIAL", | |
| 46 | + "RELEVANCE_LOW", | |
| 47 | + "RELEVANCE_NON_IRRELEVANT", | |
| 49 | 48 | "SearchEvaluationFramework", |
| 50 | 49 | "VALID_LABELS", |
| 51 | 50 | "build_cli_parser", |
| 52 | 51 | "create_web_app", |
| 53 | 52 | "ensure_dir", |
| 54 | 53 | "main", |
| 54 | + "normalize_stored_label", | |
| 55 | 55 | "render_batch_report_markdown", |
| 56 | 56 | "sha1_text", |
| 57 | 57 | "utc_now_iso", | ... | ... |
scripts/evaluation/eval_framework/cli.py
| ... | ... | @@ -5,10 +5,11 @@ from __future__ import annotations |
| 5 | 5 | import argparse |
| 6 | 6 | import json |
| 7 | 7 | from pathlib import Path |
| 8 | +from typing import Any, Dict | |
| 8 | 9 | |
| 9 | 10 | from .constants import ( |
| 10 | - DEFAULT_LABELER_MODE, | |
| 11 | 11 | DEFAULT_QUERY_FILE, |
| 12 | + DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO, | |
| 12 | 13 | DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO, |
| 13 | 14 | DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK, |
| 14 | 15 | DEFAULT_REBUILD_LLM_BATCH_SIZE, |
| ... | ... | @@ -23,6 +24,38 @@ from .utils import ensure_dir, utc_now_iso, utc_timestamp |
| 23 | 24 | from .web_app import create_web_app |
| 24 | 25 | |
| 25 | 26 | |
| 27 | +def add_judge_llm_args(p: argparse.ArgumentParser) -> None: | |
| 28 | + p.add_argument( | |
| 29 | + "--judge-model", | |
| 30 | + default=None, | |
| 31 | + metavar="MODEL", | |
| 32 | + help="Judge LLM model (default: eval_framework.constants.DEFAULT_JUDGE_MODEL).", | |
| 33 | + ) | |
| 34 | + p.add_argument( | |
| 35 | + "--enable-thinking", | |
| 36 | + action=argparse.BooleanOptionalAction, | |
| 37 | + default=None, | |
| 38 | + help="enable_thinking for DashScope (default: DEFAULT_JUDGE_ENABLE_THINKING).", | |
| 39 | + ) | |
| 40 | + p.add_argument( | |
| 41 | + "--dashscope-batch", | |
| 42 | + action=argparse.BooleanOptionalAction, | |
| 43 | + default=None, | |
| 44 | + help="DashScope Batch File API vs sync chat (default: DEFAULT_JUDGE_DASHSCOPE_BATCH).", | |
| 45 | + ) | |
| 46 | + | |
| 47 | + | |
| 48 | +def framework_kwargs_from_args(args: argparse.Namespace) -> Dict[str, Any]: | |
| 49 | + kw: Dict[str, Any] = {} | |
| 50 | + if args.judge_model is not None: | |
| 51 | + kw["judge_model"] = args.judge_model | |
| 52 | + if args.enable_thinking is not None: | |
| 53 | + kw["enable_thinking"] = args.enable_thinking | |
| 54 | + if args.dashscope_batch is not None: | |
| 55 | + kw["use_dashscope_batch"] = args.dashscope_batch | |
| 56 | + return kw | |
| 57 | + | |
| 58 | + | |
| 26 | 59 | def build_cli_parser() -> argparse.ArgumentParser: |
| 27 | 60 | parser = argparse.ArgumentParser(description="Search evaluation annotation builder and web UI") |
| 28 | 61 | sub = parser.add_subparsers(dest="command", required=True) |
| ... | ... | @@ -38,7 +71,7 @@ def build_cli_parser() -> argparse.ArgumentParser: |
| 38 | 71 | "--search-recall-top-k", |
| 39 | 72 | type=int, |
| 40 | 73 | default=None, |
| 41 | - help="Rebuild mode only: top-K search hits enter recall pool with score 1 (default when --force-refresh-labels: 500).", | |
| 74 | + help="Rebuild mode only: top-K search hits enter recall pool with score 1 (default when --force-refresh-labels: 200).", | |
| 42 | 75 | ) |
| 43 | 76 | build.add_argument( |
| 44 | 77 | "--rerank-high-threshold", |
| ... | ... | @@ -53,24 +86,30 @@ def build_cli_parser() -> argparse.ArgumentParser: |
| 53 | 86 | help="Rebuild only: skip query if more than this many non-pool docs have rerank score > threshold (default 1000).", |
| 54 | 87 | ) |
| 55 | 88 | build.add_argument("--rebuild-llm-batch-size", type=int, default=None, help="Rebuild only: LLM batch size (default 50).") |
| 56 | - build.add_argument("--rebuild-min-batches", type=int, default=None, help="Rebuild only: min LLM batches before early stop (default 15).") | |
| 89 | + build.add_argument("--rebuild-min-batches", type=int, default=None, help="Rebuild only: min LLM batches before early stop (default 10).") | |
| 57 | 90 | build.add_argument("--rebuild-max-batches", type=int, default=None, help="Rebuild only: max LLM batches (default 40).") |
| 58 | 91 | build.add_argument( |
| 59 | 92 | "--rebuild-irrelevant-stop-ratio", |
| 60 | 93 | type=float, |
| 61 | 94 | default=None, |
| 62 | - help="Rebuild only: irrelevant ratio above this counts toward early-stop streak (default 0.92).", | |
| 95 | + help="Rebuild only: irrelevant-only branch threshold (>=) for early-stop streak, requires no Exact (default 0.94).", | |
| 96 | + ) | |
| 97 | + build.add_argument( | |
| 98 | + "--rebuild-irrel-low-combined-stop-ratio", | |
| 99 | + type=float, | |
| 100 | + default=None, | |
| 101 | + help="Rebuild only: (irrelevant+low)/n threshold (>=) for early-stop streak, requires no Exact (default 0.96).", | |
| 63 | 102 | ) |
| 64 | 103 | build.add_argument( |
| 65 | 104 | "--rebuild-irrelevant-stop-streak", |
| 66 | 105 | type=int, |
| 67 | 106 | default=None, |
| 68 | - help="Rebuild only: stop after this many consecutive batches above irrelevant ratio (default 3).", | |
| 107 | + help="Rebuild only: consecutive bad batches before early stop (default 2).", | |
| 69 | 108 | ) |
| 70 | 109 | build.add_argument("--language", default="en") |
| 71 | 110 | build.add_argument("--force-refresh-rerank", action="store_true") |
| 72 | 111 | build.add_argument("--force-refresh-labels", action="store_true") |
| 73 | - build.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"]) | |
| 112 | + add_judge_llm_args(build) | |
| 74 | 113 | |
| 75 | 114 | batch = sub.add_parser("batch", help="Run batch evaluation against live search") |
| 76 | 115 | batch.add_argument("--tenant-id", default="163") |
| ... | ... | @@ -78,7 +117,7 @@ def build_cli_parser() -> argparse.ArgumentParser: |
| 78 | 117 | batch.add_argument("--top-k", type=int, default=100) |
| 79 | 118 | batch.add_argument("--language", default="en") |
| 80 | 119 | batch.add_argument("--force-refresh-labels", action="store_true") |
| 81 | - batch.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"]) | |
| 120 | + add_judge_llm_args(batch) | |
| 82 | 121 | |
| 83 | 122 | audit = sub.add_parser("audit", help="Audit annotation quality for queries") |
| 84 | 123 | audit.add_argument("--tenant-id", default="163") |
| ... | ... | @@ -87,20 +126,20 @@ def build_cli_parser() -> argparse.ArgumentParser: |
| 87 | 126 | audit.add_argument("--language", default="en") |
| 88 | 127 | audit.add_argument("--limit-suspicious", type=int, default=5) |
| 89 | 128 | audit.add_argument("--force-refresh-labels", action="store_true") |
| 90 | - audit.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"]) | |
| 129 | + add_judge_llm_args(audit) | |
| 91 | 130 | |
| 92 | 131 | serve = sub.add_parser("serve", help="Serve evaluation web UI on port 6010") |
| 93 | 132 | serve.add_argument("--tenant-id", default="163") |
| 94 | 133 | serve.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE)) |
| 95 | 134 | serve.add_argument("--host", default="0.0.0.0") |
| 96 | 135 | serve.add_argument("--port", type=int, default=6010) |
| 97 | - serve.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"]) | |
| 136 | + add_judge_llm_args(serve) | |
| 98 | 137 | |
| 99 | 138 | return parser |
| 100 | 139 | |
| 101 | 140 | |
| 102 | 141 | def run_build(args: argparse.Namespace) -> None: |
| 103 | - framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode) | |
| 142 | + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args)) | |
| 104 | 143 | queries = framework.queries_from_file(Path(args.queries_file)) |
| 105 | 144 | summary = [] |
| 106 | 145 | rebuild_kwargs = {} |
| ... | ... | @@ -115,6 +154,9 @@ def run_build(args: argparse.Namespace) -> None: |
| 115 | 154 | "rebuild_irrelevant_stop_ratio": args.rebuild_irrelevant_stop_ratio |
| 116 | 155 | if args.rebuild_irrelevant_stop_ratio is not None |
| 117 | 156 | else DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO, |
| 157 | + "rebuild_irrel_low_combined_stop_ratio": args.rebuild_irrel_low_combined_stop_ratio | |
| 158 | + if args.rebuild_irrel_low_combined_stop_ratio is not None | |
| 159 | + else DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO, | |
| 118 | 160 | "rebuild_irrelevant_stop_streak": args.rebuild_irrelevant_stop_streak |
| 119 | 161 | if args.rebuild_irrelevant_stop_streak is not None |
| 120 | 162 | else DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK, |
| ... | ... | @@ -152,7 +194,7 @@ def run_build(args: argparse.Namespace) -> None: |
| 152 | 194 | |
| 153 | 195 | |
| 154 | 196 | def run_batch(args: argparse.Namespace) -> None: |
| 155 | - framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode) | |
| 197 | + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args)) | |
| 156 | 198 | queries = framework.queries_from_file(Path(args.queries_file)) |
| 157 | 199 | payload = framework.batch_evaluate( |
| 158 | 200 | queries=queries, |
| ... | ... | @@ -165,7 +207,7 @@ def run_batch(args: argparse.Namespace) -> None: |
| 165 | 207 | |
| 166 | 208 | |
| 167 | 209 | def run_audit(args: argparse.Namespace) -> None: |
| 168 | - framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode) | |
| 210 | + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args)) | |
| 169 | 211 | queries = framework.queries_from_file(Path(args.queries_file)) |
| 170 | 212 | audit_items = [] |
| 171 | 213 | for query in queries: |
| ... | ... | @@ -215,7 +257,7 @@ def run_audit(args: argparse.Namespace) -> None: |
| 215 | 257 | |
| 216 | 258 | |
| 217 | 259 | def run_serve(args: argparse.Namespace) -> None: |
| 218 | - framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode) | |
| 260 | + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args)) | |
| 219 | 261 | app = create_web_app(framework, Path(args.queries_file)) |
| 220 | 262 | import uvicorn |
| 221 | 263 | ... | ... |
scripts/evaluation/eval_framework/clients.py
| ... | ... | @@ -2,30 +2,49 @@ |
| 2 | 2 | |
| 3 | 3 | from __future__ import annotations |
| 4 | 4 | |
| 5 | +import io | |
| 6 | +import json | |
| 7 | +import time | |
| 8 | +import uuid | |
| 5 | 9 | from typing import Any, Dict, List, Optional, Sequence, Tuple |
| 6 | 10 | |
| 7 | 11 | import requests |
| 8 | 12 | |
| 9 | 13 | from .constants import VALID_LABELS |
| 10 | -from .prompts import ( | |
| 11 | - classify_batch_complex_prompt, | |
| 12 | - classify_batch_simple_prompt, | |
| 13 | - extract_query_profile_prompt, | |
| 14 | -) | |
| 14 | +from .prompts import classify_prompt | |
| 15 | 15 | from .utils import build_label_doc_line, extract_json_blob, safe_json_dumps |
| 16 | 16 | |
| 17 | 17 | |
| 18 | +def _canonicalize_judge_label(raw: str) -> str | None: | |
| 19 | + s = str(raw or "").strip().strip('"').strip("'") | |
| 20 | + if s in VALID_LABELS: | |
| 21 | + return s | |
| 22 | + low = s.lower() | |
| 23 | + for v in VALID_LABELS: | |
| 24 | + if v.lower() == low: | |
| 25 | + return v | |
| 26 | + return None | |
| 27 | + | |
| 28 | + | |
| 18 | 29 | class SearchServiceClient: |
| 19 | 30 | def __init__(self, base_url: str, tenant_id: str): |
| 20 | 31 | self.base_url = base_url.rstrip("/") |
| 21 | 32 | self.tenant_id = str(tenant_id) |
| 22 | 33 | self.session = requests.Session() |
| 23 | 34 | |
| 24 | - def search(self, query: str, size: int, from_: int = 0, language: str = "en") -> Dict[str, Any]: | |
| 35 | + def search(self, query: str, size: int, from_: int = 0, language: str = "en", *, debug: bool = False) -> Dict[str, Any]: | |
| 36 | + payload: Dict[str, Any] = { | |
| 37 | + "query": query, | |
| 38 | + "size": size, | |
| 39 | + "from": from_, | |
| 40 | + "language": language, | |
| 41 | + } | |
| 42 | + if debug: | |
| 43 | + payload["debug"] = True | |
| 25 | 44 | response = self.session.post( |
| 26 | 45 | f"{self.base_url}/search/", |
| 27 | 46 | headers={"Content-Type": "application/json", "X-Tenant-ID": self.tenant_id}, |
| 28 | - json={"query": query, "size": size, "from": from_, "language": language}, | |
| 47 | + json=payload, | |
| 29 | 48 | timeout=120, |
| 30 | 49 | ) |
| 31 | 50 | response.raise_for_status() |
| ... | ... | @@ -52,26 +71,55 @@ class RerankServiceClient: |
| 52 | 71 | |
| 53 | 72 | |
| 54 | 73 | class DashScopeLabelClient: |
| 55 | - def __init__(self, model: str, base_url: str, api_key: str, batch_size: int = 40): | |
| 74 | + """DashScope OpenAI-compatible chat: synchronous or Batch File API (JSONL job). | |
| 75 | + | |
| 76 | + Batch flow: https://help.aliyun.com/zh/model-studio/batch-interfaces-compatible-with-openai/ | |
| 77 | + | |
| 78 | + Some regional endpoints (e.g. ``dashscope-us`` compatible-mode) do not implement ``/batches``; | |
| 79 | + on HTTP 404 from batch calls we fall back to synchronous ``/chat/completions`` and stop using batch | |
| 80 | + for subsequent requests on this client. | |
| 81 | + """ | |
| 82 | + | |
| 83 | + def __init__( | |
| 84 | + self, | |
| 85 | + model: str, | |
| 86 | + base_url: str, | |
| 87 | + api_key: str, | |
| 88 | + batch_size: int = 40, | |
| 89 | + *, | |
| 90 | + batch_completion_window: str = "24h", | |
| 91 | + batch_poll_interval_sec: float = 10.0, | |
| 92 | + enable_thinking: bool = True, | |
| 93 | + use_batch: bool = False, | |
| 94 | + ): | |
| 56 | 95 | self.model = model |
| 57 | 96 | self.base_url = base_url.rstrip("/") |
| 58 | 97 | self.api_key = api_key |
| 59 | 98 | self.batch_size = int(batch_size) |
| 99 | + self.batch_completion_window = str(batch_completion_window) | |
| 100 | + self.batch_poll_interval_sec = float(batch_poll_interval_sec) | |
| 101 | + self.enable_thinking = bool(enable_thinking) | |
| 102 | + self.use_batch = bool(use_batch) | |
| 60 | 103 | self.session = requests.Session() |
| 61 | 104 | |
| 62 | - def _chat(self, prompt: str) -> Tuple[str, str]: | |
| 105 | + def _auth_headers(self) -> Dict[str, str]: | |
| 106 | + return {"Authorization": f"Bearer {self.api_key}"} | |
| 107 | + | |
| 108 | + def _completion_body(self, prompt: str) -> Dict[str, Any]: | |
| 109 | + body: Dict[str, Any] = { | |
| 110 | + "model": self.model, | |
| 111 | + "messages": [{"role": "user", "content": prompt}], | |
| 112 | + "temperature": 0, | |
| 113 | + "top_p": 0.1, | |
| 114 | + "enable_thinking": self.enable_thinking, | |
| 115 | + } | |
| 116 | + return body | |
| 117 | + | |
| 118 | + def _chat_sync(self, prompt: str) -> Tuple[str, str]: | |
| 63 | 119 | response = self.session.post( |
| 64 | 120 | f"{self.base_url}/chat/completions", |
| 65 | - headers={ | |
| 66 | - "Authorization": f"Bearer {self.api_key}", | |
| 67 | - "Content-Type": "application/json", | |
| 68 | - }, | |
| 69 | - json={ | |
| 70 | - "model": self.model, | |
| 71 | - "messages": [{"role": "user", "content": prompt}], | |
| 72 | - "temperature": 0, | |
| 73 | - "top_p": 0.1, | |
| 74 | - }, | |
| 121 | + headers={**self._auth_headers(), "Content-Type": "application/json"}, | |
| 122 | + json=self._completion_body(prompt), | |
| 75 | 123 | timeout=180, |
| 76 | 124 | ) |
| 77 | 125 | response.raise_for_status() |
| ... | ... | @@ -79,71 +127,146 @@ class DashScopeLabelClient: |
| 79 | 127 | content = str(((data.get("choices") or [{}])[0].get("message") or {}).get("content") or "").strip() |
| 80 | 128 | return content, safe_json_dumps(data) |
| 81 | 129 | |
| 82 | - def classify_batch_simple( | |
| 130 | + def _chat_batch(self, prompt: str) -> Tuple[str, str]: | |
| 131 | + """One chat completion via Batch File API (single-line JSONL job).""" | |
| 132 | + custom_id = uuid.uuid4().hex | |
| 133 | + body = self._completion_body(prompt) | |
| 134 | + line_obj = { | |
| 135 | + "custom_id": custom_id, | |
| 136 | + "method": "POST", | |
| 137 | + "url": "/v1/chat/completions", | |
| 138 | + "body": body, | |
| 139 | + } | |
| 140 | + jsonl = json.dumps(line_obj, ensure_ascii=False, separators=(",", ":")) + "\n" | |
| 141 | + auth = self._auth_headers() | |
| 142 | + | |
| 143 | + up = self.session.post( | |
| 144 | + f"{self.base_url}/files", | |
| 145 | + headers=auth, | |
| 146 | + files={ | |
| 147 | + "file": ( | |
| 148 | + "eval_batch_input.jsonl", | |
| 149 | + io.BytesIO(jsonl.encode("utf-8")), | |
| 150 | + "application/octet-stream", | |
| 151 | + ) | |
| 152 | + }, | |
| 153 | + data={"purpose": "batch"}, | |
| 154 | + timeout=300, | |
| 155 | + ) | |
| 156 | + up.raise_for_status() | |
| 157 | + file_id = (up.json() or {}).get("id") | |
| 158 | + if not file_id: | |
| 159 | + raise RuntimeError(f"DashScope file upload returned no id: {up.text!r}") | |
| 160 | + | |
| 161 | + cr = self.session.post( | |
| 162 | + f"{self.base_url}/batches", | |
| 163 | + headers={**auth, "Content-Type": "application/json"}, | |
| 164 | + json={ | |
| 165 | + "input_file_id": file_id, | |
| 166 | + "endpoint": "/v1/chat/completions", | |
| 167 | + "completion_window": self.batch_completion_window, | |
| 168 | + }, | |
| 169 | + timeout=120, | |
| 170 | + ) | |
| 171 | + cr.raise_for_status() | |
| 172 | + batch_payload = cr.json() or {} | |
| 173 | + batch_id = batch_payload.get("id") | |
| 174 | + if not batch_id: | |
| 175 | + raise RuntimeError(f"DashScope batches.create returned no id: {cr.text!r}") | |
| 176 | + | |
| 177 | + terminal = frozenset({"completed", "failed", "expired", "cancelled"}) | |
| 178 | + batch: Dict[str, Any] = dict(batch_payload) | |
| 179 | + status = str(batch.get("status") or "") | |
| 180 | + while status not in terminal: | |
| 181 | + time.sleep(self.batch_poll_interval_sec) | |
| 182 | + br = self.session.get(f"{self.base_url}/batches/{batch_id}", headers=auth, timeout=120) | |
| 183 | + br.raise_for_status() | |
| 184 | + batch = br.json() or {} | |
| 185 | + status = str(batch.get("status") or "") | |
| 186 | + | |
| 187 | + if status != "completed": | |
| 188 | + raise RuntimeError( | |
| 189 | + f"DashScope batch {batch_id} ended with status={status!r} errors={batch.get('errors')!r}" | |
| 190 | + ) | |
| 191 | + | |
| 192 | + out_id = batch.get("output_file_id") | |
| 193 | + err_id = batch.get("error_file_id") | |
| 194 | + | |
| 195 | + row = self._find_batch_line_for_custom_id(out_id, custom_id, auth) | |
| 196 | + if row is None: | |
| 197 | + err_row = self._find_batch_line_for_custom_id(err_id, custom_id, auth) | |
| 198 | + if err_row is not None: | |
| 199 | + raise RuntimeError(f"DashScope batch request failed: {err_row!r}") | |
| 200 | + raise RuntimeError(f"DashScope batch output missing custom_id={custom_id!r}") | |
| 201 | + | |
| 202 | + resp = row.get("response") or {} | |
| 203 | + sc = resp.get("status_code") | |
| 204 | + if sc is not None and int(sc) != 200: | |
| 205 | + raise RuntimeError(f"DashScope batch line error: {row!r}") | |
| 206 | + | |
| 207 | + data = resp.get("body") or {} | |
| 208 | + content = str(((data.get("choices") or [{}])[0].get("message") or {}).get("content") or "").strip() | |
| 209 | + return content, safe_json_dumps(row) | |
| 210 | + | |
| 211 | + def _chat(self, prompt: str) -> Tuple[str, str]: | |
| 212 | + if not self.use_batch: | |
| 213 | + return self._chat_sync(prompt) | |
| 214 | + try: | |
| 215 | + return self._chat_batch(prompt) | |
| 216 | + except requests.exceptions.HTTPError as e: | |
| 217 | + resp = getattr(e, "response", None) | |
| 218 | + if resp is not None and resp.status_code == 404: | |
| 219 | + self.use_batch = False | |
| 220 | + return self._chat_sync(prompt) | |
| 221 | + raise | |
| 222 | + | |
| 223 | + def _find_batch_line_for_custom_id( | |
| 224 | + self, | |
| 225 | + file_id: Optional[str], | |
| 226 | + custom_id: str, | |
| 227 | + auth: Dict[str, str], | |
| 228 | + ) -> Optional[Dict[str, Any]]: | |
| 229 | + if not file_id or str(file_id) in ("null", ""): | |
| 230 | + return None | |
| 231 | + r = self.session.get(f"{self.base_url}/files/{file_id}/content", headers=auth, timeout=300) | |
| 232 | + r.raise_for_status() | |
| 233 | + for raw in r.text.splitlines(): | |
| 234 | + raw = raw.strip() | |
| 235 | + if not raw: | |
| 236 | + continue | |
| 237 | + try: | |
| 238 | + obj = json.loads(raw) | |
| 239 | + except json.JSONDecodeError: | |
| 240 | + continue | |
| 241 | + if str(obj.get("custom_id")) == custom_id: | |
| 242 | + return obj | |
| 243 | + return None | |
| 244 | + | |
| 245 | + def classify_batch( | |
| 83 | 246 | self, |
| 84 | 247 | query: str, |
| 85 | 248 | docs: Sequence[Dict[str, Any]], |
| 86 | 249 | ) -> Tuple[List[str], str]: |
| 87 | 250 | numbered_docs = [build_label_doc_line(idx + 1, doc) for idx, doc in enumerate(docs)] |
| 88 | - prompt = classify_batch_simple_prompt(query, numbered_docs) | |
| 251 | + prompt = classify_prompt(query, numbered_docs) | |
| 89 | 252 | content, raw_response = self._chat(prompt) |
| 90 | - labels = [] | |
| 253 | + labels: List[str] = [] | |
| 91 | 254 | for line in str(content or "").splitlines(): |
| 92 | - label = line.strip() | |
| 93 | - if label in VALID_LABELS: | |
| 94 | - labels.append(label) | |
| 255 | + canon = _canonicalize_judge_label(line) | |
| 256 | + if canon is not None: | |
| 257 | + labels.append(canon) | |
| 95 | 258 | if len(labels) != len(docs): |
| 96 | 259 | payload = extract_json_blob(content) |
| 97 | 260 | if isinstance(payload, dict) and isinstance(payload.get("labels"), list): |
| 98 | 261 | labels = [] |
| 99 | 262 | for item in payload["labels"][: len(docs)]: |
| 100 | 263 | if isinstance(item, dict): |
| 101 | - label = str(item.get("label") or "").strip() | |
| 264 | + raw_l = str(item.get("label") or "").strip() | |
| 102 | 265 | else: |
| 103 | - label = str(item).strip() | |
| 104 | - if label in VALID_LABELS: | |
| 105 | - labels.append(label) | |
| 106 | - if len(labels) != len(docs) or any(label not in VALID_LABELS for label in labels): | |
| 107 | - raise ValueError(f"unexpected simple label output: {content!r}") | |
| 108 | - return labels, raw_response | |
| 109 | - | |
| 110 | - def extract_query_profile( | |
| 111 | - self, | |
| 112 | - query: str, | |
| 113 | - parser_hints: Dict[str, Any], | |
| 114 | - ) -> Tuple[Dict[str, Any], str]: | |
| 115 | - prompt = extract_query_profile_prompt(query, parser_hints) | |
| 116 | - content, raw_response = self._chat(prompt) | |
| 117 | - payload = extract_json_blob(content) | |
| 118 | - if not isinstance(payload, dict): | |
| 119 | - raise ValueError(f"unexpected query profile payload: {content!r}") | |
| 120 | - payload.setdefault("normalized_query_en", query) | |
| 121 | - payload.setdefault("primary_category", "") | |
| 122 | - payload.setdefault("allowed_categories", []) | |
| 123 | - payload.setdefault("required_attributes", []) | |
| 124 | - payload.setdefault("notes", []) | |
| 125 | - return payload, raw_response | |
| 126 | - | |
| 127 | - def classify_batch_complex( | |
| 128 | - self, | |
| 129 | - query: str, | |
| 130 | - query_profile: Dict[str, Any], | |
| 131 | - docs: Sequence[Dict[str, Any]], | |
| 132 | - ) -> Tuple[List[str], str]: | |
| 133 | - numbered_docs = [build_label_doc_line(idx + 1, doc) for idx, doc in enumerate(docs)] | |
| 134 | - prompt = classify_batch_complex_prompt(query, query_profile, numbered_docs) | |
| 135 | - content, raw_response = self._chat(prompt) | |
| 136 | - payload = extract_json_blob(content) | |
| 137 | - if not isinstance(payload, dict) or not isinstance(payload.get("labels"), list): | |
| 138 | - raise ValueError(f"unexpected label payload: {content!r}") | |
| 139 | - labels_payload = payload["labels"] | |
| 140 | - labels: List[str] = [] | |
| 141 | - for item in labels_payload[: len(docs)]: | |
| 142 | - if not isinstance(item, dict): | |
| 143 | - continue | |
| 144 | - label = str(item.get("label") or "").strip() | |
| 145 | - if label in VALID_LABELS: | |
| 146 | - labels.append(label) | |
| 266 | + raw_l = str(item).strip() | |
| 267 | + canon = _canonicalize_judge_label(raw_l) | |
| 268 | + if canon is not None: | |
| 269 | + labels.append(canon) | |
| 147 | 270 | if len(labels) != len(docs) or any(label not in VALID_LABELS for label in labels): |
| 148 | - raise ValueError(f"unexpected label output: {content!r}") | |
| 271 | + raise ValueError(f"unexpected classify output: {content!r}") | |
| 149 | 272 | return labels, raw_response | ... | ... |
scripts/evaluation/eval_framework/constants.py
| ... | ... | @@ -6,24 +6,60 @@ _PKG_DIR = Path(__file__).resolve().parent |
| 6 | 6 | _SCRIPTS_EVAL_DIR = _PKG_DIR.parent |
| 7 | 7 | PROJECT_ROOT = _SCRIPTS_EVAL_DIR.parents[1] |
| 8 | 8 | |
| 9 | -RELEVANCE_EXACT = "Exact" | |
| 10 | -RELEVANCE_PARTIAL = "Partial" | |
| 9 | +# Canonical English labels (must match LLM prompt output in prompts._CLASSIFY_TEMPLATE_EN) | |
| 10 | +RELEVANCE_EXACT = "Exact Match" | |
| 11 | +RELEVANCE_HIGH = "High Relevant" | |
| 12 | +RELEVANCE_LOW = "Low Relevant" | |
| 11 | 13 | RELEVANCE_IRRELEVANT = "Irrelevant" |
| 12 | -VALID_LABELS = {RELEVANCE_EXACT, RELEVANCE_PARTIAL, RELEVANCE_IRRELEVANT} | |
| 14 | + | |
| 15 | +VALID_LABELS = frozenset({RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_LOW, RELEVANCE_IRRELEVANT}) | |
| 16 | + | |
| 17 | +# Precision / MAP "positive" set (all non-irrelevant tiers) | |
| 18 | +RELEVANCE_NON_IRRELEVANT = frozenset({RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_LOW}) | |
| 19 | + | |
| 20 | +_LEGACY_LABEL_MAP = { | |
| 21 | + "Exact": RELEVANCE_EXACT, | |
| 22 | + "Partial": RELEVANCE_HIGH, | |
| 23 | +} | |
| 24 | + | |
| 25 | + | |
| 26 | +def normalize_stored_label(label: str) -> str: | |
| 27 | + """Map legacy 3-way SQLite labels to current 4-way strings; pass through canonical labels.""" | |
| 28 | + s = str(label).strip() | |
| 29 | + if s in VALID_LABELS: | |
| 30 | + return s | |
| 31 | + return _LEGACY_LABEL_MAP.get(s, s) | |
| 32 | + | |
| 13 | 33 | |
| 14 | 34 | DEFAULT_ARTIFACT_ROOT = PROJECT_ROOT / "artifacts" / "search_evaluation" |
| 15 | 35 | DEFAULT_QUERY_FILE = _SCRIPTS_EVAL_DIR / "queries" / "queries.txt" |
| 16 | 36 | |
| 17 | -JUDGE_PROMPT_VERSION_SIMPLE = "v3_simple_20260331" | |
| 18 | -JUDGE_PROMPT_VERSION_COMPLEX = "v2_structured_20260331" | |
| 19 | -DEFAULT_LABELER_MODE = "simple" | |
| 37 | +# Judge LLM (eval_framework only; override via CLI --judge-model / constructor kwargs) | |
| 38 | +DEFAULT_JUDGE_MODEL = "qwen3.5-flash" | |
| 39 | +DEFAULT_JUDGE_ENABLE_THINKING = True | |
| 40 | +DEFAULT_JUDGE_DASHSCOPE_BATCH = False | |
| 41 | +DEFAULT_JUDGE_BATCH_COMPLETION_WINDOW = "24h" | |
| 42 | +DEFAULT_JUDGE_BATCH_POLL_INTERVAL_SEC = 10.0 | |
| 20 | 43 | |
| 21 | -# Rebuild annotation pool (build --force-refresh-labels): search recall + full-corpus rerank + LLM batches | |
| 22 | -DEFAULT_SEARCH_RECALL_TOP_K = 500 | |
| 44 | +# --- Rebuild annotation pool (``build --force-refresh-labels``) --- | |
| 45 | +# Flow: search recall pool (rerank_score=1, no rerank API) + rerank rest of corpus + | |
| 46 | +# LLM labels in fixed-size batches along global order (see ``framework._annotate_rebuild_batches``). | |
| 47 | +DEFAULT_SEARCH_RECALL_TOP_K = 200 | |
| 23 | 48 | DEFAULT_RERANK_HIGH_THRESHOLD = 0.5 |
| 24 | 49 | DEFAULT_RERANK_HIGH_SKIP_COUNT = 1000 |
| 25 | 50 | DEFAULT_REBUILD_LLM_BATCH_SIZE = 50 |
| 26 | -DEFAULT_REBUILD_MIN_LLM_BATCHES = 15 | |
| 51 | +# At least this many LLM batches must complete before early-stop is considered. | 
| 52 | +DEFAULT_REBUILD_MIN_LLM_BATCHES = 10 | |
| 53 | +# Hard cap on LLM batches per query (each batch labels up to ``DEFAULT_REBUILD_LLM_BATCH_SIZE`` docs). | |
| 27 | 54 | DEFAULT_REBUILD_MAX_LLM_BATCHES = 40 |
| 28 | -DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO = 0.92 | |
| 29 | -DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK = 3 | |
| 55 | + | |
| 56 | +# LLM early-stop (only after ``DEFAULT_REBUILD_MIN_LLM_BATCHES`` batches have completed): | 
| 57 | +# A batch is "bad" when it has **no** ``Exact Match`` label AND either: | |
| 58 | +# - irrelevant_ratio >= DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO, or | |
| 59 | +# - (Irrelevant + Low Relevant) / n >= DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO. | |
| 60 | +# ``irrelevant_ratio`` = Irrelevant count / n; weak relevance is ``RELEVANCE_LOW`` ("Low Relevant"). | |
| 61 | +# If a batch is bad, increment a streak; otherwise reset streak to 0. Stop when streak reaches | |
| 62 | +# ``DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK`` (consecutive bad batches). | |
| 63 | +DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO = 0.94 | |
| 64 | +DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO = 0.96 | |
| 65 | +DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK = 2 | ... | ... |
scripts/evaluation/eval_framework/framework.py
| ... | ... | @@ -10,13 +10,18 @@ from typing import Any, Dict, List, Sequence, Tuple |
| 10 | 10 | import requests |
| 11 | 11 | from elasticsearch.helpers import scan |
| 12 | 12 | |
| 13 | -from api.app import get_app_config, get_es_client, get_query_parser, init_service | |
| 13 | +from api.app import get_app_config, get_es_client, init_service | |
| 14 | 14 | from indexer.mapping_generator import get_tenant_index_name |
| 15 | 15 | |
| 16 | 16 | from .clients import DashScopeLabelClient, RerankServiceClient, SearchServiceClient |
| 17 | 17 | from .constants import ( |
| 18 | 18 | DEFAULT_ARTIFACT_ROOT, |
| 19 | - DEFAULT_LABELER_MODE, | |
| 19 | + DEFAULT_JUDGE_BATCH_COMPLETION_WINDOW, | |
| 20 | + DEFAULT_JUDGE_BATCH_POLL_INTERVAL_SEC, | |
| 21 | + DEFAULT_JUDGE_DASHSCOPE_BATCH, | |
| 22 | + DEFAULT_JUDGE_ENABLE_THINKING, | |
| 23 | + DEFAULT_JUDGE_MODEL, | |
| 24 | + DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO, | |
| 20 | 25 | DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO, |
| 21 | 26 | DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK, |
| 22 | 27 | DEFAULT_REBUILD_LLM_BATCH_SIZE, |
| ... | ... | @@ -25,10 +30,11 @@ from .constants import ( |
| 25 | 30 | DEFAULT_RERANK_HIGH_SKIP_COUNT, |
| 26 | 31 | DEFAULT_RERANK_HIGH_THRESHOLD, |
| 27 | 32 | DEFAULT_SEARCH_RECALL_TOP_K, |
| 28 | - JUDGE_PROMPT_VERSION_COMPLEX, | |
| 29 | 33 | RELEVANCE_EXACT, |
| 34 | + RELEVANCE_HIGH, | |
| 30 | 35 | RELEVANCE_IRRELEVANT, |
| 31 | - RELEVANCE_PARTIAL, | |
| 36 | + RELEVANCE_LOW, | |
| 37 | + RELEVANCE_NON_IRRELEVANT, | |
| 32 | 38 | VALID_LABELS, |
| 33 | 39 | ) |
| 34 | 40 | from .metrics import aggregate_metrics, compute_query_metrics, label_distribution |
| ... | ... | @@ -40,26 +46,44 @@ from .utils import ( |
| 40 | 46 | compact_option_values, |
| 41 | 47 | compact_product_payload, |
| 42 | 48 | ensure_dir, |
| 43 | - normalize_text, | |
| 44 | - pick_text, | |
| 45 | 49 | sha1_text, |
| 46 | 50 | utc_now_iso, |
| 47 | 51 | utc_timestamp, |
| 52 | + zh_title_from_multilingual, | |
| 48 | 53 | ) |
| 49 | 54 | |
| 50 | 55 | |
| 56 | +def _zh_titles_from_debug_per_result(debug_info: Any) -> Dict[str, str]: | |
| 57 | + """Map ``spu_id`` -> Chinese title from ``debug_info.per_result[].title_multilingual``.""" | |
| 58 | + out: Dict[str, str] = {} | |
| 59 | + if not isinstance(debug_info, dict): | |
| 60 | + return out | |
| 61 | + for entry in debug_info.get("per_result") or []: | |
| 62 | + if not isinstance(entry, dict): | |
| 63 | + continue | |
| 64 | + spu_id = str(entry.get("spu_id") or "").strip() | |
| 65 | + if not spu_id: | |
| 66 | + continue | |
| 67 | + zh = zh_title_from_multilingual(entry.get("title_multilingual")) | |
| 68 | + if zh: | |
| 69 | + out[spu_id] = zh | |
| 70 | + return out | |
| 71 | + | |
| 72 | + | |
| 51 | 73 | class SearchEvaluationFramework: |
| 52 | 74 | def __init__( |
| 53 | 75 | self, |
| 54 | 76 | tenant_id: str, |
| 55 | 77 | artifact_root: Path = DEFAULT_ARTIFACT_ROOT, |
| 56 | 78 | search_base_url: str = "http://localhost:6002", |
| 57 | - labeler_mode: str = DEFAULT_LABELER_MODE, | |
| 79 | + *, | |
| 80 | + judge_model: str | None = None, | |
| 81 | + enable_thinking: bool | None = None, | |
| 82 | + use_dashscope_batch: bool | None = None, | |
| 58 | 83 | ): |
| 59 | 84 | init_service(get_app_config().infrastructure.elasticsearch.host) |
| 60 | 85 | self.tenant_id = str(tenant_id) |
| 61 | 86 | self.artifact_root = ensure_dir(artifact_root) |
| 62 | - self.labeler_mode = str(labeler_mode).strip().lower() or DEFAULT_LABELER_MODE | |
| 63 | 87 | self.store = EvalStore(self.artifact_root / "search_eval.sqlite3") |
| 64 | 88 | self.search_client = SearchServiceClient(search_base_url, self.tenant_id) |
| 65 | 89 | app_cfg = get_app_config() |
| ... | ... | @@ -71,183 +95,20 @@ class SearchEvaluationFramework: |
| 71 | 95 | api_key = app_cfg.infrastructure.secrets.dashscope_api_key |
| 72 | 96 | if not api_key: |
| 73 | 97 | raise RuntimeError("dashscope_api_key is required for search evaluation annotation") |
| 98 | + model = str(judge_model or DEFAULT_JUDGE_MODEL) | |
| 99 | + et = DEFAULT_JUDGE_ENABLE_THINKING if enable_thinking is None else enable_thinking | |
| 100 | + use_batch = DEFAULT_JUDGE_DASHSCOPE_BATCH if use_dashscope_batch is None else use_dashscope_batch | |
| 101 | + batch_window = DEFAULT_JUDGE_BATCH_COMPLETION_WINDOW | |
| 102 | + batch_poll = float(DEFAULT_JUDGE_BATCH_POLL_INTERVAL_SEC) | |
| 74 | 103 | self.label_client = DashScopeLabelClient( |
| 75 | - model=str(llm_cfg["model"]), | |
| 104 | + model=model, | |
| 76 | 105 | base_url=str(llm_cfg["base_url"]), |
| 77 | 106 | api_key=str(api_key), |
| 107 | + batch_completion_window=batch_window, | |
| 108 | + batch_poll_interval_sec=batch_poll, | |
| 109 | + enable_thinking=et, | |
| 110 | + use_batch=use_batch, | |
| 78 | 111 | ) |
| 79 | - self.query_parser = None | |
| 80 | - | |
| 81 | - def _get_query_parser(self): | |
| 82 | - if self.query_parser is None: | |
| 83 | - self.query_parser = get_query_parser() | |
| 84 | - return self.query_parser | |
| 85 | - | |
| 86 | - def build_query_parser_hints(self, query: str) -> Dict[str, Any]: | |
| 87 | - parsed = self._get_query_parser().parse(query, generate_vector=False, target_languages=["en", "zh"]) | |
| 88 | - payload = parsed.to_dict() | |
| 89 | - payload["text_for_rerank"] = parsed.text_for_rerank() | |
| 90 | - return payload | |
| 91 | - | |
| 92 | - def get_query_profile(self, query: str, force_refresh: bool = False) -> Dict[str, Any]: | |
| 93 | - if self.labeler_mode != "complex": | |
| 94 | - raise RuntimeError("query profiles are only used in complex labeler mode") | |
| 95 | - if not force_refresh: | |
| 96 | - cached = self.store.get_query_profile(self.tenant_id, query, JUDGE_PROMPT_VERSION_COMPLEX) | |
| 97 | - if cached is not None: | |
| 98 | - return cached | |
| 99 | - parser_hints = self.build_query_parser_hints(query) | |
| 100 | - profile, raw_response = self.label_client.extract_query_profile(query, parser_hints) | |
| 101 | - profile["parser_hints"] = parser_hints | |
| 102 | - self.store.upsert_query_profile( | |
| 103 | - self.tenant_id, | |
| 104 | - query, | |
| 105 | - JUDGE_PROMPT_VERSION_COMPLEX, | |
| 106 | - self.label_client.model, | |
| 107 | - profile, | |
| 108 | - raw_response, | |
| 109 | - ) | |
| 110 | - return profile | |
| 111 | - | |
| 112 | - @staticmethod | |
| 113 | - def _doc_evidence_text(doc: Dict[str, Any]) -> str: | |
| 114 | - pieces: List[str] = [ | |
| 115 | - build_display_title(doc), | |
| 116 | - pick_text(doc.get("vendor"), "en"), | |
| 117 | - pick_text(doc.get("category_path"), "en"), | |
| 118 | - pick_text(doc.get("category_name"), "en"), | |
| 119 | - ] | |
| 120 | - for sku in doc.get("skus") or []: | |
| 121 | - pieces.extend( | |
| 122 | - [ | |
| 123 | - str(sku.get("option1_value") or ""), | |
| 124 | - str(sku.get("option2_value") or ""), | |
| 125 | - str(sku.get("option3_value") or ""), | |
| 126 | - ] | |
| 127 | - ) | |
| 128 | - for tag in doc.get("tags") or []: | |
| 129 | - pieces.append(str(tag)) | |
| 130 | - return normalize_text(" | ".join(piece for piece in pieces if piece)) | |
| 131 | - | |
| 132 | - def _apply_rule_based_label_guardrails( | |
| 133 | - self, | |
| 134 | - label: str, | |
| 135 | - query_profile: Dict[str, Any], | |
| 136 | - doc: Dict[str, Any], | |
| 137 | - ) -> str: | |
| 138 | - if label not in VALID_LABELS: | |
| 139 | - return label | |
| 140 | - evidence = self._doc_evidence_text(doc) | |
| 141 | - category = normalize_text(query_profile.get("primary_category")) | |
| 142 | - allowed_categories = [normalize_text(item) for item in query_profile.get("allowed_categories") or [] if str(item).strip()] | |
| 143 | - | |
| 144 | - primary_category_match = True | |
| 145 | - if category: | |
| 146 | - primary_category_match = category in evidence | |
| 147 | - allowed_category_match = True | |
| 148 | - if allowed_categories: | |
| 149 | - allowed_category_match = any(signal in evidence for signal in allowed_categories) | |
| 150 | - | |
| 151 | - if label == RELEVANCE_EXACT and not primary_category_match: | |
| 152 | - if allowed_category_match: | |
| 153 | - label = RELEVANCE_PARTIAL | |
| 154 | - else: | |
| 155 | - return RELEVANCE_IRRELEVANT | |
| 156 | - | |
| 157 | - for attr in query_profile.get("required_attributes") or []: | |
| 158 | - if not isinstance(attr, dict): | |
| 159 | - continue | |
| 160 | - attr_name = normalize_text(attr.get("name")) | |
| 161 | - if attr_name not in {"color", "fit", "length", "type", "product_type", "material", "size", "gender", "style", "waist_style", "rise"}: | |
| 162 | - continue | |
| 163 | - required_terms = [normalize_text(item) for item in attr.get("required_terms") or [] if normalize_text(item)] | |
| 164 | - conflicting_terms = [normalize_text(item) for item in attr.get("conflicting_terms") or [] if normalize_text(item)] | |
| 165 | - if attr_name == "fit": | |
| 166 | - if any(term in {"oversized", "oversize"} for term in required_terms): | |
| 167 | - conflicting_terms.extend(["slim", "slimming", "fitted", "tight", "close-fitting"]) | |
| 168 | - if any(term in {"fitted", "slim fit", "tight"} for term in required_terms): | |
| 169 | - conflicting_terms.extend(["oversized", "oversize", "loose", "relaxed"]) | |
| 170 | - has_required = any(term in evidence for term in required_terms) if required_terms else True | |
| 171 | - has_conflict = any(term in evidence for term in conflicting_terms) | |
| 172 | - | |
| 173 | - if has_conflict: | |
| 174 | - return RELEVANCE_IRRELEVANT | |
| 175 | - if label == RELEVANCE_EXACT and not has_required: | |
| 176 | - label = RELEVANCE_PARTIAL | |
| 177 | - | |
| 178 | - if label == RELEVANCE_PARTIAL and not primary_category_match and not allowed_category_match: | |
| 179 | - return RELEVANCE_IRRELEVANT | |
| 180 | - | |
| 181 | - return label | |
| 182 | - | |
| 183 | - @staticmethod | |
| 184 | - def _result_item_to_doc(item: Dict[str, Any]) -> Dict[str, Any]: | |
| 185 | - option_values = list(item.get("option_values") or []) | |
| 186 | - while len(option_values) < 3: | |
| 187 | - option_values.append("") | |
| 188 | - product = dict(item.get("product") or {}) | |
| 189 | - return { | |
| 190 | - "spu_id": item.get("spu_id"), | |
| 191 | - "title": product.get("title") or item.get("title"), | |
| 192 | - "vendor": product.get("vendor"), | |
| 193 | - "category_path": product.get("category"), | |
| 194 | - "category_name": product.get("category"), | |
| 195 | - "image_url": item.get("image_url") or product.get("image_url"), | |
| 196 | - "tags": product.get("tags") or [], | |
| 197 | - "skus": [ | |
| 198 | - { | |
| 199 | - "option1_value": option_values[0], | |
| 200 | - "option2_value": option_values[1], | |
| 201 | - "option3_value": option_values[2], | |
| 202 | - } | |
| 203 | - ], | |
| 204 | - } | |
| 205 | - | |
| 206 | - def _collect_label_issues( | |
| 207 | - self, | |
| 208 | - label: str, | |
| 209 | - query_profile: Dict[str, Any], | |
| 210 | - doc: Dict[str, Any], | |
| 211 | - ) -> List[str]: | |
| 212 | - evidence = self._doc_evidence_text(doc) | |
| 213 | - issues: List[str] = [] | |
| 214 | - category = normalize_text(query_profile.get("primary_category")) | |
| 215 | - allowed_categories = [ | |
| 216 | - normalize_text(item) | |
| 217 | - for item in query_profile.get("allowed_categories") or [] | |
| 218 | - if str(item).strip() | |
| 219 | - ] | |
| 220 | - | |
| 221 | - primary_category_match = True if not category else category in evidence | |
| 222 | - allowed_category_match = False if allowed_categories else primary_category_match | |
| 223 | - if allowed_categories: | |
| 224 | - allowed_category_match = any(signal in evidence for signal in allowed_categories) | |
| 225 | - | |
| 226 | - if label == RELEVANCE_EXACT and not primary_category_match: | |
| 227 | - if allowed_category_match: | |
| 228 | - issues.append("Exact missing primary category evidence") | |
| 229 | - else: | |
| 230 | - issues.append("Exact has category mismatch") | |
| 231 | - | |
| 232 | - if label == RELEVANCE_PARTIAL and not primary_category_match and not allowed_category_match: | |
| 233 | - issues.append("Partial has category mismatch") | |
| 234 | - | |
| 235 | - for attr in query_profile.get("required_attributes") or []: | |
| 236 | - if not isinstance(attr, dict): | |
| 237 | - continue | |
| 238 | - attr_name = normalize_text(attr.get("name")) | |
| 239 | - if attr_name not in {"color", "fit", "length", "type", "product_type", "material", "size", "gender", "style"}: | |
| 240 | - continue | |
| 241 | - required_terms = [normalize_text(item) for item in attr.get("required_terms") or [] if normalize_text(item)] | |
| 242 | - conflicting_terms = [normalize_text(item) for item in attr.get("conflicting_terms") or [] if normalize_text(item)] | |
| 243 | - has_required = any(term in evidence for term in required_terms) if required_terms else True | |
| 244 | - has_conflict = any(term in evidence for term in conflicting_terms) | |
| 245 | - | |
| 246 | - if has_conflict and label != RELEVANCE_IRRELEVANT: | |
| 247 | - issues.append(f"{label} conflicts on {attr_name}") | |
| 248 | - if label == RELEVANCE_EXACT and not has_required: | |
| 249 | - issues.append(f"Exact missing {attr_name}") | |
| 250 | - return issues | |
| 251 | 112 | |
| 252 | 113 | def audit_live_query( |
| 253 | 114 | self, |
| ... | ... | @@ -258,42 +119,6 @@ class SearchEvaluationFramework: |
| 258 | 119 | auto_annotate: bool = False, |
| 259 | 120 | ) -> Dict[str, Any]: |
| 260 | 121 | live = self.evaluate_live_query(query=query, top_k=top_k, auto_annotate=auto_annotate, language=language) |
| 261 | - if self.labeler_mode != "complex": | |
| 262 | - labels = [ | |
| 263 | - item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT | |
| 264 | - for item in live["results"] | |
| 265 | - ] | |
| 266 | - return { | |
| 267 | - "query": query, | |
| 268 | - "tenant_id": self.tenant_id, | |
| 269 | - "top_k": top_k, | |
| 270 | - "metrics": live["metrics"], | |
| 271 | - "distribution": label_distribution(labels), | |
| 272 | - "query_profile": None, | |
| 273 | - "suspicious": [], | |
| 274 | - "results": live["results"], | |
| 275 | - } | |
| 276 | - query_profile = self.get_query_profile(query, force_refresh=False) | |
| 277 | - suspicious: List[Dict[str, Any]] = [] | |
| 278 | - | |
| 279 | - for item in live["results"]: | |
| 280 | - doc = self._result_item_to_doc(item) | |
| 281 | - issues = self._collect_label_issues(item["label"] or "", query_profile, doc) | |
| 282 | - suggested_label = self._apply_rule_based_label_guardrails(item["label"] or "", query_profile, doc) | |
| 283 | - if suggested_label != (item["label"] or ""): | |
| 284 | - issues = list(issues) + [f"Suggested relabel: {item['label']} -> {suggested_label}"] | |
| 285 | - if issues: | |
| 286 | - suspicious.append( | |
| 287 | - { | |
| 288 | - "rank": item["rank"], | |
| 289 | - "spu_id": item["spu_id"], | |
| 290 | - "title": item["title"], | |
| 291 | - "label": item["label"], | |
| 292 | - "suggested_label": suggested_label, | |
| 293 | - "issues": issues, | |
| 294 | - } | |
| 295 | - ) | |
| 296 | - | |
| 297 | 122 | labels = [ |
| 298 | 123 | item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT |
| 299 | 124 | for item in live["results"] |
| ... | ... | @@ -304,8 +129,8 @@ class SearchEvaluationFramework: |
| 304 | 129 | "top_k": top_k, |
| 305 | 130 | "metrics": live["metrics"], |
| 306 | 131 | "distribution": label_distribution(labels), |
| 307 | - "query_profile": query_profile, | |
| 308 | - "suspicious": suspicious, | |
| 132 | + "query_profile": None, | |
| 133 | + "suspicious": [], | |
| 309 | 134 | "results": live["results"], |
| 310 | 135 | } |
| 311 | 136 | |
| ... | ... | @@ -485,15 +310,7 @@ class SearchEvaluationFramework: |
| 485 | 310 | if not docs: |
| 486 | 311 | return [] |
| 487 | 312 | try: |
| 488 | - if self.labeler_mode == "complex": | |
| 489 | - query_profile = self.get_query_profile(query, force_refresh=force_refresh) | |
| 490 | - labels, raw_response = self.label_client.classify_batch_complex(query, query_profile, docs) | |
| 491 | - labels = [ | |
| 492 | - self._apply_rule_based_label_guardrails(label, query_profile, doc) | |
| 493 | - for doc, label in zip(docs, labels) | |
| 494 | - ] | |
| 495 | - else: | |
| 496 | - labels, raw_response = self.label_client.classify_batch_simple(query, docs) | |
| 313 | + labels, raw_response = self.label_client.classify_batch(query, docs) | |
| 497 | 314 | return [(labels, raw_response, docs)] |
| 498 | 315 | except Exception: |
| 499 | 316 | if len(docs) == 1: |
| ... | ... | @@ -510,10 +327,28 @@ class SearchEvaluationFramework: |
| 510 | 327 | min_batches: int = DEFAULT_REBUILD_MIN_LLM_BATCHES, |
| 511 | 328 | max_batches: int = DEFAULT_REBUILD_MAX_LLM_BATCHES, |
| 512 | 329 | irrelevant_stop_ratio: float = DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO, |
| 330 | + irrelevant_low_combined_stop_ratio: float = DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO, | |
| 513 | 331 | stop_streak: int = DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK, |
| 514 | 332 | force_refresh: bool = True, |
| 515 | 333 | ) -> Tuple[Dict[str, str], List[Dict[str, Any]]]: |
| 516 | - """LLM-label ``ordered_docs`` in fixed-size batches with early stop after enough irrelevant-heavy batches.""" | |
| 334 | + """LLM-label ``ordered_docs`` in fixed-size batches along list order. | |
| 335 | + | |
| 336 | + **Early stop** (only after ``min_batches`` full batches have completed): | |
| 337 | + | |
| 338 | + Per batch, let *n* = batch size, and count labels among docs in that batch only. | |
| 339 | + | |
| 340 | + - *bad batch* iff there is **no** ``Exact Match`` in the batch **and** at least one of: | |
| 341 | + | |
| 342 | + - ``irrelevant_ratio = #(Irrelevant)/n >= irrelevant_stop_ratio`` (default 0.94), or | |
| 343 | + - ``( #(Irrelevant) + #(Low Relevant) ) / n >= irrelevant_low_combined_stop_ratio`` | |
| 344 | + (default 0.96; weak relevance = ``RELEVANCE_LOW``). | |
| 345 | + | |
| 346 | + Maintain a streak of consecutive *bad* batches; any non-bad batch resets the streak to 0. | |
| 347 | + Stop labeling when ``streak >= stop_streak`` (default 2) or when ``max_batches`` is reached | |
| 348 | + or the ordered list is exhausted. | |
| 349 | + | |
| 350 | + Constants for defaults: ``eval_framework.constants`` (``DEFAULT_REBUILD_*``). | |
| 351 | + """ | |
| 517 | 352 | batch_logs: List[Dict[str, Any]] = [] |
| 518 | 353 | streak = 0 |
| 519 | 354 | labels: Dict[str, str] = dict(self.store.get_labels(self.tenant_id, query)) |
| ... | ... | @@ -541,32 +376,46 @@ class SearchEvaluationFramework: |
| 541 | 376 | n = len(batch_docs) |
| 542 | 377 | exact_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_EXACT) |
| 543 | 378 | irrel_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_IRRELEVANT) |
| 379 | + low_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_LOW) | |
| 544 | 380 | exact_ratio = exact_n / n if n else 0.0 |
| 545 | 381 | irrelevant_ratio = irrel_n / n if n else 0.0 |
| 382 | + low_ratio = low_n / n if n else 0.0 | |
| 383 | + irrel_low_ratio = (irrel_n + low_n) / n if n else 0.0 | |
| 546 | 384 | log_entry = { |
| 547 | 385 | "batch_index": batch_idx + 1, |
| 548 | 386 | "size": n, |
| 549 | 387 | "exact_ratio": round(exact_ratio, 6), |
| 550 | 388 | "irrelevant_ratio": round(irrelevant_ratio, 6), |
| 389 | + "low_ratio": round(low_ratio, 6), | |
| 390 | + "irrelevant_plus_low_ratio": round(irrel_low_ratio, 6), | |
| 551 | 391 | "offset_start": start, |
| 552 | 392 | "offset_end": min(start + n, total_ordered), |
| 553 | 393 | } |
| 554 | 394 | batch_logs.append(log_entry) |
| 555 | 395 | print( |
| 556 | 396 | f"[eval-rebuild] query={query!r} llm_batch={batch_idx + 1}/{max_batches} " |
| 557 | - f"size={n} exact_ratio={exact_ratio:.4f} irrelevant_ratio={irrelevant_ratio:.4f}", | |
| 397 | + f"size={n} exact_ratio={exact_ratio:.4f} irrelevant_ratio={irrelevant_ratio:.4f} " | |
| 398 | + f"irrel_plus_low_ratio={irrel_low_ratio:.4f}", | |
| 558 | 399 | flush=True, |
| 559 | 400 | ) |
| 560 | 401 | |
| 402 | + # Early-stop streak: only evaluated after min_batches (warm-up before trusting tail quality). | |
| 561 | 403 | if batch_idx + 1 >= min_batches: |
| 562 | - if irrelevant_ratio > irrelevant_stop_ratio: | |
| 404 | + no_exact = exact_n == 0 | |
| 405 | + # Branch 1: high Irrelevant share, no Exact in this batch. | |
| 406 | + heavy_irrel = irrelevant_ratio >= irrelevant_stop_ratio | |
| 407 | + # Branch 2: Irrelevant + Low Relevant combined share, still no Exact. | |
| 408 | + heavy_irrel_low = irrel_low_ratio >= irrelevant_low_combined_stop_ratio | |
| 409 | + bad_batch = no_exact and (heavy_irrel or heavy_irrel_low) | |
| 410 | + if bad_batch: | |
| 563 | 411 | streak += 1 |
| 564 | 412 | else: |
| 565 | 413 | streak = 0 |
| 566 | 414 | if streak >= stop_streak: |
| 567 | 415 | print( |
| 568 | 416 | f"[eval-rebuild] query={query!r} early_stop after {batch_idx + 1} batches " |
| 569 | - f"({stop_streak} consecutive batches with irrelevant_ratio > {irrelevant_stop_ratio})", | |
| 417 | + f"({stop_streak} consecutive batches: no Exact and " | |
| 418 | + f"(irrelevant>={irrelevant_stop_ratio} or irrel+low>={irrelevant_low_combined_stop_ratio}))", | |
| 570 | 419 | flush=True, |
| 571 | 420 | ) |
| 572 | 421 | break |
| ... | ... | @@ -591,8 +440,19 @@ class SearchEvaluationFramework: |
| 591 | 440 | rebuild_min_batches: int = DEFAULT_REBUILD_MIN_LLM_BATCHES, |
| 592 | 441 | rebuild_max_batches: int = DEFAULT_REBUILD_MAX_LLM_BATCHES, |
| 593 | 442 | rebuild_irrelevant_stop_ratio: float = DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO, |
| 443 | + rebuild_irrel_low_combined_stop_ratio: float = DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO, | |
| 594 | 444 | rebuild_irrelevant_stop_streak: int = DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK, |
| 595 | 445 | ) -> QueryBuildResult: |
| 446 | + """Build per-query annotation pool and write ``query_builds/*.json``. | |
| 447 | + | |
| 448 | + Normal mode unions search + rerank windows and fills missing labels once. | |
| 449 | + | |
| 450 | + **Rebuild mode** (``force_refresh_labels=True``): full recall pool + corpus rerank outside | |
| 451 | + pool, optional skip for "easy" queries, then batched LLM labeling with **early stop**; | |
| 452 | + see ``_build_query_annotation_set_rebuild`` and ``_annotate_rebuild_batches`` (docstring | |
| 453 | + spells out the bad-batch / streak rule). Rebuild tuning knobs: ``rebuild_*`` and | |
| 454 | + ``search_recall_top_k`` parameters below; CLI mirrors them under ``build --force-refresh-labels``. | |
| 455 | + """ | |
| 596 | 456 | if force_refresh_labels: |
| 597 | 457 | return self._build_query_annotation_set_rebuild( |
| 598 | 458 | query=query, |
| ... | ... | @@ -607,6 +467,7 @@ class SearchEvaluationFramework: |
| 607 | 467 | rebuild_min_batches=rebuild_min_batches, |
| 608 | 468 | rebuild_max_batches=rebuild_max_batches, |
| 609 | 469 | rebuild_irrelevant_stop_ratio=rebuild_irrelevant_stop_ratio, |
| 470 | + rebuild_irrel_low_combined_stop_ratio=rebuild_irrel_low_combined_stop_ratio, | |
| 610 | 471 | rebuild_irrelevant_stop_streak=rebuild_irrelevant_stop_streak, |
| 611 | 472 | ) |
| 612 | 473 | |
| ... | ... | @@ -691,8 +552,6 @@ class SearchEvaluationFramework: |
| 691 | 552 | "annotate_rerank_top_k": annotate_rerank_top_k, |
| 692 | 553 | "pool_size": len(pool_docs), |
| 693 | 554 | }, |
| 694 | - "labeler_mode": self.labeler_mode, | |
| 695 | - "query_profile": self.get_query_profile(query, force_refresh=force_refresh_labels) if self.labeler_mode == "complex" else None, | |
| 696 | 555 | "metrics_top100": metrics, |
| 697 | 556 | "search_results": search_labeled_results, |
| 698 | 557 | "full_rerank_top": rerank_top_results, |
| ... | ... | @@ -724,6 +583,7 @@ class SearchEvaluationFramework: |
| 724 | 583 | rebuild_min_batches: int, |
| 725 | 584 | rebuild_max_batches: int, |
| 726 | 585 | rebuild_irrelevant_stop_ratio: float, |
| 586 | + rebuild_irrel_low_combined_stop_ratio: float, | |
| 727 | 587 | rebuild_irrelevant_stop_streak: int, |
| 728 | 588 | ) -> QueryBuildResult: |
| 729 | 589 | search_size = max(int(search_depth), int(search_recall_top_k)) |
| ... | ... | @@ -756,6 +616,7 @@ class SearchEvaluationFramework: |
| 756 | 616 | "rebuild_min_batches": rebuild_min_batches, |
| 757 | 617 | "rebuild_max_batches": rebuild_max_batches, |
| 758 | 618 | "rebuild_irrelevant_stop_ratio": rebuild_irrelevant_stop_ratio, |
| 619 | + "rebuild_irrel_low_combined_stop_ratio": rebuild_irrel_low_combined_stop_ratio, | |
| 759 | 620 | "rebuild_irrelevant_stop_streak": rebuild_irrelevant_stop_streak, |
| 760 | 621 | } |
| 761 | 622 | |
| ... | ... | @@ -797,6 +658,7 @@ class SearchEvaluationFramework: |
| 797 | 658 | min_batches=rebuild_min_batches, |
| 798 | 659 | max_batches=rebuild_max_batches, |
| 799 | 660 | irrelevant_stop_ratio=rebuild_irrelevant_stop_ratio, |
| 661 | + irrelevant_low_combined_stop_ratio=rebuild_irrel_low_combined_stop_ratio, | |
| 800 | 662 | stop_streak=rebuild_irrelevant_stop_streak, |
| 801 | 663 | force_refresh=True, |
| 802 | 664 | ) |
| ... | ... | @@ -867,8 +729,6 @@ class SearchEvaluationFramework: |
| 867 | 729 | "rebuild": rebuild_meta, |
| 868 | 730 | "ordered_union_size": pool_docs_count, |
| 869 | 731 | }, |
| 870 | - "labeler_mode": self.labeler_mode, | |
| 871 | - "query_profile": self.get_query_profile(query, force_refresh=False) if self.labeler_mode == "complex" else None, | |
| 872 | 732 | "metrics_top100": metrics, |
| 873 | 733 | "search_results": search_labeled_results, |
| 874 | 734 | "full_rerank_top": rerank_top_results, |
| ... | ... | @@ -893,7 +753,10 @@ class SearchEvaluationFramework: |
| 893 | 753 | language: str = "en", |
| 894 | 754 | force_refresh_labels: bool = False, |
| 895 | 755 | ) -> Dict[str, Any]: |
| 896 | - search_payload = self.search_client.search(query=query, size=max(top_k, 100), from_=0, language=language) | |
| 756 | + search_payload = self.search_client.search( | |
| 757 | + query=query, size=max(top_k, 100), from_=0, language=language, debug=True | |
| 758 | + ) | |
| 759 | + zh_by_spu = _zh_titles_from_debug_per_result(search_payload.get("debug_info")) | |
| 897 | 760 | results = list(search_payload.get("results") or []) |
| 898 | 761 | if auto_annotate: |
| 899 | 762 | self.annotate_missing_labels(query=query, docs=results[:top_k], force_refresh=force_refresh_labels) |
| ... | ... | @@ -906,11 +769,16 @@ class SearchEvaluationFramework: |
| 906 | 769 | label = labels.get(spu_id) |
| 907 | 770 | if label not in VALID_LABELS: |
| 908 | 771 | unlabeled_hits += 1 |
| 772 | + primary_title = build_display_title(doc) | |
| 773 | + title_zh = zh_by_spu.get(spu_id) or "" | |
| 774 | + if not title_zh and isinstance(doc.get("title"), dict): | |
| 775 | + title_zh = zh_title_from_multilingual(doc.get("title")) | |
| 909 | 776 | labeled.append( |
| 910 | 777 | { |
| 911 | 778 | "rank": rank, |
| 912 | 779 | "spu_id": spu_id, |
| 913 | - "title": build_display_title(doc), | |
| 780 | + "title": primary_title, | |
| 781 | + "title_zh": title_zh if title_zh and title_zh != primary_title else "", | |
| 914 | 782 | "image_url": doc.get("image_url"), |
| 915 | 783 | "label": label, |
| 916 | 784 | "option_values": list(compact_option_values(doc.get("skus") or [])), |
| ... | ... | @@ -926,7 +794,7 @@ class SearchEvaluationFramework: |
| 926 | 794 | relevant_missing_ids = [ |
| 927 | 795 | spu_id |
| 928 | 796 | for spu_id, label in labels.items() |
| 929 | - if label in {RELEVANCE_EXACT, RELEVANCE_PARTIAL} and spu_id not in recalled_spu_ids | |
| 797 | + if label in RELEVANCE_NON_IRRELEVANT and spu_id not in recalled_spu_ids | |
| 930 | 798 | ] |
| 931 | 799 | missing_docs_map = self.store.get_corpus_docs_by_spu_ids(self.tenant_id, relevant_missing_ids) |
| 932 | 800 | missing_relevant = [] |
| ... | ... | @@ -934,18 +802,26 @@ class SearchEvaluationFramework: |
| 934 | 802 | doc = missing_docs_map.get(spu_id) |
| 935 | 803 | if not doc: |
| 936 | 804 | continue |
| 805 | + miss_title = build_display_title(doc) | |
| 806 | + miss_zh = zh_title_from_multilingual(doc.get("title")) if isinstance(doc.get("title"), dict) else "" | |
| 937 | 807 | missing_relevant.append( |
| 938 | 808 | { |
| 939 | 809 | "spu_id": spu_id, |
| 940 | 810 | "label": labels[spu_id], |
| 941 | 811 | "rerank_score": rerank_scores.get(spu_id), |
| 942 | - "title": build_display_title(doc), | |
| 812 | + "title": miss_title, | |
| 813 | + "title_zh": miss_zh if miss_zh and miss_zh != miss_title else "", | |
| 943 | 814 | "image_url": doc.get("image_url"), |
| 944 | 815 | "option_values": list(compact_option_values(doc.get("skus") or [])), |
| 945 | 816 | "product": compact_product_payload(doc), |
| 946 | 817 | } |
| 947 | 818 | ) |
| 948 | - label_order = {RELEVANCE_EXACT: 0, RELEVANCE_PARTIAL: 1, RELEVANCE_IRRELEVANT: 2} | |
| 819 | + label_order = { | |
| 820 | + RELEVANCE_EXACT: 0, | |
| 821 | + RELEVANCE_HIGH: 1, | |
| 822 | + RELEVANCE_LOW: 2, | |
| 823 | + RELEVANCE_IRRELEVANT: 3, | |
| 824 | + } | |
| 949 | 825 | missing_relevant.sort( |
| 950 | 826 | key=lambda item: ( |
| 951 | 827 | label_order.get(str(item.get("label")), 9), |
| ... | ... | @@ -963,7 +839,7 @@ class SearchEvaluationFramework: |
| 963 | 839 | if unlabeled_hits: |
| 964 | 840 | tips.append(f"{unlabeled_hits} recalled results were not in the annotation set and were counted as Irrelevant.") |
| 965 | 841 | if not missing_relevant: |
| 966 | - tips.append("No cached Exact/Partial products were missed by this recall set.") | |
| 842 | + tips.append("No cached non-irrelevant products were missed by this recall set.") | |
| 967 | 843 | return { |
| 968 | 844 | "query": query, |
| 969 | 845 | "tenant_id": self.tenant_id, |
| ... | ... | @@ -977,7 +853,8 @@ class SearchEvaluationFramework: |
| 977 | 853 | "recalled_hits": len(labeled), |
| 978 | 854 | "missing_relevant_count": len(missing_relevant), |
| 979 | 855 | "missing_exact_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_EXACT), |
| 980 | - "missing_partial_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_PARTIAL), | |
| 856 | + "missing_high_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_HIGH), | |
| 857 | + "missing_low_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_LOW), | |
| 981 | 858 | }, |
| 982 | 859 | "tips": tips, |
| 983 | 860 | "total": int(search_payload.get("total") or 0), |
| ... | ... | @@ -1018,7 +895,8 @@ class SearchEvaluationFramework: |
| 1018 | 895 | aggregate = aggregate_metrics([item["metrics"] for item in per_query]) |
| 1019 | 896 | aggregate_distribution = { |
| 1020 | 897 | RELEVANCE_EXACT: sum(item["distribution"][RELEVANCE_EXACT] for item in per_query), |
| 1021 | - RELEVANCE_PARTIAL: sum(item["distribution"][RELEVANCE_PARTIAL] for item in per_query), | |
| 898 | + RELEVANCE_HIGH: sum(item["distribution"][RELEVANCE_HIGH] for item in per_query), | |
| 899 | + RELEVANCE_LOW: sum(item["distribution"][RELEVANCE_LOW] for item in per_query), | |
| 1022 | 900 | RELEVANCE_IRRELEVANT: sum(item["distribution"][RELEVANCE_IRRELEVANT] for item in per_query), |
| 1023 | 901 | } |
| 1024 | 902 | batch_id = f"batch_{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + '|'.join(queries))[:10]}" | ... | ... |
scripts/evaluation/eval_framework/metrics.py
| ... | ... | @@ -4,7 +4,7 @@ from __future__ import annotations |
| 4 | 4 | |
| 5 | 5 | from typing import Dict, Sequence |
| 6 | 6 | |
| 7 | -from .constants import RELEVANCE_EXACT, RELEVANCE_IRRELEVANT, RELEVANCE_PARTIAL | |
| 7 | +from .constants import RELEVANCE_EXACT, RELEVANCE_IRRELEVANT, RELEVANCE_HIGH, RELEVANCE_LOW, RELEVANCE_NON_IRRELEVANT | |
| 8 | 8 | |
| 9 | 9 | |
| 10 | 10 | def precision_at_k(labels: Sequence[str], k: int, relevant: Sequence[str]) -> float: |
| ... | ... | @@ -13,15 +13,17 @@ def precision_at_k(labels: Sequence[str], k: int, relevant: Sequence[str]) -> fl |
| 13 | 13 | sliced = list(labels[:k]) |
| 14 | 14 | if not sliced: |
| 15 | 15 | return 0.0 |
| 16 | - hits = sum(1 for label in sliced if label in relevant) | |
| 16 | + rel = set(relevant) | |
| 17 | + hits = sum(1 for label in sliced if label in rel) | |
| 17 | 18 | return hits / float(min(k, len(sliced))) |
| 18 | 19 | |
| 19 | 20 | |
| 20 | 21 | def average_precision(labels: Sequence[str], relevant: Sequence[str]) -> float: |
| 22 | + rel = set(relevant) | |
| 21 | 23 | hit_count = 0 |
| 22 | 24 | precision_sum = 0.0 |
| 23 | 25 | for idx, label in enumerate(labels, start=1): |
| 24 | - if label not in relevant: | |
| 26 | + if label not in rel: | |
| 25 | 27 | continue |
| 26 | 28 | hit_count += 1 |
| 27 | 29 | precision_sum += hit_count / idx |
| ... | ... | @@ -31,12 +33,14 @@ def average_precision(labels: Sequence[str], relevant: Sequence[str]) -> float: |
| 31 | 33 | |
| 32 | 34 | |
| 33 | 35 | def compute_query_metrics(labels: Sequence[str]) -> Dict[str, float]: |
| 36 | + """P@k / MAP_3: Exact Match only. P@k_2_3 / MAP_2_3: any non-irrelevant tier (legacy metric names).""" | |
| 34 | 37 | metrics: Dict[str, float] = {} |
| 38 | + non_irrel = list(RELEVANCE_NON_IRRELEVANT) | |
| 35 | 39 | for k in (5, 10, 20, 50): |
| 36 | 40 | metrics[f"P@{k}"] = round(precision_at_k(labels, k, [RELEVANCE_EXACT]), 6) |
| 37 | - metrics[f"P@{k}_2_3"] = round(precision_at_k(labels, k, [RELEVANCE_EXACT, RELEVANCE_PARTIAL]), 6) | |
| 41 | + metrics[f"P@{k}_2_3"] = round(precision_at_k(labels, k, non_irrel), 6) | |
| 38 | 42 | metrics["MAP_3"] = round(average_precision(labels, [RELEVANCE_EXACT]), 6) |
| 39 | - metrics["MAP_2_3"] = round(average_precision(labels, [RELEVANCE_EXACT, RELEVANCE_PARTIAL]), 6) | |
| 43 | + metrics["MAP_2_3"] = round(average_precision(labels, non_irrel), 6) | |
| 40 | 44 | return metrics |
| 41 | 45 | |
| 42 | 46 | |
| ... | ... | @@ -53,6 +57,7 @@ def aggregate_metrics(metric_items: Sequence[Dict[str, float]]) -> Dict[str, flo |
| 53 | 57 | def label_distribution(labels: Sequence[str]) -> Dict[str, int]: |
| 54 | 58 | return { |
| 55 | 59 | RELEVANCE_EXACT: sum(1 for label in labels if label == RELEVANCE_EXACT), |
| 56 | - RELEVANCE_PARTIAL: sum(1 for label in labels if label == RELEVANCE_PARTIAL), | |
| 60 | + RELEVANCE_HIGH: sum(1 for label in labels if label == RELEVANCE_HIGH), | |
| 61 | + RELEVANCE_LOW: sum(1 for label in labels if label == RELEVANCE_LOW), | |
| 57 | 62 | RELEVANCE_IRRELEVANT: sum(1 for label in labels if label == RELEVANCE_IRRELEVANT), |
| 58 | 63 | } | ... | ... |
scripts/evaluation/eval_framework/prompts.py
| ... | ... | @@ -2,84 +2,139 @@ |
| 2 | 2 | |
| 3 | 3 | from __future__ import annotations |
| 4 | 4 | |
| 5 | -import json | |
| 6 | -from typing import Any, Dict, Sequence | |
| 5 | +from typing import Sequence | |
| 7 | 6 | |
| 8 | -_CLASSIFY_BATCH_SIMPLE_TEMPLATE = """You are a relevance evaluation assistant for an apparel e-commerce search system. | |
| 9 | -Given the user query and each product's information, assign one relevance label to each product. | |
| 7 | +_CLASSIFY_TEMPLATE_EN = """You are a relevance judgment assistant for a fashion e-commerce search system. | |
| 8 | +Given a user query and the information for each product, assign a relevance label to each product. | |
| 9 | + | |
| 10 | +Your goal is to judge relevance from the perspective of e-commerce search ranking. | |
| 11 | +The key question is whether the user would view the product as the intended item, or as an acceptable substitute. | |
| 10 | 12 | |
| 11 | 13 | ## Relevance Labels |
| 12 | 14 | |
| 13 | -### Exact | |
| 14 | -The product fully satisfies the user's search intent: the core product type matches, all explicitly stated key attributes are supported by the product information. | 
| 15 | +### Exact Match | |
| 16 | +The product satisfies the user's core shopping intent: the core product type matches, and all explicitly stated key attributes in the query are supported by the product information, with no obvious conflict. | 
| 15 | 17 | |
| 16 | 18 | Typical use cases: |
| 17 | 19 | - The query contains only a product type, and the product is exactly that type. |
| 18 | -- The query contains product type + attributes, and the product matches both the type and all explicitly stated attributes. | |
| 20 | +- The query contains "product type + attributes", and the product matches both the type and all explicitly stated attributes. | 
| 19 | 21 | |
| 20 | -### Partial | |
| 21 | -The product satisfies the user's primary intent because the core product type matches, but some explicit requirements in the query are missing, cannot be confirmed, or deviate from the query. Despite the mismatch, the product can still be considered a non-target but acceptable substitute. | |
| 22 | +### High Relevant | |
| 23 | +The product satisfies the user's main intent: the core product type matches, but some explicitly requested attributes are missing from the product information, cannot be confirmed, or show minor / non-critical deviations. The product is still a good substitute for the user's core need. | 
| 22 | 24 | |
| 23 | -Use Partial when: | |
| 24 | -- The core product type matches, but some requested attributes cannot be confirmed. | |
| 25 | -- The core product type matches, but some secondary requirements deviate or are inconsistent. | |
| 26 | -- The product is not the ideal target, but it is still a plausible and acceptable substitute for the shopper. | |
| 25 | +Use "High Relevant" in the following cases: | 
| 26 | +- The core product type matches, but some requested attributes are missing, not mentioned, or cannot be verified. | |
| 27 | +- The core product type matches, but attributes such as color, material, style, fit, or length have minor deviations, as long as the deviation does not materially undermine the user's main shopping intent. | 
| 28 | +- The product is not the user's ideal target, but in an e-commerce shopping context, it would still be considered an acceptable and strong substitute. | 
| 27 | 29 | |
| 28 | -Typical cases: | |
| 29 | -- Query: "red fitted t-shirt", product: "Women's T-Shirt" → color/fit cannot be confirmed. | |
| 30 | -- Query: "red fitted t-shirt", product: "Blue Fitted T-Shirt" → product type and fit match, but color differs. | |
| 30 | +Typical examples: | |
| 31 | +- Query: "red slim-fit T-shirt" | 
| 32 | + Product: "women's T-shirt" | 
| 33 | + → Color and fit cannot be confirmed. | |
| 34 | +- Query: "red slim-fit T-shirt" | 
| 35 | + Product: "blue slim-fit T-shirt" | 
| 36 | + → Product type and fit match, but the color is different. | |
| 31 | 37 | |
| 32 | -Detailed example: | |
| 33 | -- Query: "cotton long sleeve shirt" | |
| 34 | -- Product: "J.VER Men's Linen Shirts Casual Button Down Long Sleeve Shirt Solid Spread Collar Summer Beach Shirts with Pocket" | |
| 38 | +Detailed case: | |
| 39 | +- Query: "cotton long-sleeve shirt" | 
| 40 | +- Product: "J.VER Men's Linen Shirt Casual Button Down Long Sleeve Solid Plain Collar Summer Beach Shirt with Pocket" | 
| 35 | 41 | |
| 36 | 42 | Analysis: |
| 37 | -- Material mismatch: the query explicitly requires cotton, while the product is linen, so it cannot be Exact. | |
| 38 | -- However, the core product type still matches: both are long sleeve shirts. | |
| 39 | -- In an e-commerce setting, the shopper may still consider clicking this item because the style and use case are similar. | |
| 40 | -- Therefore, it should be labeled Partial as a non-target but acceptable substitute. | |
| 43 | +- Material mismatch: the query explicitly requires "cotton", while the product is "linen", so it cannot be labeled as "Exact Match". | 
| 44 | +- However, the core category still matches: both are long-sleeve shirts. | |
| 45 | +- In e-commerce search, users may still click this item because the style and wearing scenario are similar. | |
| 46 | +- Therefore, it should be labeled as "High Relevant": not the exact target, but a good substitute. | 
| 41 | 47 | |
| 42 | -### Irrelevant | |
| 43 | -The product does not satisfy the user's main shopping intent. | |
| 48 | +Detailed case: | |
| 49 | +- Query: "black mid-length skirt" | 
| 50 | +- Product: "New spring autumn loose slimming full long floral skirt pleated skirt" | 
| 44 | 51 | |
| 45 | -Use Irrelevant when: | |
| 46 | -- The core product type does not match the query. | |
| 47 | -- The product belongs to a broadly related category, but not the specific product subtype requested, and shoppers would not consider them interchangeable. | |
| 48 | -- The core product type matches, but the product clearly contradicts an explicit and important requirement in the query. | |
| 52 | +Analysis: | |
| 53 | +- Category match: the product is a skirt, so the category matches. | |
| 54 | +- Color mismatch: the product description does not indicate black and explicitly mentions "floral", which is substantially different from plain black. | 
| 55 | +- Length deviation: the user asks for "mid-length", while the product title emphasizes "long skirt", which is somewhat longer. | 
| 56 | +- However, the core category "skirt" still matches, and style features such as "slimming" and "full skirt" may still fit some preferences of users searching for a mid-length skirt. Also, "long" versus "mid-length" is a deviation, but not a severe contradiction. | 
| 57 | +- Therefore, this should be labeled as "High Relevant": the core type matches, but there are several non-fatal attribute deviations. | 
| 49 | 58 | |
| 50 | -Typical cases: | |
| 51 | -- Query: "pants", product: "shoes" → wrong product type. | |
| 52 | -- Query: "dress", product: "skirt" → different product type. | |
| 53 | -- Query: "fitted pants", product: "loose wide-leg pants" → explicit contradiction on fit. | |
| 54 | -- Query: "sleeveless dress", product: "long sleeve dress" → explicit contradiction on sleeve style. | |
| 59 | +### Low Relevant | |
| 60 | +The product has a noticeable gap from the user’s core target, but still shares some similarity with the query in style, scenario, function, or broader category. A small portion of users may still view it as a barely acceptable substitute. It is not the intended item, but still has some relevance. | |
| 55 | 61 | |
| 56 | -This label emphasizes clarity of user intent. When the query specifies a concrete product type or an important attribute, products that conflict with that intent should be judged Irrelevant even if they are related at a higher category level. | |
| 62 | +Use “Low Relevant” in the following cases: | |
| 63 | +- The core product type does not match, but the two types are still very close in style, wearing scenario, or function, so there is still some substitutability. | |
| 64 | +- The core product type matches, but the product differs from the user’s ideal target on multiple attributes; it still has some relevance, but is no longer a strong substitute. | |
| 65 | +- An important query requirement is clearly violated, but the product still retains a limited reason to be clicked. | |
| 57 | 66 | |
| 58 | -## Decision Principles | |
| 67 | +Typical cases: | |
| 68 | +- Query: “black mid-length skirt” | |
| 69 | + Product: “New high-waisted V-neck mid-length dress elegant printed black sexy dress” | |
| 70 | + → The core product type differs (“skirt” vs “dress”), but both belong to closely related apparel types and share a similar mid-length style, so it is “Low Relevant”. | |
| 59 | 71 | |
| 60 | -1. Product type is the highest-priority factor. | |
| 61 | - If the query clearly specifies a concrete product type, the result must match that product type to be Exact or Partial. | |
| 62 | - A different product type is usually Irrelevant, not Partial. | |
| 72 | +- Query: “jeans” | |
| 73 | + Product: “casual pants” | |
| 74 | + → The core product type is different, but both belong to the broader pants category, and the style / wearing scenario may still be close enough to be a weak substitute. | |
| 63 | 75 | |
| 64 | -2. Similar or related product types are not interchangeable when the query is specific. | |
| 76 | +### Irrelevant | |
| 77 | +The product does not satisfy the user’s main shopping intent, and the likelihood of user engagement is very low. | |
| 78 | + | |
| 79 | +Typical situations: | |
| 80 | +- The core product type does not match the query and is not a close substitute in style, scenario, or function. | |
| 81 | +- The product belongs to a roughly related broader category, but not to an interchangeable subtype explicitly requested in the query, and the style or usage scenario differs significantly. | |
| 82 | +- The core product type matches, but the product clearly violates an explicit and important requirement in the query, with little or no acceptable substitutability. | |
| 83 | + | |
| 84 | +Typical examples: | |
| 85 | +- Query: “pants” | |
| 86 | + Product: “shoes” | |
| 87 | + → Wrong product type. | |
| 88 | +- Query: “slim-fit pants” | |
| 89 | + Product: “loose wide-leg pants” | |
| 90 | + → Clear contradiction in fit, with extremely low substitutability. | |
| 91 | +- Query: “sleeveless dress” | |
| 92 | + Product: “long-sleeve dress” | |
| 93 | + → Clear contradiction in sleeve type. | |
| 94 | +- Query: “jeans” | |
| 95 | + Product: “sweatpants” | |
| 96 | + → Different core category, with significantly different style and wearing scenario. | |
| 97 | +- Query: “boots” | |
| 98 | + Product: “sneakers” | |
| 99 | + → Different core category, different function, and different usage scenario. | |
| 100 | + | |
| 101 | +## Judgment Principles | |
| 102 | + | |
| 103 | +1. **Product type is the highest-priority factor.** | |
| 104 | + If the query explicitly specifies a concrete product type, the result must match that product type in order to be labeled as “Exact Match” or “High Relevant”. | |
| 105 | + Different product types should usually be labeled as “Low Relevant” or “Irrelevant”. | |
| 106 | + | |
| 107 | + - **Low Relevant**: use only when the two product types are very close in style, scenario, or function, and the user may still treat one as a barely acceptable substitute for the other. | |
| 108 | + - **Irrelevant**: all other product type mismatch cases. | |
| 109 | + | |
| 110 | +2. **Similar or related product types are usually not directly interchangeable when the query is explicit, but their closeness should determine whether the label is “Low Relevant” or “Irrelevant”.** | |
| 65 | 111 | For example: |
| 66 | - - dress vs skirt vs jumpsuit | |
| 67 | - - jeans vs pants | |
| 68 | - - t-shirt vs blouse | |
| 69 | - - cardigan vs sweater | |
| 70 | - - boots vs shoes | |
| 71 | - - bra vs top | |
| 72 | - - backpack vs bag | |
| 73 | - If the user explicitly searched for one of these, the others should usually be judged Irrelevant. | |
| 74 | - | |
| 75 | -3. If the core product type matches, then evaluate attributes. | |
| 76 | - - If all explicit attributes match → Exact | |
| 77 | - - If some attributes are missing, uncertain, or partially mismatched, but the item is still an acceptable substitute → Partial | |
| 78 | - - If an explicit and important attribute is clearly contradicted, and the item is not a reasonable substitute → Irrelevant | |
| 79 | - | |
| 80 | -4. Distinguish carefully between "not mentioned" and "contradicted". | |
| 81 | - - If an attribute is not mentioned or cannot be verified, prefer Partial. | |
| 82 | - - If an attribute is explicitly opposite to the query, use Irrelevant unless the item is still reasonably acceptable as a substitute under the shopping context. | |
| 112 | + - **May be Low Relevant due to strong similarity in style / scenario**: dress vs skirt, long skirt vs mid-length skirt, jeans vs casual pants, sneakers vs skate shoes. | |
| 113 | + - **Should be Irrelevant due to substantial difference in style / scenario**: pants vs shoes, T-shirt vs hat, boots vs sneakers, jeans vs suit pants, backpack vs handbag. | |
| 114 | + | |
| 115 | +3. **Once the core product type matches, evaluate attributes.** | |
| 116 | + - All explicit attributes match → **Exact Match** | |
| 117 | + - Some attributes are missing, not mentioned, cannot be verified, or show only minor deviations → **High Relevant** | |
| 118 | + - There are multiple attribute deviations, or an important attribute is clearly violated, but the product still retains some substitutability → **Low Relevant** | |
| 119 | + - There is a clear and important hard conflict, and substitutability is extremely low → **Irrelevant** | |
| 120 | + | |
| 121 | +4. **Strictly distinguish among “not mentioned / cannot confirm”, “minor deviation”, and “explicit contradiction”.** | |
| 122 | + - If an attribute is not mentioned or cannot be verified, prefer **High Relevant**. | |
| 123 | + - If an attribute shows a minor deviation, such as different color, different material, or slightly different length, it should usually be labeled **High Relevant**. | |
| 124 | + - If an attribute is explicitly opposite to the query requirement, such as sleeveless vs long-sleeve or slim-fit vs loose wide-leg, decide between **Low Relevant** and **Irrelevant** based on the severity of the conflict and practical substitutability. | |
| 125 | + - If the conflict directly breaks the user’s main shopping goal, it should usually be labeled **Irrelevant**. | |
| 126 | + | |
| 127 | +5. **Substitutability should be judged from real shopping intent, not just surface-level textual similarity.** | |
| 128 | + The question is whether the user would realistically accept the product in a shopping scenario. | |
| 129 | + - Good substitute → **High Relevant** | |
| 130 | + - Barely acceptable substitute → **Low Relevant** | |
| 131 | + - Hardly substitutable at all → **Irrelevant** | |
| 132 | + | |
| 133 | +6. **When product information is insufficient, do not treat “cannot confirm” as “conflict”.** | |
| 134 | + If a product does not mention an attribute, that does not mean the attribute is definitely violated. | |
| 135 | + Therefore: | |
| 136 | + - If the attribute is not mentioned or cannot be confirmed, prefer **High Relevant**; | |
| 137 | + - Only treat it as a conflict when the product information clearly shows the opposite of the query requirement. | |
| 83 | 138 | |
| 84 | 139 | Query: {query} |
| 85 | 140 | |
| ... | ... | @@ -87,88 +142,139 @@ Products: |
| 87 | 142 | {lines} |
| 88 | 143 | |
| 89 | 144 | ## Output Format |
| 90 | -Strictly output {n} lines, each line containing exactly one of: | |
| 91 | -Exact | |
| 92 | -Partial | |
| 145 | +Output exactly {n} lines. | |
| 146 | +Each line must be exactly one of the following: | |
| 147 | +Exact Match | |
| 148 | +High Relevant | |
| 149 | +Low Relevant | |
| 93 | 150 | Irrelevant |
| 94 | 151 | |
| 95 | -The lines must correspond sequentially to the products above. | |
| 96 | -Do not output any other information. | |
| 152 | +The output lines must correspond to the products above in the same order. | |
| 153 | +Do not output anything else. | |
| 97 | 154 | """ |
| 98 | 155 | |
| 99 | -_CLASSIFY_BATCH_SIMPLE_TEMPLATE__zh = _CLASSIFY_BATCH_SIMPLE_TEMPLATE_ZH = """ä½ æ˜¯ä¸€ä¸ªæœé¥°ç”µå•†æœç´¢ç³»ç»Ÿä¸çš„相关性判æ–助手。 | |
| 156 | +_CLASSIFY_TEMPLATE_ZH = """ä½ æ˜¯ä¸€ä¸ªæœé¥°ç”µå•†æœç´¢ç³»ç»Ÿä¸çš„相关性判æ–助手。 | |
| 100 | 157 | 给定用户查询è¯ä»¥åŠæ¯ä¸ªå•†å“的信æ¯ï¼Œè¯·ä¸ºæ¯ä¸ªå•†å“分é…ä¸€ä¸ªç›¸å…³æ€§æ ‡ç¾ã€‚ |
| 101 | 158 | |
| 159 | +ä½ çš„ç›®æ ‡æ˜¯ä»Žç”µå•†æœç´¢æŽ’åºçš„角度,判æ–商哿˜¯å¦æ»¡è¶³ç”¨æˆ·çš„è´ç‰©æ„图。 | |
| 160 | +åˆ¤æ–æ—¶åº”优先考虑“用户是å¦ä¼šæŠŠè¯¥å•†å“è§†ä¸ºç›®æ ‡å•†å“ï¼Œæˆ–å¯æŽ¥å—的替代å“â€ã€‚ | |
| 161 | + | |
| 102 | 162 | ## 相关性标签 |
| 103 | 163 | |
| 104 | 164 | ### 完全相关 |
| 105 | -æ ¸å¿ƒäº§å“类型匹é…,所有明确æåŠçš„å…³é”®å±žæ€§å‡æœ‰äº§å“ä¿¡æ¯æ”¯æ’‘。 | |
| 165 | +商哿»¡è¶³ç”¨æˆ·çš„æ ¸å¿ƒè´ç‰©æ„å›¾ï¼šæ ¸å¿ƒå•†å“类型匹é…ï¼Œä¸”æŸ¥è¯¢ä¸æ‰€æœ‰æ˜Žç¡®æåŠçš„å…³é”®å±žæ€§å‡æœ‰å•†å“ä¿¡æ¯æ”¯æŒã€‚ | |
| 106 | 166 | |
| 107 | 167 | 典型适用场景: |
| 108 | -- 查询仅包å«äº§å“类型,产å“å³ä¸ºè¯¥ç±»åž‹ã€‚ | |
| 109 | -- 查询包å«â€œäº§å“类型 + 属性â€ï¼Œäº§å“åœ¨ç±»åž‹åŠæ‰€æœ‰æ˜Žç¡®å±žæ€§ä¸Šå‡ç¬¦åˆã€‚ | |
| 168 | +- 查询仅包å«å•†å“类型,商å“å³ä¸ºè¯¥ç±»åž‹ã€‚ | |
| 169 | +- 查询包å«â€œå•†å“类型 + 属性â€ï¼Œå•†å“åœ¨ç±»åž‹åŠæ‰€æœ‰æ˜Žç¡®å±žæ€§ä¸Šå‡ç¬¦åˆã€‚ | |
| 110 | 170 | |
| 111 | -### 部分相关 | |
| 112 | -äº§å“æ»¡è¶³ç”¨æˆ·çš„ä¸»è¦æ„å›¾ï¼ˆæ ¸å¿ƒäº§å“类型匹é…ï¼‰ï¼Œä½†æŸ¥è¯¢ä¸æ˜Žç¡®çš„éƒ¨åˆ†è¦æ±‚未体现,或å˜åœ¨å差。虽然有ä¸ä¸€è‡´ï¼Œä½†ä»å±žäºŽâ€œéžç›®æ ‡ä½†å¯æŽ¥å—â€çš„æ›¿ä»£å“。 | |
| 171 | +### 基本相关 | |
| 172 | +商哿»¡è¶³ç”¨æˆ·çš„ä¸»è¦æ„å›¾ï¼šæ ¸å¿ƒå•†å“类型匹é…ï¼Œä½†æŸ¥è¯¢ä¸æ˜Žç¡®æå‡ºçš„éƒ¨åˆ†è¦æ±‚未在商å“ä¿¡æ¯ä¸ä½“çŽ°ã€æ— 法确认,或å˜åœ¨è½»å¾®åå·® / éžå…³é”®å差。该商å“仿˜¯æ»¡è¶³ç”¨æˆ·æ ¸å¿ƒéœ€æ±‚的良好替代å“。 | |
| 113 | 173 | |
| 114 | -在以下情况使用部分相关: | |
| 115 | -- æ ¸å¿ƒäº§å“类型匹é…,但部分请求的属性在商å“ä¿¡æ¯ä¸ç¼ºå¤±ã€æœªæåŠæˆ–æ— æ³•ç¡®è®¤ã€‚ | |
| 116 | -- æ ¸å¿ƒäº§å“类型匹é…,但æè´¨ã€ç‰ˆåž‹ã€é£Žæ ¼ç‰æ¬¡è¦è¦æ±‚å˜åœ¨å差或ä¸ä¸€è‡´ã€‚ | |
| 117 | -- 商å“䏿˜¯ç”¨æˆ·æœ€ç†æƒ³çš„ç›®æ ‡ï¼Œä½†ä»Žç”µå•†è´ç‰©è§’度看,ä»å¯èƒ½è¢«ç”¨æˆ·è§†ä¸ºå¯æŽ¥å—的替代å“。 | |
| 174 | +在以下情况使用“基本相关â€ï¼š | |
| 175 | +- æ ¸å¿ƒå•†å“类型匹é…ï¼Œä½†éƒ¨åˆ†å±žæ€§ç¼ºå¤±ã€æœªæåŠæˆ–æ— æ³•ç¡®è®¤ã€‚ | |
| 176 | +- æ ¸å¿ƒå•†å“类型匹é…ï¼Œä½†é¢œè‰²ã€æè´¨ã€é£Žæ ¼ã€ç‰ˆåž‹ã€é•¿åº¦ç‰å±žæ€§å˜åœ¨è½»å¾®å差,åªè¦è¿™ç§åå·®ä¸ä¼šæ˜Žæ˜¾ç ´å用户的主è¦è´ä¹°æ„图。 | |
| 177 | +- 商å“䏿˜¯ç”¨æˆ·æœ€ç†æƒ³çš„ç›®æ ‡ï¼Œä½†åœ¨ç”µå•†è´ç‰©åœºæ™¯ä¸‹ä»å¯èƒ½è¢«è§†ä¸ºå¯æŽ¥å—ã€ä¸”较优的替代å“。 | |
| 118 | 178 | |
| 119 | 179 | 典型情况: |
| 120 | -- 查询:“红色修身Tæ¤â€ï¼Œäº§å“:“女士Tæ¤â€ → 颜色/ç‰ˆåž‹æ— æ³•ç¡®è®¤ã€‚ | |
| 121 | -- 查询:“红色修身Tæ¤â€ï¼Œäº§å“:“è“色修身Tæ¤â€ → 产å“类型和版型匹é…,但颜色ä¸åŒã€‚ | |
| 180 | +- 查询:“红色修身Tæ¤â€ï¼Œå•†å“:“女士Tæ¤â€ | |
| 181 | + → 颜色ã€ç‰ˆåž‹æ— 法确认。 | |
| 182 | +- 查询:“红色修身Tæ¤â€ï¼Œå•†å“:“è“色修身Tæ¤â€ | |
| 183 | + → 商å“类型和版型匹é…,但颜色ä¸åŒã€‚ | |
| 122 | 184 | |
| 123 | 185 | 详细案例: |
| 124 | 186 | - 查询:“棉质长袖衬衫†|
| 125 | 187 | - 商å“:“J.VERç”·å¼äºšéº»è¡¬è¡«ä¼‘闲纽扣长袖衬衫纯色平领å¤å£æ²™æ»©è¡¬è¡«å¸¦å£è¢‹â€ |
| 126 | 188 | |
| 127 | 189 | 分æžï¼š |
| 128 | -- æè´¨ä¸ç¬¦ï¼šQuery 明确指定“棉质â€ï¼Œè€Œå•†å“为“亚麻â€ï¼Œå› æ¤ä¸èƒ½åˆ¤ä¸ºå®Œå…¨ç›¸å…³ã€‚ | |
| 190 | +- æè´¨ä¸ç¬¦ï¼šQuery 明确指定“棉质â€ï¼Œè€Œå•†å“为“亚麻â€ï¼Œå› æ¤ä¸èƒ½åˆ¤ä¸ºâ€œå®Œå…¨ç›¸å…³â€ã€‚ | |
| 129 | 191 | - ä½†æ ¸å¿ƒå“ç±»ä»ç„¶åŒ¹é…:两者都是“长袖衬衫â€ã€‚ |
| 130 | 192 | - 在电商æœç´¢ä¸ï¼Œç”¨æˆ·ä»å¯èƒ½å› 为款å¼ã€ç©¿ç€åœºæ™¯ç›¸è¿‘而点击该商å“。 |
| 131 | -- å› æ¤åº”判为部分相关,å³â€œéžç›®æ ‡ä½†å¯æŽ¥å—â€çš„æ›¿ä»£å“。 | |
| 193 | +- å› æ¤åº”判为“基本相关â€ï¼Œå³â€œéžç²¾ç¡®ç›®æ ‡ï¼Œä½†å±žäºŽè‰¯å¥½æ›¿ä»£å“â€ã€‚ | |
| 132 | 194 | |
| 133 | -### ä¸ç›¸å…³ | |
| 134 | -äº§å“æœªæ»¡è¶³ç”¨æˆ·çš„主è¦è´ç‰©æ„图,主è¦è¡¨çŽ°ä¸ºä»¥ä¸‹æƒ…å½¢ä¹‹ä¸€ï¼š | |
| 135 | -- æ ¸å¿ƒäº§å“类型与查询ä¸åŒ¹é…。 | |
| 136 | -- 产å“虽属大致相关的大类,但与查询指定的具体åç±»ä¸å¯äº’æ¢ã€‚ | |
| 137 | -- æ ¸å¿ƒäº§å“类型匹é…ï¼Œä½†äº§å“æ˜Žæ˜¾è¿èƒŒäº†æŸ¥è¯¢ä¸ä¸€ä¸ªæ˜Žç¡®ä¸”é‡è¦çš„å±žæ€§è¦æ±‚。 | |
| 195 | +详细案例: | |
| 196 | +- 查询:“黑色ä¸é•¿åŠèº«è£™â€ | |
| 197 | +- 商å“ï¼šâ€œæ˜¥ç§‹å£æ–°æ¬¾å®½æ¾æ˜¾ç˜¦å¤§æ‘†é•¿è£™ç¢ŽèбåŠèº«è£™è¤¶çš±è®¾è®¡è£™â€ | |
| 198 | + | |
| 199 | +分æžï¼š | |
| 200 | +- å“类匹é…ï¼šå•†å“æ˜¯â€œåŠèº«è£™â€ï¼Œå“类符åˆã€‚ | |
| 201 | +- 颜色ä¸åŒ¹é…ï¼šå•†å“æè¿°æœªæåŠé»‘色,且明确包å«â€œç¢Žèбâ€ï¼Œä¸Žçº¯é»‘差异较大。 | |
| 202 | +- 长度å˜åœ¨åå·®ï¼šç”¨æˆ·è¦æ±‚“ä¸é•¿â€ï¼Œè€Œå•†å“æ ‡é¢˜å¼ºè°ƒâ€œé•¿è£™â€ï¼Œé•¿åº¦å长。 | |
| 203 | +- ä½†æ ¸å¿ƒå“类“åŠèº«è£™â€åŒ¹é…,“显瘦â€â€œå¤§æ‘†â€ç‰é£Žæ ¼ç‰¹å¾ä»å¯èƒ½ç¬¦åˆéƒ¨åˆ†æœç´¢â€œä¸é•¿åŠèº«è£™â€ç”¨æˆ·çš„æ½œåœ¨åå¥½ï¼›åŒæ—¶â€œé•¿è£™â€å’Œâ€œä¸é•¿â€è™½æœ‰åå·®ï¼Œä½†ä¸æž„æˆä¸¥é‡å¯¹ç«‹ã€‚ | |
| 204 | +- å› æ¤åº”判为“基本相关â€ï¼šæ ¸å¿ƒå“类匹é…,但å˜åœ¨è‹¥å¹²éžè‡´å‘½å±žæ€§å差。 | |
| 205 | + | |
| 206 | +### 弱相关 | |
| 207 | +商å“ä¸Žç”¨æˆ·çš„æ ¸å¿ƒç›®æ ‡å˜åœ¨æ˜Žæ˜¾å·®è·ï¼Œä½†ä»ä¸ŽæŸ¥è¯¢åœ¨é£Žæ ¼ã€åœºæ™¯ã€åŠŸèƒ½æˆ–å¤§ç±»ä¸Šå…·æœ‰ä¸€å®šç›¸ä¼¼æ€§ï¼Œå¯èƒ½è¢«å°‘é‡ç”¨æˆ·è§†ä¸ºå‹‰å¼ºå¯æŽ¥å—的替代å“。属于“éžç›®æ ‡ï¼Œä½†ä»æœ‰ä¸€å®šå…³è”â€ã€‚ | |
| 208 | + | |
| 209 | +在以下情况使用“弱相关â€ï¼š | |
| 210 | +- æ ¸å¿ƒå•†å“类型ä¸ä¸€è‡´ï¼Œä½†ä¸¤è€…åœ¨é£Žæ ¼ã€ç©¿ç€åœºæ™¯æˆ–功能上éžå¸¸æŽ¥è¿‘,ä»å…·æœ‰ä¸€å®šæ›¿ä»£æ€§ã€‚ | |
| 211 | +- æ ¸å¿ƒå•†å“类型匹é…ï¼Œä½†åœ¨å¤šä¸ªå±žæ€§ä¸Šä¸Žç”¨æˆ·ç†æƒ³ç›®æ ‡å·®è·è¾ƒå¤§ï¼Œè™½ä»æœ‰ä¸€å®šå…³è”æ€§ï¼Œä½†å·²ä¸æ˜¯é«˜è´¨é‡æ›¿ä»£å“。 | |
| 212 | +- æŸ¥è¯¢è¦æ±‚ä¸çš„æŸä¸ªé‡è¦å±žæ€§è¢«æ˜Žæ˜¾è¿èƒŒï¼Œä½†å•†å“ä»ä¿ç•™å°‘é‡è¢«ç‚¹å‡»çš„ç†ç”±ã€‚ | |
| 138 | 213 | |
| 139 | 214 | 典型情况: |
| 140 | -- 查询:“裤åâ€ï¼Œäº§å“:“鞋å†→ 产å“类型错误。 | |
| 141 | -- 查询:“连衣裙â€ï¼Œäº§å“:“åŠèº«è£™â€ → 具体产å“类型ä¸åŒã€‚ | |
| 142 | -- 查询:“修身裤â€ï¼Œäº§å“:“宽æ¾é˜”腿裤†→ ä¸Žç‰ˆåž‹è¦æ±‚明显冲çªã€‚ | |
| 143 | -- æŸ¥è¯¢ï¼šâ€œæ— è¢–è¿žè¡£è£™â€ï¼Œäº§å“:“长袖连衣裙†→ ä¸Žè¢–åž‹è¦æ±‚明显冲çªã€‚ | |
| 215 | +- 查询:“黑色ä¸é•¿åŠèº«è£™â€ï¼Œå•†å“:“新款高腰V领ä¸é•¿æ¬¾è¿žè¡£è£™ 优雅å°èŠ±é»‘è‰²æ€§æ„Ÿè¿žè¡£è£™â€ | |
| 216 | + → æ ¸å¿ƒå•†å“类型“åŠèº«è£™â€ä¸Žâ€œè¿žè¡£è£™â€ä¸åŒï¼Œä½†ä¸¤è€…åŒå±žè£™è£…,且款å¼ä¸Šå‡ä¸ºâ€œä¸é•¿æ¬¾â€ï¼Œåœ¨ç©¿æåœºæ™¯ä¸ŠæŽ¥è¿‘ï¼Œå› æ¤å±žäºŽâ€œå¼±ç›¸å…³â€ã€‚ | |
| 217 | + | |
| 218 | +- 查询:“牛仔裤â€ï¼Œå•†å“:“休闲裤†| |
| 219 | + → æ ¸å¿ƒå•†å“类型ä¸åŒï¼Œä½†åŒå±žè£¤è£…å¤§ç±»ï¼Œé£Žæ ¼å’Œç©¿ç€åœºæ™¯å¯èƒ½æŽ¥è¿‘,å¯ä½œä¸ºè¾ƒå¼±æ›¿ä»£å“。 | |
| 220 | + | |
| 221 | +### 不相关 | |
| 222 | +商哿œªæ»¡è¶³ç”¨æˆ·çš„主è¦è´ç‰©æ„图,用户点击动机æžä½Žã€‚ | |
| 223 | + | |
| 224 | +主è¦è¡¨çŽ°ä¸ºä»¥ä¸‹æƒ…å½¢ä¹‹ä¸€ï¼š | |
| 225 | +- æ ¸å¿ƒå•†å“类型与查询ä¸åŒ¹é…,且ä¸å±žäºŽé£Žæ ¼ / 场景 / åŠŸèƒ½æŽ¥è¿‘çš„å¯æ›¿ä»£å“。 | |
| 226 | +- 商å“虽属于大致相关的大类,但与查询明确指定的具体åç±»ä¸å¯äº’æ¢ï¼Œä¸”é£Žæ ¼æˆ–åœºæ™¯å·®å¼‚å¤§ã€‚ | |
| 227 | +- æ ¸å¿ƒå•†å“类型匹é…ï¼Œä½†å•†å“æ˜Žæ˜¾è¿èƒŒäº†æŸ¥è¯¢ä¸ä¸€ä¸ªæ˜Žç¡®ä¸”é‡è¦çš„è¦æ±‚ï¼Œä¸”å‡ ä¹Žä¸å…·å¤‡å¯æŽ¥å—的替代性。 | |
| 144 | 228 | |
| 145 | -è¯¥æ ‡ç¾å¼ºè°ƒç”¨æˆ·æ„图的明确性。当查询指å‘具体类型或关键属性时,å³ä½¿äº§å“在更高层级类别上相关,也应按ä¸ç›¸å…³å¤„ç†ã€‚ | |
| 229 | +典型情况: | |
| 230 | +- 查询:“裤åâ€ï¼Œå•†å“:“鞋å†| |
| 231 | + → 商å“类型错误。 | |
| 232 | +- 查询:“修身裤â€ï¼Œå•†å“:“宽æ¾é˜”腿裤†| |
| 233 | + → ä¸Žç‰ˆåž‹è¦æ±‚明显冲çªï¼Œæ›¿ä»£æ€§æžä½Žã€‚ | |
| 234 | +- æŸ¥è¯¢ï¼šâ€œæ— è¢–è¿žè¡£è£™â€ï¼Œå•†å“:“长袖连衣裙†| |
| 235 | + → ä¸Žè¢–åž‹è¦æ±‚明显冲çªã€‚ | |
| 236 | +- 查询:“牛仔裤â€ï¼Œå•†å“:“è¿åŠ¨è£¤â€ | |
| 237 | + → æ ¸å¿ƒå“ç±»ä¸åŒï¼ˆç‰›ä»”裤 vs è¿åŠ¨è£¤ï¼‰ï¼Œé£Žæ ¼å’Œåœºæ™¯å·®å¼‚å¤§ã€‚ | |
| 238 | +- 查询:“é´åâ€ï¼Œå•†å“:“è¿åŠ¨éž‹â€ | |
| 239 | + → æ ¸å¿ƒå“ç±»ä¸åŒï¼ŒåŠŸèƒ½å’Œé€‚ç”¨åœºæ™¯å·®å¼‚å¤§ã€‚ | |
| 146 | 240 | |
| 147 | 241 | ## 判断原则 |
| 148 | 242 | |
| 149 | -1. 产å“ç±»åž‹æ˜¯æœ€é«˜ä¼˜å…ˆçº§å› ç´ ã€‚ | |
| 150 | - 如果查询明确指定了具体产å“类型,那么结果必须匹é…该产å“类型,æ‰å¯èƒ½åˆ¤ä¸ºâ€œå®Œå…¨ç›¸å…³â€æˆ–“部分相关â€ã€‚ | |
| 151 | - ä¸åŒäº§å“类型通常应判为“ä¸ç›¸å…³â€ï¼Œè€Œä¸æ˜¯â€œéƒ¨åˆ†ç›¸å…³â€ã€‚ | |
| 243 | +1. **商å“ç±»åž‹æ˜¯æœ€é«˜ä¼˜å…ˆçº§å› ç´ ã€‚** | |
| 244 | + 如果查询明确指定了具体商å“类型,那么结果必须匹é…该商å“类型,æ‰å¯èƒ½åˆ¤ä¸ºâ€œå®Œå…¨ç›¸å…³â€æˆ–“基本相关â€ã€‚ | |
| 245 | + ä¸åŒå•†å“ç±»åž‹é€šå¸¸åº”åˆ¤ä¸ºâ€œå¼±ç›¸å…³â€æˆ–“ä¸ç›¸å…³â€ã€‚ | |
| 246 | + | |
| 247 | + - **弱相关**:仅当两ç§å•†å“ç±»åž‹åœ¨é£Žæ ¼ã€åœºæ™¯ã€åŠŸèƒ½ä¸Šéžå¸¸æŽ¥è¿‘ï¼Œç”¨æˆ·æœ‰ä¸€å®šæ¦‚çŽ‡å°†å…¶è§†ä¸ºå‹‰å¼ºå¯æŽ¥å—çš„æ›¿ä»£å“æ—¶ä½¿ç”¨ã€‚ | |
| 248 | + - **ä¸ç›¸å…³**:其他所有商å“类型ä¸åŒ¹é…的情况。 | |
| 152 | 249 | |
| 153 | -2. 相似或相关的产å“类型,在查询明确时通常ä¸å¯äº’æ¢ã€‚ | |
| 250 | +2. **相似或相关的商å“类型,在查询明确时通常ä¸å¯ç›´æŽ¥äº’æ¢ï¼Œä½†è¦æ ¹æ®æŽ¥è¿‘程度区分“弱相关â€ä¸Žâ€œä¸ç›¸å…³â€ã€‚** | |
| 154 | 251 | 例如: |
| 155 | - - 连衣裙 vs åŠèº«è£™ vs 连体裤 | |
| 156 | - - 牛仔裤 vs 裤å | |
| 157 | - - Tæ¤ vs 衬衫/上衣 | |
| 158 | - - 开衫 vs 毛衣 | |
| 159 | - - é´å vs éž‹å | |
| 160 | - - 文胸 vs 上衣 | |
| 161 | - - åŒè‚©åŒ… vs 包 | |
| 162 | - 如果用户明确æœç´¢å…¶ä¸ä¸€ç§ï¼Œå…¶ä»–类型通常应判为“ä¸ç›¸å…³â€ã€‚ | |
| 163 | - | |
| 164 | -3. å½“æ ¸å¿ƒäº§å“类型匹é…åŽï¼Œå†è¯„估属性。 | |
| 165 | - - æ‰€æœ‰æ˜Žç¡®å±žæ€§éƒ½åŒ¹é… â†’ 完全相关 | |
| 166 | - - éƒ¨åˆ†å±žæ€§ç¼ºå¤±ã€æ— 法确认,或å˜åœ¨ä¸€å®šåå·®ï¼Œä½†ä»æ˜¯å¯æŽ¥å—æ›¿ä»£å“ â†’ 部分相关 | |
| 167 | - - 明确且é‡è¦çš„属性被明显è¿èƒŒï¼Œä¸”ä¸èƒ½ä½œä¸ºåˆç†æ›¿ä»£å“ → ä¸ç›¸å…³ | |
| 168 | - | |
| 169 | -4. è¦ä¸¥æ ¼åŒºåˆ†â€œæœªæåŠ/æ— æ³•ç¡®è®¤â€å’Œâ€œæ˜Žç¡®å†²çªâ€ã€‚ | |
| 170 | - - 如果æŸå±žæ€§æ²¡æœ‰æåŠï¼Œæˆ–æ— æ³•éªŒè¯ï¼Œä¼˜å…ˆåˆ¤ä¸ºâ€œéƒ¨åˆ†ç›¸å…³â€ã€‚ | |
| 171 | - - 如果æŸå±žæ€§ä¸ŽæŸ¥è¯¢è¦æ±‚明确相å,则判为“ä¸ç›¸å…³â€ï¼›é™¤éžåœ¨è´ç‰©è¯å¢ƒä¸‹å®ƒä»æ˜Žæ˜¾å±žäºŽå¯æŽ¥å—替代å“。 | |
| 252 | + - **é£Žæ ¼ / 场景高度接近,å¯åˆ¤ä¸ºå¼±ç›¸å…³**:连衣裙 vs åŠèº«è£™ã€é•¿è£™ vs ä¸é•¿è£™ã€ç‰›ä»”裤 vs 休闲裤ã€è¿åŠ¨éž‹ vs æ¿éž‹ã€‚ | |
| 253 | + - **é£Žæ ¼ / 场景差异大,应判为ä¸ç›¸å…³**:裤å vs éž‹åã€Tæ¤ vs 帽åã€é´å vs è¿åŠ¨éž‹ã€ç‰›ä»”裤 vs 西装裤ã€åŒè‚©åŒ… vs 手æåŒ…。 | |
| 254 | + | |
| 255 | +3. **å½“æ ¸å¿ƒå•†å“类型匹é…åŽï¼Œå†è¯„估属性。** | |
| 256 | + - æ‰€æœ‰æ˜Žç¡®å±žæ€§éƒ½åŒ¹é… â†’ **完全相关** | |
| 257 | + - éƒ¨åˆ†å±žæ€§ç¼ºå¤±ã€æœªæåŠã€æ— 法确认,或å˜åœ¨è½»å¾®åå·® → **基本相关** | |
| 258 | + - å˜åœ¨å¤šä¸ªå±žæ€§å差,或æŸä¸ªé‡è¦å±žæ€§è¢«æ˜Žæ˜¾è¿èƒŒï¼Œä½†å•†å“ä»ä¿ç•™ä¸€å®šæ›¿ä»£æ€§ → **弱相关** | |
| 259 | + - å˜åœ¨æ˜Žç¡®ä¸”é‡è¦çš„强冲çªï¼Œä¸”替代性æžä½Ž → **ä¸ç›¸å…³** | |
| 260 | + | |
| 261 | +4. **è¦ä¸¥æ ¼åŒºåˆ†â€œæœªæåŠ / æ— æ³•ç¡®è®¤â€â€œè½»å¾®åå·®â€â€œæ˜Žç¡®å†²çªâ€ã€‚** | |
| 262 | + - 如果æŸå±žæ€§æ²¡æœ‰æåŠï¼Œæˆ–æ— æ³•éªŒè¯ï¼Œä¼˜å…ˆåˆ¤ä¸ºâ€œåŸºæœ¬ç›¸å…³â€ã€‚ | |
| 263 | + - 如果æŸå±žæ€§å˜åœ¨è½»å¾®å差,如颜色ä¸åŒã€æè´¨ä¸åŒã€é•¿åº¦ç•¥æœ‰å·®å¼‚,通常判为“基本相关â€ã€‚ | |
| 264 | + - 如果æŸå±žæ€§ä¸ŽæŸ¥è¯¢è¦æ±‚明确相åï¼Œå¦‚æ— è¢– vs 长袖ã€ä¿®èº« vs 宽æ¾é˜”è…¿ï¼Œåˆ™è¦æ ¹æ®å†²çªä¸¥é‡æ€§ä¸Žæ›¿ä»£æ€§ï¼Œåœ¨â€œå¼±ç›¸å…³â€ä¸Žâ€œä¸ç›¸å…³â€ä¹‹é—´åˆ¤æ–。 | |
| 265 | + - 若该冲çªä¼šç›´æŽ¥ç ´å用户的主è¦è´ä¹°ç›®æ ‡ï¼Œé€šå¸¸åˆ¤ä¸ºâ€œä¸ç›¸å…³â€ã€‚ | |
| 266 | + | |
| 267 | +5. **“是å¦å¯æ›¿ä»£â€åº”从真实电商è´ç‰©æ„图出å‘判æ–。** | |
| 268 | + 䏿˜¯åªçœ‹å—é¢ç›¸ä¼¼ï¼Œè€Œè¦çœ‹ç”¨æˆ·åœ¨è´ç‰©åœºæ™¯ä¸‹æ˜¯å¦å¯èƒ½æŽ¥å—该商å“。 | |
| 269 | + - è‰¯å¥½æ›¿ä»£å“ â†’ **基本相关** | |
| 270 | + - å‹‰å¼ºæ›¿ä»£å“ â†’ **弱相关** | |
| 271 | + - å‡ ä¹Žä¸å¯æ›¿ä»£ → **ä¸ç›¸å…³** | |
| 272 | + | |
| 273 | +6. **若商å“ä¿¡æ¯ä¸è¶³ï¼Œä¸è¦æŠŠâ€œæ— 法确认â€è¯¯åˆ¤ä¸ºâ€œå†²çªâ€ã€‚** | |
| 274 | + 商哿œªå†™æ˜ŽæŸå±žæ€§ï¼Œä¸ç‰äºŽè¯¥å±žæ€§ä¸€å®šä¸ç¬¦åˆã€‚ | |
| 275 | + å› æ¤ï¼š | |
| 276 | + - 未æåŠ / æ— æ³•ç¡®è®¤ï¼Œä¼˜å…ˆæŒ‰â€œåŸºæœ¬ç›¸å…³â€å¤„ç†ï¼› | |
| 277 | + - åªæœ‰å½“商å“ä¿¡æ¯æ˜Žç¡®æ˜¾ç¤ºä¸ŽæŸ¥è¯¢è¦æ±‚ç›¸åæ—¶ï¼Œæ‰è§†ä¸ºå±žæ€§å†²çªã€‚ | |
| 172 | 278 | |
| 173 | 279 | 查询:{query} |
| 174 | 280 | |
| ... | ... | @@ -176,9 +282,10 @@ _CLASSIFY_BATCH_SIMPLE_TEMPLATE__zh = _CLASSIFY_BATCH_SIMPLE_TEMPLATE_ZH = """ä½ |
| 176 | 282 | {lines} |
| 177 | 283 | |
| 178 | 284 | ## 输出格式 |
| 179 | -严格输出 {n} 行,每行只能是以下三者之一: | |
| 285 | +严格输出 {n} 行,每行只能是以下四者之一: | |
| 180 | 286 | 完全相关 |
| 181 | -部分相关 | |
| 287 | +基本相关 | |
| 288 | +弱相关 | |
| 182 | 289 | 不相关 |
| 183 | 290 | |
| 184 | 291 | 输出行必须与上方商å“顺åºä¸€ä¸€å¯¹åº”。 |
| ... | ... | @@ -186,77 +293,7 @@ _CLASSIFY_BATCH_SIMPLE_TEMPLATE__zh = _CLASSIFY_BATCH_SIMPLE_TEMPLATE_ZH = """ä½ |
| 186 | 293 | """ |
| 187 | 294 | |
| 188 | 295 | |
| 189 | - | |
| 190 | -def classify_batch_simple_prompt(query: str, numbered_doc_lines: Sequence[str]) -> str: | |
| 296 | +def classify_prompt(query: str, numbered_doc_lines: Sequence[str]) -> str: | |
| 191 | 297 | lines = "\n".join(numbered_doc_lines) |
| 192 | 298 | n = len(numbered_doc_lines) |
| 193 | - return _CLASSIFY_BATCH_SIMPLE_TEMPLATE.format(query=query, lines=lines, n=n) | |
| 194 | - | |
| 195 | - | |
| 196 | -_EXTRACT_QUERY_PROFILE_TEMPLATE = """You are building a structured intent profile for e-commerce relevance judging. | |
| 197 | -Use the original user query as the source of truth. Parser hints may help, but if a hint conflicts with the original query, trust the original query. | |
| 198 | -Be conservative: only mark an attribute as required if the user explicitly asked for it. | |
| 199 | - | |
| 200 | -Return JSON with this schema: | |
| 201 | -{{ | |
| 202 | - "normalized_query_en": string, | |
| 203 | - "primary_category": string, | |
| 204 | - "allowed_categories": [string], | |
| 205 | - "required_attributes": [ | |
| 206 | - {{"name": string, "required_terms": [string], "conflicting_terms": [string], "match_mode": "explicit"}} | |
| 207 | - ], | |
| 208 | - "notes": [string] | |
| 209 | -}} | |
| 210 | - | |
| 211 | -Guidelines: | |
| 212 | -- Exact later will require explicit evidence for all required attributes. | |
| 213 | -- allowed_categories should contain only near-synonyms of the same product type, not substitutes. For example dress can allow midi dress/cocktail dress, but not skirt, top, jumpsuit, or outfit unless the query explicitly asks for them. | |
| 214 | -- If the query asks for dress/skirt/jeans/t-shirt, near but different product types are not Exact. | |
| 215 | -- If the query includes color, fit, silhouette, or length, include them as required_attributes. | |
| 216 | -- For fit words, include conflicting terms when obvious, e.g. fitted conflicts with oversized/loose; oversized conflicts with fitted/tight. | |
| 217 | -- For color, include conflicting colors only when clear from the query. | |
| 218 | - | |
| 219 | -Original query: {query} | |
| 220 | -Parser hints JSON: {hints_json} | |
| 221 | -""" | |
| 222 | - | |
| 223 | - | |
| 224 | -def extract_query_profile_prompt(query: str, parser_hints: Dict[str, Any]) -> str: | |
| 225 | - hints_json = json.dumps(parser_hints, ensure_ascii=False) | |
| 226 | - return _EXTRACT_QUERY_PROFILE_TEMPLATE.format(query=query, hints_json=hints_json) | |
| 227 | - | |
| 228 | - | |
| 229 | -_CLASSIFY_BATCH_COMPLEX_TEMPLATE = """You are an e-commerce search relevance judge. | |
| 230 | -Judge each product against the structured query profile below. | |
| 231 | - | |
| 232 | -Relevance rules: | |
| 233 | -- Exact: product type matches the target intent, and every explicit required attribute is positively supported by the title/options/tags/category. If an attribute is missing or only guessed, it is NOT Exact. | |
| 234 | -- Partial: main product type/use case matches, but some required attribute is missing, weaker, uncertain, or only approximately matched. | |
| 235 | -- Irrelevant: product type/use case mismatched, or an explicit required attribute clearly conflicts. | |
| 236 | -- Be conservative with Exact. | |
| 237 | -- Graphic/holiday/message tees are not Exact for a plain color/style tee query unless that graphic/theme was requested. | |
| 238 | -- Jumpsuit/romper/set is not Exact for dress/skirt/jeans queries. | |
| 239 | - | |
| 240 | -Original query: {query} | |
| 241 | -Structured query profile JSON: {profile_json} | |
| 242 | - | |
| 243 | -Products: | |
| 244 | -{lines} | |
| 245 | - | |
| 246 | -Return JSON only, with schema: | |
| 247 | -{{"labels":[{{"index":1,"label":"Exact","reason":"short phrase"}}]}} | |
| 248 | -""" | |
| 249 | - | |
| 250 | - | |
| 251 | -def classify_batch_complex_prompt( | |
| 252 | - query: str, | |
| 253 | - query_profile: Dict[str, Any], | |
| 254 | - numbered_doc_lines: Sequence[str], | |
| 255 | -) -> str: | |
| 256 | - lines = "\n".join(numbered_doc_lines) | |
| 257 | - profile_json = json.dumps(query_profile, ensure_ascii=False) | |
| 258 | - return _CLASSIFY_BATCH_COMPLEX_TEMPLATE.format( | |
| 259 | - query=query, | |
| 260 | - profile_json=profile_json, | |
| 261 | - lines=lines, | |
| 262 | - ) | |
| 299 | + return _CLASSIFY_TEMPLATE_EN.format(query=query, lines=lines, n=n) | ... | ... |
scripts/evaluation/eval_framework/reports.py
| ... | ... | @@ -4,7 +4,7 @@ from __future__ import annotations |
| 4 | 4 | |
| 5 | 5 | from typing import Any, Dict |
| 6 | 6 | |
| 7 | -from .constants import RELEVANCE_EXACT, RELEVANCE_IRRELEVANT, RELEVANCE_PARTIAL | |
| 7 | +from .constants import RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_IRRELEVANT, RELEVANCE_LOW | |
| 8 | 8 | |
| 9 | 9 | |
| 10 | 10 | def render_batch_report_markdown(payload: Dict[str, Any]) -> str: |
| ... | ... | @@ -29,8 +29,9 @@ def render_batch_report_markdown(payload: Dict[str, Any]) -> str: |
| 29 | 29 | "", |
| 30 | 30 | "## Label Distribution", |
| 31 | 31 | "", |
| 32 | - f"- Exact: {distribution.get(RELEVANCE_EXACT, 0)}", | |
| 33 | - f"- Partial: {distribution.get(RELEVANCE_PARTIAL, 0)}", | |
| 32 | + f"- Exact Match: {distribution.get(RELEVANCE_EXACT, 0)}", | |
| 33 | + f"- High Relevant: {distribution.get(RELEVANCE_HIGH, 0)}", | |
| 34 | + f"- Low Relevant: {distribution.get(RELEVANCE_LOW, 0)}", | |
| 34 | 35 | f"- Irrelevant: {distribution.get(RELEVANCE_IRRELEVANT, 0)}", |
| 35 | 36 | ] |
| 36 | 37 | ) |
| ... | ... | @@ -41,8 +42,9 @@ def render_batch_report_markdown(payload: Dict[str, Any]) -> str: |
| 41 | 42 | for key, value in sorted((item.get("metrics") or {}).items()): |
| 42 | 43 | lines.append(f"- {key}: {value}") |
| 43 | 44 | distribution = item.get("distribution") or {} |
| 44 | - lines.append(f"- Exact: {distribution.get(RELEVANCE_EXACT, 0)}") | |
| 45 | - lines.append(f"- Partial: {distribution.get(RELEVANCE_PARTIAL, 0)}") | |
| 45 | + lines.append(f"- Exact Match: {distribution.get(RELEVANCE_EXACT, 0)}") | |
| 46 | + lines.append(f"- High Relevant: {distribution.get(RELEVANCE_HIGH, 0)}") | |
| 47 | + lines.append(f"- Low Relevant: {distribution.get(RELEVANCE_LOW, 0)}") | |
| 46 | 48 | lines.append(f"- Irrelevant: {distribution.get(RELEVANCE_IRRELEVANT, 0)}") |
| 47 | 49 | lines.append("") |
| 48 | 50 | return "\n".join(lines) | ... | ... |
scripts/evaluation/eval_framework/static/eval_web.css
| ... | ... | @@ -35,12 +35,14 @@ |
| 35 | 35 | .results { display: grid; gap: 10px; } |
| 36 | 36 | .result { display: grid; grid-template-columns: 110px 100px 1fr; gap: 14px; align-items: center; background: var(--panel); border: 1px solid var(--line); border-radius: 18px; padding: 12px; } |
| 37 | 37 | .badge { display: inline-block; padding: 8px 10px; border-radius: 999px; color: white; font-weight: 700; text-align: center; } |
| 38 | - .Exact { background: var(--exact); } | |
| 39 | - .Partial { background: var(--partial); } | |
| 40 | - .Irrelevant { background: var(--irrelevant); } | |
| 41 | - .Unknown { background: #637381; } | |
| 38 | + .label-exact-match { background: var(--exact); } | |
| 39 | + .label-high-relevant { background: var(--partial); } | |
| 40 | + .label-low-relevant { background: #6b5b95; } | |
| 41 | + .label-irrelevant { background: var(--irrelevant); } | |
| 42 | + .badge-unknown { background: #637381; } | |
| 42 | 43 | .thumb { width: 100px; height: 100px; object-fit: cover; border-radius: 14px; background: #e7e1d4; } |
| 43 | - .title { font-size: 16px; font-weight: 700; margin-bottom: 8px; } | |
| 44 | + .title { font-size: 16px; font-weight: 700; margin-bottom: 4px; } | |
| 45 | + .title-zh { font-size: 14px; font-weight: 500; color: var(--muted); margin-bottom: 8px; line-height: 1.4; } | |
| 44 | 46 | .options { color: var(--muted); line-height: 1.5; font-size: 14px; } |
| 45 | 47 | .section { margin-bottom: 28px; } |
| 46 | 48 | .history { font-size: 13px; line-height: 1.5; } | ... | ... |
scripts/evaluation/eval_framework/static/eval_web.js
| ... | ... | @@ -13,6 +13,10 @@ |
| 13 | 13 | root.appendChild(card); |
| 14 | 14 | }); |
| 15 | 15 | } |
| 16 | + function labelBadgeClass(label) { | |
| 17 | + if (!label || label === 'Unknown') return 'badge-unknown'; | |
| 18 | + return 'label-' + String(label).toLowerCase().replace(/\s+/g, '-'); | |
| 19 | + } | |
| 16 | 20 | function renderResults(results, rootId='results', showRank=true) { |
| 17 | 21 | const mount = document.getElementById(rootId); |
| 18 | 22 | mount.innerHTML = ''; |
| ... | ... | @@ -21,10 +25,11 @@ |
| 21 | 25 | const box = document.createElement('div'); |
| 22 | 26 | box.className = 'result'; |
| 23 | 27 | box.innerHTML = ` |
| 24 | - <div><span class="badge ${label}">${label}</span><div class="muted" style="margin-top:8px">${showRank ? `#${item.rank || '-'}` : (item.rerank_score != null ? `rerank=${item.rerank_score.toFixed ? item.rerank_score.toFixed(4) : item.rerank_score}` : 'not recalled')}</div></div> | |
| 28 | + <div><span class="badge ${labelBadgeClass(label)}">${label}</span><div class="muted" style="margin-top:8px">${showRank ? `#${item.rank || '-'}` : (item.rerank_score != null ? `rerank=${item.rerank_score.toFixed ? item.rerank_score.toFixed(4) : item.rerank_score}` : 'not recalled')}</div></div> | |
| 25 | 29 | <img class="thumb" src="${item.image_url || ''}" alt="" /> |
| 26 | 30 | <div> |
| 27 | 31 | <div class="title">${item.title || ''}</div> |
| 32 | + ${item.title_zh ? `<div class="title-zh">${item.title_zh}</div>` : ''} | |
| 28 | 33 | <div class="options"> |
| 29 | 34 | <div>${(item.option_values || [])[0] || ''}</div> |
| 30 | 35 | <div>${(item.option_values || [])[1] || ''}</div> |
| ... | ... | @@ -41,7 +46,7 @@ |
| 41 | 46 | const root = document.getElementById('tips'); |
| 42 | 47 | const tips = [...(data.tips || [])]; |
| 43 | 48 | const stats = data.label_stats || {}; |
| 44 | - tips.unshift(`Cached labels for query: ${stats.total || 0}. Recalled hits: ${stats.recalled_hits || 0}. Missed Exact: ${stats.missing_exact_count || 0}. Missed Partial: ${stats.missing_partial_count || 0}.`); | |
| 49 | + tips.unshift(`Cached labels for query: ${stats.total || 0}. Recalled hits: ${stats.recalled_hits || 0}. Missed (non-irrelevant): ${stats.missing_relevant_count || 0} — Exact: ${stats.missing_exact_count || 0}, High: ${stats.missing_high_count || 0}, Low: ${stats.missing_low_count || 0}.`); | |
| 45 | 50 | root.innerHTML = tips.map(text => `<div class="tip">${text}</div>`).join(''); |
| 46 | 51 | } |
| 47 | 52 | async function loadQueries() { | ... | ... |
scripts/evaluation/eval_framework/static/index.html
| ... | ... | @@ -37,7 +37,7 @@ |
| 37 | 37 | <div id="results" class="results"></div> |
| 38 | 38 | </section> |
| 39 | 39 | <section class="section"> |
| 40 | - <h2>Missed Exact / Partial</h2> | |
| 40 | + <h2>Missed non-irrelevant (cached)</h2> | |
| 41 | 41 | <div id="missingRelevant" class="results"></div> |
| 42 | 42 | </section> |
| 43 | 43 | <section class="section"> | ... | ... |
scripts/evaluation/eval_framework/store.py
| ... | ... | @@ -8,7 +8,7 @@ from dataclasses import dataclass |
| 8 | 8 | from pathlib import Path |
| 9 | 9 | from typing import Any, Dict, List, Optional, Sequence |
| 10 | 10 | |
| 11 | -from .constants import VALID_LABELS | |
| 11 | +from .constants import VALID_LABELS, normalize_stored_label | |
| 12 | 12 | from .utils import ensure_dir, safe_json_dumps, utc_now_iso |
| 13 | 13 | |
| 14 | 14 | |
| ... | ... | @@ -220,7 +220,7 @@ class EvalStore: |
| 220 | 220 | """, |
| 221 | 221 | (tenant_id, query_text), |
| 222 | 222 | ).fetchall() |
| 223 | - return {str(row["spu_id"]): str(row["label"]) for row in rows} | |
| 223 | + return {str(row["spu_id"]): normalize_stored_label(str(row["label"])) for row in rows} | |
| 224 | 224 | |
| 225 | 225 | def upsert_labels( |
| 226 | 226 | self, |
| ... | ... | @@ -379,8 +379,9 @@ class EvalStore: |
| 379 | 379 | SELECT |
| 380 | 380 | query_text, |
| 381 | 381 | COUNT(*) AS total, |
| 382 | - SUM(CASE WHEN label='Exact' THEN 1 ELSE 0 END) AS exact_count, | |
| 383 | - SUM(CASE WHEN label='Partial' THEN 1 ELSE 0 END) AS partial_count, | |
| 382 | + SUM(CASE WHEN label IN ('Exact Match','Exact') THEN 1 ELSE 0 END) AS exact_count, | |
| 383 | + SUM(CASE WHEN label IN ('High Relevant','Partial') THEN 1 ELSE 0 END) AS high_relevant_count, | |
| 384 | + SUM(CASE WHEN label='Low Relevant' THEN 1 ELSE 0 END) AS low_relevant_count, | |
| 384 | 385 | SUM(CASE WHEN label='Irrelevant' THEN 1 ELSE 0 END) AS irrelevant_count, |
| 385 | 386 | MAX(updated_at) AS updated_at |
| 386 | 387 | FROM relevance_labels |
| ... | ... | @@ -395,7 +396,8 @@ class EvalStore: |
| 395 | 396 | "query": str(row["query_text"]), |
| 396 | 397 | "total": int(row["total"]), |
| 397 | 398 | "exact_count": int(row["exact_count"] or 0), |
| 398 | - "partial_count": int(row["partial_count"] or 0), | |
| 399 | + "high_relevant_count": int(row["high_relevant_count"] or 0), | |
| 400 | + "low_relevant_count": int(row["low_relevant_count"] or 0), | |
| 399 | 401 | "irrelevant_count": int(row["irrelevant_count"] or 0), |
| 400 | 402 | "updated_at": row["updated_at"], |
| 401 | 403 | } |
| ... | ... | @@ -407,8 +409,9 @@ class EvalStore: |
| 407 | 409 | """ |
| 408 | 410 | SELECT |
| 409 | 411 | COUNT(*) AS total, |
| 410 | - SUM(CASE WHEN label='Exact' THEN 1 ELSE 0 END) AS exact_count, | |
| 411 | - SUM(CASE WHEN label='Partial' THEN 1 ELSE 0 END) AS partial_count, | |
| 412 | + SUM(CASE WHEN label IN ('Exact Match','Exact') THEN 1 ELSE 0 END) AS exact_count, | |
| 413 | + SUM(CASE WHEN label IN ('High Relevant','Partial') THEN 1 ELSE 0 END) AS high_relevant_count, | |
| 414 | + SUM(CASE WHEN label='Low Relevant' THEN 1 ELSE 0 END) AS low_relevant_count, | |
| 412 | 415 | SUM(CASE WHEN label='Irrelevant' THEN 1 ELSE 0 END) AS irrelevant_count, |
| 413 | 416 | MAX(updated_at) AS updated_at |
| 414 | 417 | FROM relevance_labels |
| ... | ... | @@ -420,7 +423,8 @@ class EvalStore: |
| 420 | 423 | "query": query_text, |
| 421 | 424 | "total": int((row["total"] or 0) if row else 0), |
| 422 | 425 | "exact_count": int((row["exact_count"] or 0) if row else 0), |
| 423 | - "partial_count": int((row["partial_count"] or 0) if row else 0), | |
| 426 | + "high_relevant_count": int((row["high_relevant_count"] or 0) if row else 0), | |
| 427 | + "low_relevant_count": int((row["low_relevant_count"] or 0) if row else 0), | |
| 424 | 428 | "irrelevant_count": int((row["irrelevant_count"] or 0) if row else 0), |
| 425 | 429 | "updated_at": row["updated_at"] if row else None, |
| 426 | 430 | } | ... | ... |
scripts/evaluation/eval_framework/utils.py
| ... | ... | @@ -42,6 +42,14 @@ def pick_text(value: Any, preferred_lang: str = "en") -> str: |
| 42 | 42 | return str(value).strip() |
| 43 | 43 | |
| 44 | 44 | |
| 45 | +def zh_title_from_multilingual(title_multilingual: Any) -> str: | |
| 46 | + """Chinese title string from API debug ``title_multilingual`` (ES-style dict).""" | |
| 47 | + if not isinstance(title_multilingual, dict): | |
| 48 | + return "" | |
| 49 | + zh = str(title_multilingual.get("zh") or "").strip() | |
| 50 | + return zh | |
| 51 | + | |
| 52 | + | |
| 45 | 53 | def safe_json_dumps(data: Any) -> str: |
| 46 | 54 | return json.dumps(data, ensure_ascii=False, separators=(",", ":")) |
| 47 | 55 | ... | ... |
scripts/evaluation/quick_start_eval.sh renamed to scripts/evaluation/start_eval.sh
| ... | ... | @@ -10,7 +10,7 @@ QUERIES="${REPO_EVAL_QUERIES:-scripts/evaluation/queries/queries.txt}" |
| 10 | 10 | |
| 11 | 11 | usage() { |
| 12 | 12 | echo "Usage: $0 batch|batch-rebuild|serve" |
| 13 | - echo " batch — batch eval: live search every query, LLM only for missing labels (top_k=50, simple)" | |
| 13 | + echo " batch — batch eval: live search every query, LLM only for missing labels (top_k=50)" | |
| 14 | 14 | echo " batch-rebuild — deep rebuild: build --force-refresh-labels (search recall pool + full-corpus rerank + batched LLM; expensive)" |
| 15 | 15 | echo " serve — eval UI (default http://0.0.0.0:\${EVAL_WEB_PORT:-6010}/; also: ./scripts/start_eval_web.sh)" |
| 16 | 16 | echo "Env: TENANT_ID (default 163), REPO_EVAL_QUERIES, EVAL_WEB_HOST, EVAL_WEB_PORT (default 6010)" |
| ... | ... | @@ -22,8 +22,7 @@ case "${1:-}" in |
| 22 | 22 | --tenant-id "$TENANT_ID" \ |
| 23 | 23 | --queries-file "$QUERIES" \ |
| 24 | 24 | --top-k 50 \ |
| 25 | - --language en \ | |
| 26 | - --labeler-mode simple | |
| 25 | + --language en | |
| 27 | 26 | ;; |
| 28 | 27 | batch-rebuild) |
| 29 | 28 | exec "$PY" scripts/evaluation/build_annotation_set.py build \ |
| ... | ... | @@ -33,8 +32,7 @@ case "${1:-}" in |
| 33 | 32 | --rerank-depth 10000 \ |
| 34 | 33 | --force-refresh-rerank \ |
| 35 | 34 | --force-refresh-labels \ |
| 36 | - --language en \ | |
| 37 | - --labeler-mode simple | |
| 35 | + --language en | |
| 38 | 36 | ;; |
| 39 | 37 | serve) |
| 40 | 38 | EVAL_WEB_PORT="${EVAL_WEB_PORT:-6010}" | ... | ... |
tests/test_product_enrich_partial_mode.py
| ... | ... | @@ -322,6 +322,109 @@ def test_analyze_products_uses_product_level_cache_across_batch_requests(): |
| 322 | 322 | assert third[1]["anchor_text"] == "anchor:shirt" |
| 323 | 323 | |
| 324 | 324 | |
| 325 | +def test_analyze_products_reuses_cached_content_with_current_product_identity(): | |
| 326 | + cached_result = { | |
| 327 | + "id": "1165", | |
| 328 | + "lang": "zh", | |
| 329 | + "title_input": "old-title", | |
| 330 | + "title": "法式连衣裙", | |
| 331 | + "category_path": "女装>连衣裙", | |
| 332 | + "enriched_tags": "法式,收腰", | |
| 333 | + "target_audience": "年轻女性", | |
| 334 | + "usage_scene": "通勤,约会", | |
| 335 | + "season": "春季,夏季", | |
| 336 | + "key_attributes": "中长款", | |
| 337 | + "material": "聚酯纤维", | |
| 338 | + "features": "透气", | |
| 339 | + "anchor_text": "法式收腰连衣裙", | |
| 340 | + } | |
| 341 | + products = [{"id": "69960", "title": "dress"}] | |
| 342 | + | |
| 343 | + with mock.patch.object(product_enrich, "API_KEY", "fake-key"), mock.patch.object( | |
| 344 | + product_enrich, | |
| 345 | + "_get_cached_anchor_result", | |
| 346 | + wraps=lambda product, target_lang: product_enrich._normalize_analysis_result( | |
| 347 | + cached_result, | |
| 348 | + product=product, | |
| 349 | + target_lang=target_lang, | |
| 350 | + ), | |
| 351 | + ), mock.patch.object( | |
| 352 | + product_enrich, | |
| 353 | + "process_batch", | |
| 354 | + side_effect=AssertionError("process_batch should not be called on cache hit"), | |
| 355 | + ): | |
| 356 | + result = product_enrich.analyze_products( | |
| 357 | + products, | |
| 358 | + target_lang="zh", | |
| 359 | + tenant_id="170", | |
| 360 | + ) | |
| 361 | + | |
| 362 | + assert result == [ | |
| 363 | + { | |
| 364 | + "id": "69960", | |
| 365 | + "lang": "zh", | |
| 366 | + "title_input": "dress", | |
| 367 | + "title": "法式连衣裙", | |
| 368 | + "category_path": "女装>连衣裙", | |
| 369 | + "tags": "法式,收腰", | |
| 370 | + "target_audience": "年轻女性", | |
| 371 | + "usage_scene": "通勤,约会", | |
| 372 | + "season": "春季,夏季", | |
| 373 | + "key_attributes": "中长款", | |
| 374 | + "material": "聚酯纤维", | |
| 375 | + "features": "透气", | |
| 376 | + "anchor_text": "法式收腰连衣裙", | |
| 377 | + } | |
| 378 | + ] | |
| 379 | + | |
| 380 | + | |
| 381 | +def test_build_index_content_fields_maps_internal_tags_to_enriched_tags_output(): | |
| 382 | + def fake_analyze_products(products, target_lang="zh", batch_size=None, tenant_id=None): | |
| 383 | + return [ | |
| 384 | + { | |
| 385 | + "id": products[0]["id"], | |
| 386 | + "lang": target_lang, | |
| 387 | + "title_input": products[0]["title"], | |
| 388 | + "title": products[0]["title"], | |
| 389 | + "category_path": "玩具>滑行玩具", | |
| 390 | + "tags": f"{target_lang}-tag1,{target_lang}-tag2", | |
| 391 | + "target_audience": f"{target_lang}-audience", | |
| 392 | + "usage_scene": "", | |
| 393 | + "season": "", | |
| 394 | + "key_attributes": "", | |
| 395 | + "material": "", | |
| 396 | + "features": "", | |
| 397 | + "anchor_text": f"{target_lang}-anchor", | |
| 398 | + } | |
| 399 | + ] | |
| 400 | + | |
| 401 | + with mock.patch.object( | |
| 402 | + product_enrich, | |
| 403 | + "analyze_products", | |
| 404 | + side_effect=fake_analyze_products, | |
| 405 | + ): | |
| 406 | + result = product_enrich.build_index_content_fields( | |
| 407 | + items=[{"spu_id": "69960", "title": "dress"}], | |
| 408 | + tenant_id="170", | |
| 409 | + ) | |
| 410 | + | |
| 411 | + assert result == [ | |
| 412 | + { | |
| 413 | + "id": "69960", | |
| 414 | + "qanchors": {"zh": ["zh-anchor"], "en": ["en-anchor"]}, | |
| 415 | + "enriched_tags": {"zh": ["zh-tag1", "zh-tag2"], "en": ["en-tag1", "en-tag2"]}, | |
| 416 | + "enriched_attributes": [ | |
| 417 | + {"name": "enriched_tags", "value": {"zh": "zh-tag1"}}, | |
| 418 | + {"name": "enriched_tags", "value": {"zh": "zh-tag2"}}, | |
| 419 | + {"name": "target_audience", "value": {"zh": "zh-audience"}}, | |
| 420 | + {"name": "enriched_tags", "value": {"en": "en-tag1"}}, | |
| 421 | + {"name": "enriched_tags", "value": {"en": "en-tag2"}}, | |
| 422 | + {"name": "target_audience", "value": {"en": "en-audience"}}, | |
| 423 | + ], | |
| 424 | + } | |
| 425 | + ] | |
| 426 | + | |
| 427 | + | |
| 325 | 428 | def test_anchor_cache_key_depends_on_product_input_not_identifiers(): |
| 326 | 429 | product_a = { |
| 327 | 430 | "id": "1", | ... | ... |