From bdb65283908f21ee69eadaf078075e4f9140a332 Mon Sep 17 00:00:00 2001
From: tangwang
Date: Wed, 1 Apr 2026 09:34:12 +0800
Subject: [PATCH] Annotation framework: batch labeling

---
 scripts/evaluation/eval_framework/cli.py       |  53 +++++++-
 scripts/evaluation/eval_framework/clients.py   | 161 ++++++++++++++++---
 scripts/evaluation/eval_framework/constants.py |   6 +
 scripts/evaluation/eval_framework/framework.py |  22 ++-
 scripts/evaluation/eval_framework/prompts.py   |  89 ++++++-----
 5 files changed, 277 insertions(+), 54 deletions(-)

diff --git a/scripts/evaluation/eval_framework/cli.py b/scripts/evaluation/eval_framework/cli.py
index dfdbbd7..6421a0a 100644
--- a/scripts/evaluation/eval_framework/cli.py
+++ b/scripts/evaluation/eval_framework/cli.py
@@ -5,6 +5,7 @@ from __future__ import annotations
 import argparse
 import json
 from pathlib import Path
+from typing import Any, Dict
 
 from .constants import (
     DEFAULT_LABELER_MODE,
@@ -23,6 +24,38 @@ from .utils import ensure_dir, utc_now_iso, utc_timestamp
 from .web_app import create_web_app
 
 
+def add_judge_llm_args(p: argparse.ArgumentParser) -> None:
+    p.add_argument(
+        "--judge-model",
+        default=None,
+        metavar="MODEL",
+        help="Judge LLM model (default: eval_framework.constants.DEFAULT_JUDGE_MODEL).",
+    )
+    p.add_argument(
+        "--enable-thinking",
+        action=argparse.BooleanOptionalAction,
+        default=None,
+        help="enable_thinking for DashScope (default: DEFAULT_JUDGE_ENABLE_THINKING).",
+    )
+    p.add_argument(
+        "--dashscope-batch",
+        action=argparse.BooleanOptionalAction,
+        default=None,
+        help="DashScope Batch File API vs sync chat (default: DEFAULT_JUDGE_DASHSCOPE_BATCH).",
+    )
+
+
+def framework_kwargs_from_args(args: argparse.Namespace) -> Dict[str, Any]:
+    kw: Dict[str, Any] = {}
+    if args.judge_model is not None:
+        kw["judge_model"] = args.judge_model
+    if args.enable_thinking is not None:
+        kw["enable_thinking"] = args.enable_thinking
+    if args.dashscope_batch is not None:
+        kw["use_dashscope_batch"] = args.dashscope_batch
+    return kw
+
+
 def build_cli_parser() -> argparse.ArgumentParser:
     parser = argparse.ArgumentParser(description="Search evaluation annotation builder and web UI")
     sub = parser.add_subparsers(dest="command", required=True)
@@ -71,6 +104,7 @@ def build_cli_parser() -> argparse.ArgumentParser:
     build.add_argument("--force-refresh-rerank", action="store_true")
     build.add_argument("--force-refresh-labels", action="store_true")
     build.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"])
+    add_judge_llm_args(build)
 
     batch = sub.add_parser("batch", help="Run batch evaluation against live search")
     batch.add_argument("--tenant-id", default="163")
@@ -79,6 +113,7 @@ def build_cli_parser() -> argparse.ArgumentParser:
     batch.add_argument("--language", default="en")
     batch.add_argument("--force-refresh-labels", action="store_true")
     batch.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"])
+    add_judge_llm_args(batch)
 
     audit = sub.add_parser("audit", help="Audit annotation quality for queries")
     audit.add_argument("--tenant-id", default="163")
@@ -88,6 +123,7 @@ def build_cli_parser() -> argparse.ArgumentParser:
     audit.add_argument("--limit-suspicious", type=int, default=5)
     audit.add_argument("--force-refresh-labels", action="store_true")
     audit.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"])
+    add_judge_llm_args(audit)
 
     serve = sub.add_parser("serve", help="Serve evaluation web UI on port 6010")
     serve.add_argument("--tenant-id", default="163")
@@ -95,12 +131,15 @@ def build_cli_parser() -> argparse.ArgumentParser:
     serve.add_argument("--host", default="0.0.0.0")
     serve.add_argument("--port", type=int, default=6010)
     serve.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"])
+    add_judge_llm_args(serve)
 
     return parser
 
 
 def run_build(args: argparse.Namespace) -> None:
-    framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode)
+    framework = SearchEvaluationFramework(
+        tenant_id=args.tenant_id, labeler_mode=args.labeler_mode, **framework_kwargs_from_args(args)
+    )
     queries = framework.queries_from_file(Path(args.queries_file))
     summary = []
     rebuild_kwargs = {}
@@ -152,7 +191,9 @@ def run_build(args: argparse.Namespace) -> None:
 
 
 def run_batch(args: argparse.Namespace) -> None:
-    framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode)
+    framework = SearchEvaluationFramework(
+        tenant_id=args.tenant_id, labeler_mode=args.labeler_mode, **framework_kwargs_from_args(args)
+    )
     queries = framework.queries_from_file(Path(args.queries_file))
     payload = framework.batch_evaluate(
         queries=queries,
@@ -165,7 +206,9 @@ def run_batch(args: argparse.Namespace) -> None:
 
 
 def run_audit(args: argparse.Namespace) -> None:
-    framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode)
+    framework = SearchEvaluationFramework(
+        tenant_id=args.tenant_id, labeler_mode=args.labeler_mode, **framework_kwargs_from_args(args)
+    )
     queries = framework.queries_from_file(Path(args.queries_file))
     audit_items = []
     for query in queries:
@@ -215,7 +258,9 @@ def run_audit(args: argparse.Namespace) -> None:
 
 
 def run_serve(args: argparse.Namespace) -> None:
-    framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode)
+    framework = SearchEvaluationFramework(
+        tenant_id=args.tenant_id, labeler_mode=args.labeler_mode, **framework_kwargs_from_args(args)
+    )
     app = create_web_app(framework, Path(args.queries_file))
 
     import uvicorn
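
Note (illustrative, not part of the patch): how the tri-state judge flags flow into
the framework, assuming the package imports as eval_framework and a queries.txt
exists. argparse.BooleanOptionalAction also generates --no-enable-thinking and
--no-dashscope-batch; anything left unset stays None and later falls back to the
constants.

    from eval_framework.cli import build_cli_parser, framework_kwargs_from_args

    args = build_cli_parser().parse_args(
        ["batch", "--queries-file", "queries.txt",
         "--judge-model", "qwen3.5-flash", "--no-dashscope-batch"]
    )
    # Only explicitly set options are forwarded; enable_thinking stays None here
    # and is resolved inside SearchEvaluationFramework.
    assert framework_kwargs_from_args(args) == {
        "judge_model": "qwen3.5-flash",
        "use_dashscope_batch": False,
    }
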
action="store_true") audit.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"]) + add_judge_llm_args(audit) serve = sub.add_parser("serve", help="Serve evaluation web UI on port 6010") serve.add_argument("--tenant-id", default="163") @@ -95,12 +131,15 @@ def build_cli_parser() -> argparse.ArgumentParser: serve.add_argument("--host", default="0.0.0.0") serve.add_argument("--port", type=int, default=6010) serve.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"]) + add_judge_llm_args(serve) return parser def run_build(args: argparse.Namespace) -> None: - framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode) + framework = SearchEvaluationFramework( + tenant_id=args.tenant_id, labeler_mode=args.labeler_mode, **framework_kwargs_from_args(args) + ) queries = framework.queries_from_file(Path(args.queries_file)) summary = [] rebuild_kwargs = {} @@ -152,7 +191,9 @@ def run_build(args: argparse.Namespace) -> None: def run_batch(args: argparse.Namespace) -> None: - framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode) + framework = SearchEvaluationFramework( + tenant_id=args.tenant_id, labeler_mode=args.labeler_mode, **framework_kwargs_from_args(args) + ) queries = framework.queries_from_file(Path(args.queries_file)) payload = framework.batch_evaluate( queries=queries, @@ -165,7 +206,9 @@ def run_batch(args: argparse.Namespace) -> None: def run_audit(args: argparse.Namespace) -> None: - framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode) + framework = SearchEvaluationFramework( + tenant_id=args.tenant_id, labeler_mode=args.labeler_mode, **framework_kwargs_from_args(args) + ) queries = framework.queries_from_file(Path(args.queries_file)) audit_items = [] for query in queries: @@ -215,7 +258,9 @@ def run_audit(args: argparse.Namespace) -> None: def run_serve(args: argparse.Namespace) -> None: - framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode) + framework = SearchEvaluationFramework( + tenant_id=args.tenant_id, labeler_mode=args.labeler_mode, **framework_kwargs_from_args(args) + ) app = create_web_app(framework, Path(args.queries_file)) import uvicorn diff --git a/scripts/evaluation/eval_framework/clients.py b/scripts/evaluation/eval_framework/clients.py index 05fbe10..d228e42 100644 --- a/scripts/evaluation/eval_framework/clients.py +++ b/scripts/evaluation/eval_framework/clients.py @@ -2,6 +2,10 @@ from __future__ import annotations +import io +import json +import time +import uuid from typing import Any, Dict, List, Optional, Sequence, Tuple import requests @@ -60,26 +64,51 @@ class RerankServiceClient: class DashScopeLabelClient: - def __init__(self, model: str, base_url: str, api_key: str, batch_size: int = 40): + """DashScope OpenAI-compatible chat: synchronous or Batch File API (JSONL job). 
@@ -87,6 +116,114 @@ class DashScopeLabelClient:
         content = str(((data.get("choices") or [{}])[0].get("message") or {}).get("content") or "").strip()
         return content, safe_json_dumps(data)
 
+    def _chat_batch(self, prompt: str) -> Tuple[str, str]:
+        """One chat completion via Batch File API (single-line JSONL job)."""
+        custom_id = uuid.uuid4().hex
+        body = self._completion_body(prompt)
+        line_obj = {
+            "custom_id": custom_id,
+            "method": "POST",
+            "url": "/v1/chat/completions",
+            "body": body,
+        }
+        jsonl = json.dumps(line_obj, ensure_ascii=False, separators=(",", ":")) + "\n"
+        auth = self._auth_headers()
+
+        up = self.session.post(
+            f"{self.base_url}/files",
+            headers=auth,
+            files={
+                "file": (
+                    "eval_batch_input.jsonl",
+                    io.BytesIO(jsonl.encode("utf-8")),
+                    "application/octet-stream",
+                )
+            },
+            data={"purpose": "batch"},
+            timeout=300,
+        )
+        up.raise_for_status()
+        file_id = (up.json() or {}).get("id")
+        if not file_id:
+            raise RuntimeError(f"DashScope file upload returned no id: {up.text!r}")
+
+        cr = self.session.post(
+            f"{self.base_url}/batches",
+            headers={**auth, "Content-Type": "application/json"},
+            json={
+                "input_file_id": file_id,
+                "endpoint": "/v1/chat/completions",
+                "completion_window": self.batch_completion_window,
+            },
+            timeout=120,
+        )
+        cr.raise_for_status()
+        batch_payload = cr.json() or {}
+        batch_id = batch_payload.get("id")
+        if not batch_id:
+            raise RuntimeError(f"DashScope batches.create returned no id: {cr.text!r}")
+
+        terminal = frozenset({"completed", "failed", "expired", "cancelled"})
+        batch: Dict[str, Any] = dict(batch_payload)
+        status = str(batch.get("status") or "")
+        while status not in terminal:
+            time.sleep(self.batch_poll_interval_sec)
+            br = self.session.get(f"{self.base_url}/batches/{batch_id}", headers=auth, timeout=120)
+            br.raise_for_status()
+            batch = br.json() or {}
+            status = str(batch.get("status") or "")
+
+        if status != "completed":
+            raise RuntimeError(
+                f"DashScope batch {batch_id} ended with status={status!r} errors={batch.get('errors')!r}"
+            )
+
+        out_id = batch.get("output_file_id")
+        err_id = batch.get("error_file_id")
+
+        row = self._find_batch_line_for_custom_id(out_id, custom_id, auth)
+        if row is None:
+            err_row = self._find_batch_line_for_custom_id(err_id, custom_id, auth)
+            if err_row is not None:
+                raise RuntimeError(f"DashScope batch request failed: {err_row!r}")
+            raise RuntimeError(f"DashScope batch output missing custom_id={custom_id!r}")
+
+        resp = row.get("response") or {}
+        sc = resp.get("status_code")
+        if sc is not None and int(sc) != 200:
+            raise RuntimeError(f"DashScope batch line error: {row!r}")
+
+        data = resp.get("body") or {}
+        content = str(((data.get("choices") or [{}])[0].get("message") or {}).get("content") or "").strip()
+        return content, safe_json_dumps(row)
+
+    def _chat(self, prompt: str) -> Tuple[str, str]:
+        if self.use_batch:
+            return self._chat_batch(prompt)
+        return self._chat_sync(prompt)
+
+    def _find_batch_line_for_custom_id(
+        self,
+        file_id: Optional[str],
+        custom_id: str,
+        auth: Dict[str, str],
+    ) -> Optional[Dict[str, Any]]:
+        if not file_id or str(file_id) in ("null", ""):
+            return None
+        r = self.session.get(f"{self.base_url}/files/{file_id}/content", headers=auth, timeout=300)
+        r.raise_for_status()
+        for raw in r.text.splitlines():
+            raw = raw.strip()
+            if not raw:
+                continue
+            try:
+                obj = json.loads(raw)
+            except json.JSONDecodeError:
+                continue
+            if str(obj.get("custom_id")) == custom_id:
+                return obj
+        return None
+
     def classify_batch_simple(
         self,
         query: str,
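
Note (illustrative, not part of the patch): the shape of the single-line JSONL job
_chat_batch uploads and of the output line _find_batch_line_for_custom_id scans for,
per the Batch File API docs linked in the docstring (values abridged). Each prompt
becomes its own one-request batch job, which forgoes batching throughput in exchange
for a much simpler client; batch_poll_interval_sec bounds how quickly short jobs
come back.

    input_line = {
        "custom_id": "a1b2c3d4...",  # uuid4().hex; used to match the result line
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {"model": "qwen3.5-flash",
                 "messages": [{"role": "user", "content": "..."}]},
    }
    output_line = {
        "custom_id": "a1b2c3d4...",
        "response": {"status_code": 200,
                     "body": {"choices": [{"message": {"content": "..."}}]}},
    }
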
diff --git a/scripts/evaluation/eval_framework/constants.py b/scripts/evaluation/eval_framework/constants.py
index f3fcf87..eea6182 100644
--- a/scripts/evaluation/eval_framework/constants.py
+++ b/scripts/evaluation/eval_framework/constants.py
@@ -17,6 +17,12 @@ DEFAULT_QUERY_FILE = _SCRIPTS_EVAL_DIR / "queries" / "queries.txt"
 JUDGE_PROMPT_VERSION_SIMPLE = "v3_simple_20260331"
 JUDGE_PROMPT_VERSION_COMPLEX = "v2_structured_20260331"
 DEFAULT_LABELER_MODE = "simple"
+# Judge LLM (eval_framework only; override via CLI --judge-model / constructor kwargs)
+DEFAULT_JUDGE_MODEL = "qwen3.5-flash"
+DEFAULT_JUDGE_ENABLE_THINKING = True
+DEFAULT_JUDGE_DASHSCOPE_BATCH = True
+DEFAULT_JUDGE_BATCH_COMPLETION_WINDOW = "24h"
+DEFAULT_JUDGE_BATCH_POLL_INTERVAL_SEC = 10.0
 
 # Rebuild annotation pool (build --force-refresh-labels): search recall + full-corpus rerank + LLM batches
 DEFAULT_SEARCH_RECALL_TOP_K = 500
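
Note (illustrative, not part of the patch): exercising DashScopeLabelClient directly
with these defaults; base_url and api_key are placeholders, and use_batch=False
forces the synchronous path for a quick smoke test.

    from eval_framework.clients import DashScopeLabelClient
    from eval_framework.constants import DEFAULT_JUDGE_ENABLE_THINKING, DEFAULT_JUDGE_MODEL

    client = DashScopeLabelClient(
        model=DEFAULT_JUDGE_MODEL,
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",  # placeholder
        api_key="sk-...",  # placeholder
        enable_thinking=DEFAULT_JUDGE_ENABLE_THINKING,
        use_batch=False,  # synchronous /chat/completions
    )
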
diff --git a/scripts/evaluation/eval_framework/framework.py b/scripts/evaluation/eval_framework/framework.py
index 9fea5f2..ab6260c 100644
--- a/scripts/evaluation/eval_framework/framework.py
+++ b/scripts/evaluation/eval_framework/framework.py
@@ -16,6 +16,11 @@ from indexer.mapping_generator import get_tenant_index_name
 from .clients import DashScopeLabelClient, RerankServiceClient, SearchServiceClient
 from .constants import (
     DEFAULT_ARTIFACT_ROOT,
+    DEFAULT_JUDGE_BATCH_COMPLETION_WINDOW,
+    DEFAULT_JUDGE_BATCH_POLL_INTERVAL_SEC,
+    DEFAULT_JUDGE_DASHSCOPE_BATCH,
+    DEFAULT_JUDGE_ENABLE_THINKING,
+    DEFAULT_JUDGE_MODEL,
     DEFAULT_LABELER_MODE,
     DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO,
     DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK,
@@ -73,6 +78,10 @@ class SearchEvaluationFramework:
         artifact_root: Path = DEFAULT_ARTIFACT_ROOT,
         search_base_url: str = "http://localhost:6002",
         labeler_mode: str = DEFAULT_LABELER_MODE,
+        *,
+        judge_model: str | None = None,
+        enable_thinking: bool | None = None,
+        use_dashscope_batch: bool | None = None,
     ):
         init_service(get_app_config().infrastructure.elasticsearch.host)
         self.tenant_id = str(tenant_id)
@@ -89,13 +98,22 @@ class SearchEvaluationFramework:
         api_key = app_cfg.infrastructure.secrets.dashscope_api_key
         if not api_key:
             raise RuntimeError("dashscope_api_key is required for search evaluation annotation")
+        model = str(judge_model or DEFAULT_JUDGE_MODEL)
+        et = DEFAULT_JUDGE_ENABLE_THINKING if enable_thinking is None else enable_thinking
+        use_batch = DEFAULT_JUDGE_DASHSCOPE_BATCH if use_dashscope_batch is None else use_dashscope_batch
+        batch_window = DEFAULT_JUDGE_BATCH_COMPLETION_WINDOW
+        batch_poll = float(DEFAULT_JUDGE_BATCH_POLL_INTERVAL_SEC)
         self.label_client = DashScopeLabelClient(
-            model=str(llm_cfg["model"]),
+            model=model,
             base_url=str(llm_cfg["base_url"]),
             api_key=str(api_key),
+            batch_completion_window=batch_window,
+            batch_poll_interval_sec=batch_poll,
+            enable_thinking=et,
+            use_batch=use_batch,
         )
         self.query_parser = None
-
+
     def _get_query_parser(self):
         if self.query_parser is None:
             self.query_parser = get_query_parser()
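
Note (illustrative, not part of the patch): constructing the framework with judge
overrides, assuming Elasticsearch and the app config are reachable. Passing None
(the default) for any of the three keyword-only arguments falls back to the
corresponding DEFAULT_JUDGE_* constant.

    from eval_framework.framework import SearchEvaluationFramework

    fw = SearchEvaluationFramework(
        tenant_id="163",
        judge_model="qwen3.5-flash",  # None -> DEFAULT_JUDGE_MODEL
        enable_thinking=False,
        use_dashscope_batch=False,  # sync /chat/completions instead of a batch job
    )
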
diff --git a/scripts/evaluation/eval_framework/prompts.py b/scripts/evaluation/eval_framework/prompts.py
index 8f9e630..a9bb174 100644
--- a/scripts/evaluation/eval_framework/prompts.py
+++ b/scripts/evaluation/eval_framework/prompts.py
@@ -96,7 +96,7 @@ The lines must correspond sequentially to the products above.
 Do not output any other information.
 """
 
-_CLASSIFY_BATCH_SIMPLE_TEMPLATE__zh = _CLASSIFY_BATCH_SIMPLE_TEMPLATE_ZH = """你是一个服饰电商搜索系统中的相关性判断助手。
+_CLASSIFY_BATCH_SIMPLE_TEMPLATE_ZH = """你是一个服饰电商搜索系统中的相关性判断助手。
 给定用户查询词以及每个商品的信息,请为每个商品分配一个相关性标签。
 
 ## 相关性标签
@@ -108,9 +108,8 @@ _CLASSIFY_BATCH_SIMPLE_TEMPLATE__zh = _CLASSIFY_BATCH_SIMPLE_TEMPLATE_ZH = """
 - 查询仅包含产品类型,产品即为该类型。
 - 查询包含“产品类型 + 属性”,产品在类型及所有明确属性上均符合。
 
-### 部分相关
-产品满足用户的主要意图(核心产品类型匹配),但查询中明确的部分要求未体现,或存在偏差。虽然有不一致,但仍属于“非目标但可接受”的替代品。
-
-在以下情况使用部分相关:
+### 基本相关 (Highly Relevant)
+产品满足用户的主要意图(核心产品类型匹配),但查询中明确的部分要求未在产品信息中体现、无法确认,或存在并不严重冲突的偏差。该商品是满足用户核心需求的良好替代品。
+在以下情况使用基本相关:
 - 核心产品类型匹配,但部分请求的属性在商品信息中缺失、未提及或无法确认。
 - 核心产品类型匹配,但材质、版型、风格等次要要求存在偏差或不一致。
@@ -130,45 +129,63 @@ _CLASSIFY_BATCH_SIMPLE_TEMPLATE__zh = _CLASSIFY_BATCH_SIMPLE_TEMPLATE_ZH = """
 - 在电商搜索中,用户仍可能因为款式、穿着场景相近而点击该商品。
-- 因此应判为部分相关,即“非目标但可接受”的替代品。
-
-### 不相关
-产品未满足用户的主要购物意图,主要表现为以下情形之一:
-- 核心产品类型与查询不匹配。
-- 产品虽属大致相关的大类,但与查询指定的具体子类不可互换。
-- 核心产品类型匹配,但产品明显违背了查询中一个明确且重要的属性要求。
+- 因此应判为基本相关,即满足核心需求的良好替代品。
+
+详细案例:
+- 查询:“黑色中长半身裙”
+- 商品:“春秋季新款宽松显瘦大摆长裙碎花半身裙褶皱设计裙”
+
+分析:
+- 品类匹配:产品是“半身裙”,品类符合。
+- 颜色不匹配:产品描述未提及黑色,且明确包含“碎花”,花色与纯黑差异较大。
+- 长度存在偏差:用户要求“中长”,而产品标题强调“长裙”,长度偏长。
+- 核心品类“半身裙”匹配,“显瘦”“大摆”等风格可能符合部分搜索“中长半身裙”用户的潜在偏好(如版型相似);“长裙”与“中长”无严重矛盾。核心品类匹配、属性偏差不严重,应判为“基本相关”。
+
+### 弱相关 (Weakly Relevant)
+产品与用户的核心意图存在差距,但仍可能因风格、场景或功能上的相似性而被用户接受,属于“非目标但可接受”的替代品。
+在以下情况使用弱相关:
+- 核心产品类型有差异,但风格、穿着场景或功能非常接近,如查询“黑色中长半身裙”,商品为“连衣裙”(同属裙装大类,款式相似)。
+- 核心产品类型有差异,但在购物场景下属于相近品类,可勉强替代,如查询“牛仔裤”,商品为“休闲裤”(均为裤子大类,风格可能相近)。
+- 核心产品类型匹配,但产品在多个非关键属性上存在偏差,导致与用户理想目标差距较大,但仍保留一定关联性。
+
+典型情况:
+- 查询:“黑色中长半身裙”,产品:“新款高腰V领中长款连衣裙 优雅印花黑色性感连衣裙” → 核心产品类型“半身裙”与“连衣裙”有差异,但两者同属裙装大类且款式上均为“中长款”,具有相似性。
+
+### 不相关 (Irrelevant)
+产品未满足用户的主要购物意图,用户点击动机极低。主要表现为以下情形之一:
+- 核心产品类型与查询不匹配,且不属于风格/场景相近的替代品。
+- 产品虽属大致相关的大类,但与查询指定的具体子类不可互换,且风格/场景差异大。
+- 核心产品类型匹配,但产品明显违背了查询中一个明确且重要的属性要求,且不存在可接受的理由。
 
 典型情况:
 - 查询:“裤子”,产品:“鞋子” → 产品类型错误。
-- 查询:“连衣裙”,产品:“半身裙” → 具体产品类型不同。
-- 查询:“修身裤”,产品:“宽松阔腿裤” → 与版型要求明显冲突。
+- 查询:“修身裤”,产品:“宽松阔腿裤” → 与版型要求明显冲突,替代性极低。
 - 查询:“无袖连衣裙”,产品:“长袖连衣裙” → 与袖型要求明显冲突。
-
-该标签强调用户意图的明确性。当查询指向具体类型或关键属性时,即使产品在更高层级类别上相关,也应按不相关处理。
+- 查询:“牛仔裤”,产品:“运动裤” → 核心品类不同(牛仔裤 vs 运动裤),风格和场景差异大。
+- 查询:“靴子”,产品:“运动鞋” → 核心品类不同,功能和适用场景差异大。
 
 ## 判断原则
 
-1. 产品类型是最高优先级因素。
-   如果查询明确指定了具体产品类型,那么结果必须匹配该产品类型,才可能判为“完全相关”或“部分相关”。
-   不同产品类型通常应判为“不相关”,而不是“部分相关”。
-
-2. 相似或相关的产品类型,在查询明确时通常不可互换。
-   例如:
-   - 连衣裙 vs 半身裙 vs 连体裤
-   - 牛仔裤 vs 裤子
-   - T恤 vs 衬衫/上衣
-   - 开衫 vs 毛衣
-   - 靴子 vs 鞋子
-   - 文胸 vs 上衣
-   - 双肩包 vs 包
-   如果用户明确搜索其中一种,其他类型通常应判为“不相关”。
-
-3. 当核心产品类型匹配后,再评估属性。
-   - 所有明确属性都匹配 → 完全相关
-   - 部分属性缺失、无法确认,或存在一定偏差,但仍是可接受替代品 → 部分相关
-   - 明确且重要的属性被明显违背,且不能作为合理替代品 → 不相关
-
-4. 要严格区分“未提及/无法确认”和“明确冲突”。
-   - 如果某属性没有提及,或无法验证,优先判为“部分相关”。
-   - 如果某属性与查询要求明确相反,则判为“不相关”;除非在购物语境下它仍明显属于可接受替代品。
+1. **产品类型是最高优先级因素。**
+   如果查询明确指定了具体产品类型,那么结果必须匹配该产品类型,才可能判为“完全相关”或“基本相关”。不同产品类型通常应判为“弱相关”或“不相关”:
+   - **弱相关**:仅当两种产品类型风格、场景、功能非常接近,可能被视为可接受的替代品时使用。
+   - **不相关**:其他所有产品类型不匹配的情况。
+
+2. **相似或相关的产品类型,在查询明确时通常不可互换,但需根据接近程度区分。**
+   例如:
+   - **风格/场景高度接近,可判为弱相关**:连衣裙 vs 半身裙、长裙 vs 中长裙、牛仔裤 vs 休闲裤、运动鞋 vs 板鞋。
+   - **风格/场景差异大,判为不相关**:裤子 vs 鞋子、T恤 vs 帽子、靴子 vs 运动鞋、牛仔裤 vs 西装裤、双肩包 vs 手提包。
+   如果用户明确搜索其中一种,其他类型是否可接受取决于其风格、场景的接近程度。
+
+3. **当核心产品类型匹配后,再评估属性。**
+   - 所有明确属性都匹配 → **完全相关**
+   - 部分属性缺失、无法确认,或存在较小偏差 → **基本相关**
+   - 明确且重要的属性被明显违背(如修身 vs 宽松),但核心品类仍匹配 → **弱相关** 或 **不相关**:
+     - **弱相关**:属性明显违背,但存在可被用户接受的微弱理由(如版型虽不同但风格类似)。
+     - **不相关**:属性明显违背,且替代性极低,用户无点击动机(如修身 vs 宽松阔腿裤)。
+
+4. **要严格区分“未提及/无法确认”、“较小偏差”与“明确冲突”。**
+   - 如果某属性没有提及,或无法验证,优先判为“基本相关”。
+   - 如果某属性存在较小偏差(如颜色不同、材质不同),判为“基本相关”。
+   - 如果某属性与查询要求明确相反,则需根据冲突的严重性和替代性判为“弱相关”或“不相关”。
 
 查询:{query}
-- 
libgit2 0.21.2
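
Note (illustrative, not part of the patch): a minimal sketch of consuming the judge's
reply under the prompt contract above (one label line per product, in order, nothing
else). The real parsing inside classify_batch_simple is not shown in this diff, and
the label strings are assumed from the ZH template.

    VALID_LABELS = {"完全相关", "基本相关", "弱相关", "不相关"}

    def parse_labels(content: str, num_products: int) -> list[str]:
        lines = [ln.strip() for ln in content.splitlines() if ln.strip()]
        if len(lines) != num_products:
            raise ValueError(f"expected {num_products} labels, got {len(lines)}")
        unknown = [ln for ln in lines if ln not in VALID_LABELS]
        if unknown:
            raise ValueError(f"unrecognized labels: {unknown!r}")
        return lines
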