Commit 4202440950af6b1f1c53c7e165d6f12d2dfeec7e

Authored by tangwang
1 parent 286e9b4f

评估框架-批量打标

docs/常用查询 - ES.md
... ... @@ -659,7 +659,7 @@ curl -u 'saas:4hOaLaf41y2VuI8y' -X POST \
659 659 "query": {
660 660 "bool": {
661 661 "filter": [
662   - { "exists": { "field": "image_embedding" } }
  662 + { "exists": { "field": "title_embedding" } }
663 663 ]
664 664 }
665 665 }
... ...
query/query_parser.py
... ... @@ -27,7 +27,7 @@ from .product_title_exclusion import (
27 27 )
28 28 from .query_rewriter import QueryRewriter, QueryNormalizer
29 29 from .style_intent import StyleIntentDetector, StyleIntentProfile, StyleIntentRegistry
30   -from .tokenization import QueryTextAnalysisCache, contains_han_text, extract_token_strings
  30 +from .tokenization import QueryTextAnalysisCache, extract_token_strings
31 31 from .keyword_extractor import KeywordExtractor, collect_keywords_queries
32 32  
33 33 logger = logging.getLogger(__name__)
... ... @@ -289,31 +289,12 @@ class QueryParser:
289 289 def _get_query_tokens(self, query: str) -> List[str]:
290 290 return self._extract_tokens(self._tokenizer(query))
291 291  
292   - @staticmethod
293   - def _is_ascii_latin_query(text: str) -> bool:
294   - candidate = str(text or "").strip()
295   - if not candidate or contains_han_text(candidate):
296   - return False
297   - try:
298   - candidate.encode("ascii")
299   - except UnicodeEncodeError:
300   - return False
301   - return any(ch.isalpha() for ch in candidate)
302   -
303 292 def _detect_query_language(
304 293 self,
305 294 query_text: str,
306 295 *,
307 296 target_languages: Optional[List[str]] = None,
308 297 ) -> str:
309   - normalized_targets = self._normalize_language_codes(target_languages)
310   - supported_languages = self._normalize_language_codes(
311   - getattr(self.config.query_config, "supported_languages", None)
312   - )
313   - active_languages = normalized_targets or supported_languages
314   - if active_languages and set(active_languages).issubset({"en", "zh"}):
315   - if self._is_ascii_latin_query(query_text):
316   - return "en"
317 298 return self.language_detector.detect(query_text)
318 299  
319 300 def parse(
... ...
scripts/evaluation/eval_framework/clients.py
... ... @@ -82,6 +82,18 @@ def _canonicalize_judge_label(raw: str) -> str | None:
82 82 return None
83 83  
84 84  
def _describe_request_exception(exc: requests.exceptions.RequestException) -> str:
    """Render a request failure as a short, log-friendly string.

    For an ``HTTPError`` that carries a response, the result has the form
    ``status=<code> body=<text>``; the body is stripped, cut to 600
    characters with a ``...[truncated]`` marker when longer, and shown as
    ``<empty>`` when blank. Every other case falls back to ``str(exc)``.

    Args:
        exc: The exception raised by the HTTP call.

    Returns:
        A one-value description of the failure.
    """
    if not isinstance(exc, requests.exceptions.HTTPError):
        return str(exc)

    http_response = getattr(exc, "response", None)
    if http_response is None:
        return str(exc)

    raw_body = getattr(http_response, "text", "") or ""
    body_text = str(raw_body).strip()
    if len(body_text) > 600:
        body_text = body_text[:600].rstrip() + "...[truncated]"

    shown_body = body_text if body_text else "<empty>"
    return f"status={http_response.status_code} body={shown_body}"
  95 +
  96 +
85 97 class SearchServiceClient:
86 98 def __init__(self, base_url: str, tenant_id: str):
87 99 self.base_url = base_url.rstrip("/")
... ... @@ -341,7 +353,8 @@ class DashScopeLabelClient:
341 353 except Exception as exc:
342 354 last_exc = exc
343 355 is_request_error = isinstance(exc, requests.exceptions.RequestException)
344   - if not is_request_error or attempt >= self.retry_attempts:
  356 + is_transient = is_request_error and self._is_transient_request_error(exc)
  357 + if not is_transient or attempt >= self.retry_attempts:
345 358 raise
346 359 _client_log.warning(
347 360 "Transient DashScope error, retrying (%s/%s): phase=%s model=%s use_batch=%s error=%s",
... ... @@ -350,7 +363,7 @@ class DashScopeLabelClient:
350 363 phase,
351 364 self.model,
352 365 self.use_batch,
353   - exc,
  366 + _describe_request_exception(exc),
354 367 )
355 368 time.sleep(self.retry_delay_sec)
356 369 else:
... ... @@ -366,6 +379,17 @@ class DashScopeLabelClient:
366 379 )
367 380 return content, raw
368 381  
@staticmethod
def _is_transient_request_error(exc: requests.exceptions.RequestException) -> bool:
    """Decide whether a request failure is worth retrying.

    Connection errors and timeouts always count as transient. An
    ``HTTPError`` counts as transient when it carries no response, or when
    its status code is listed in ``_TRANSIENT_HTTP_STATUS_CODES``. Any
    other request exception is treated as permanent.

    Args:
        exc: The exception raised by the HTTP call.

    Returns:
        ``True`` if the caller may retry, ``False`` otherwise.
    """
    network_failures = (
        requests.exceptions.ConnectionError,
        requests.exceptions.Timeout,
    )
    if isinstance(exc, network_failures):
        return True
    if not isinstance(exc, requests.exceptions.HTTPError):
        return False

    http_response = getattr(exc, "response", None)
    # Compare against None explicitly rather than relying on truthiness.
    if http_response is None:
        return True
    status = int(http_response.status_code)
    return status in _TRANSIENT_HTTP_STATUS_CODES
  392 +
369 393 def _find_batch_line_for_custom_id(
370 394 self,
371 395 file_id: Optional[str],
... ...
scripts/evaluation/eval_framework/utils.py
... ... @@ -11,6 +11,20 @@ from typing import Any, Dict, List, Sequence, Tuple
11 11  
12 12 from .constants import PROJECT_ROOT
13 13  
  14 +_LABEL_OPTION_MAX_CHARS = 40
  15 +_LABEL_DOC_LINE_MAX_CHARS = 260
  16 +
  17 +
  18 +def _truncate_text(value: Any, max_chars: int) -> str:
  19 + text = str(value or "").strip()
  20 + if max_chars <= 0:
  21 + return ""
  22 + if len(text) <= max_chars:
  23 + return text
  24 + if max_chars <= 3:
  25 + return text[:max_chars]
  26 + return text[: max_chars - 3].rstrip() + "..."
  27 +
14 28  
15 29 def utc_now_iso() -> str:
16 30 return datetime.now(timezone.utc).isoformat()
... ... @@ -67,11 +81,7 @@ def compact_option_values(skus: Sequence[Dict[str, Any]]) -&gt; Tuple[str, str, str
67 81  
def build_display_title(doc: Dict[str, Any]) -> str:
    """Pick a single display title for ``doc``.

    The ``"en"`` variant of the ``title`` field is preferred; the ``"zh"``
    variant is used only when the ``"en"`` lookup yields a falsy value.

    Args:
        doc: Document mapping; its ``"title"`` entry is passed to ``pick_text``.

    Returns:
        The chosen title as returned by ``pick_text``.
    """
    title_field = doc.get("title")
    english_title = pick_text(title_field, "en")
    if english_title:
        return english_title
    return pick_text(title_field, "zh")
75 85  
76 86  
77 87 def build_rerank_doc(doc: Dict[str, Any]) -> str:
... ... @@ -82,14 +92,15 @@ def build_rerank_doc(doc: Dict[str, Any]) -&gt; str:
def build_label_doc_line(idx: int, doc: Dict[str, Any]) -> str:
    """Build the numbered one-line description of ``doc`` for labeling.

    The line is ``"<idx>. <title> <option1> <option2> <option3>"`` with
    empty parts omitted. Each option value is capped at
    ``_LABEL_OPTION_MAX_CHARS`` characters and the finished line at
    ``_LABEL_DOC_LINE_MAX_CHARS`` characters.

    Args:
        idx: Number placed at the start of the line.
        doc: Document mapping; ``"title"`` and ``"skus"`` are read from it.

    Returns:
        The formatted, length-limited line.
    """
    segments = [build_display_title(doc)]
    option1, option2, option3 = compact_option_values(doc.get("skus") or [])
    for option_value in (option1, option2, option3):
        if option_value:
            segments.append(_truncate_text(option_value, _LABEL_OPTION_MAX_CHARS))
    joined = " ".join(segment for segment in segments if segment)
    return _truncate_text(f"{idx}. {joined}", _LABEL_DOC_LINE_MAX_CHARS)
93 104  
94 105  
95 106 def compact_product_payload(doc: Dict[str, Any]) -> Dict[str, Any]:
... ...
tests/test_eval_framework_clients.py 0 → 100644
... ... @@ -0,0 +1,71 @@
  1 +import pytest
  2 +import requests
  3 +
  4 +from scripts.evaluation.eval_framework.clients import DashScopeLabelClient
  5 +from scripts.evaluation.eval_framework.utils import build_label_doc_line
  6 +
  7 +
def _http_error(status_code: int, body: str) -> requests.exceptions.HTTPError:
    """Build an ``HTTPError`` wrapping a hand-made response for tests.

    Args:
        status_code: Status code to set on the fake response.
        body: Response body; stored UTF-8 encoded.

    Returns:
        An ``HTTPError`` whose ``response`` is the fake response.
    """
    fake_response = requests.Response()
    fake_response.url = "https://dashscope-us.aliyuncs.com/compatible-mode/v1/chat/completions"
    fake_response.status_code = status_code
    # Set the cached content directly so no network read is attempted.
    fake_response._content = body.encode("utf-8")
    message = f"{status_code} error"
    return requests.exceptions.HTTPError(message, response=fake_response)
  14 +
  15 +
def test_build_label_doc_line_truncates_long_fields():
    """An oversized title and oversized options still yield a bounded line."""
    oversized_sku = {
        "option1_value": "Very Long Color Name " * 10,
        "option2_value": "Very Long Size Name " * 10,
        "option3_value": "Very Long Material Name " * 10,
    }
    product = {
        "title": {"en": "Minimalist Top " * 40},
        "skus": [oversized_sku],
    }

    rendered = build_label_doc_line(1, product)

    assert rendered.startswith("1. ")
    assert len(rendered) <= 260
    assert rendered.endswith("...")
  33 +
  34 +
def test_dashscope_chat_does_not_retry_non_transient_400(monkeypatch):
    """A 400 response must surface immediately after a single attempt."""
    client = DashScopeLabelClient(model="qwen3.5-plus", base_url="https://example.com", api_key="test")
    client.retry_attempts = 4
    client.retry_delay_sec = 0
    seen_prompts = []

    def always_bad_request(prompt: str):
        # Record every invocation so the attempt count can be checked.
        seen_prompts.append(prompt)
        raise _http_error(400, '{"code":"InvalidParameter"}')

    monkeypatch.setattr(client, "_chat_sync", always_bad_request)

    with pytest.raises(requests.exceptions.HTTPError):
        client._chat("prompt", phase="relevance_classify")

    assert len(seen_prompts) == 1
  51 +
  52 +
def test_dashscope_chat_retries_transient_500(monkeypatch):
    """Two 500 responses followed by a success must end in three attempts."""
    client = DashScopeLabelClient(model="qwen3.5-plus", base_url="https://example.com", api_key="test")
    client.retry_attempts = 4
    client.retry_delay_sec = 0
    seen_prompts = []

    def fail_twice_then_succeed(prompt: str):
        seen_prompts.append(prompt)
        if len(seen_prompts) < 3:
            raise _http_error(500, '{"code":"InternalError"}')
        return "Exact Match", '{"choices":[{"message":{"content":"Exact Match"}}]}'

    monkeypatch.setattr(client, "_chat_sync", fail_twice_then_succeed)

    content, raw = client._chat("prompt", phase="relevance_classify")

    assert content == "Exact Match"
    assert "Exact Match" in raw
    assert len(seen_prompts) == 3
... ...
tests/test_query_parser_mixed_language.py
... ... @@ -136,20 +136,24 @@ def test_parse_reuses_tokenization_across_tail_stages(monkeypatch):
136 136 assert tokenize_calls == []
137 137  
138 138  
def test_parse_ascii_latin_query_uses_language_detector(monkeypatch):
    """An ASCII-only query must still be routed through the language detector."""
    parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer)
    detector_calls = []

    def recording_detect(text):
        # Capture the text the parser hands to the detector, then answer "es".
        detector_calls.append(text)
        return "es"

    monkeypatch.setattr(parser.language_detector, "detect", recording_detect)

    query = "falda negra oficina"
    result = parser.parse(
        query,
        tenant_id="0",
        generate_vector=False,
        target_languages=["en", "zh"],
    )

    assert detector_calls == [query]
    assert result.detected_language == "es"
    assert result.translations == {"en": "falda negra oficina-en", "zh": "falda negra oficina-zh"}
    assert result.query_tokens == ["falda", "negra", "oficina"]
... ...