评估框架-批量打标

tangwang
1 parent 286e9b4f
Showing 6 changed files with 129 additions and 38 deletions Show diff stats
docs/常用查询 - ES.md
query/query_parser.py
scripts/evaluation/eval_framework/clients.py
scripts/evaluation/eval_framework/utils.py
tests/test_eval_framework_clients.py
tests/test_query_parser_mixed_language.py
@@ -659,7 +659,7 @@ curl -u 'saas:4hOaLaf41y2VuI8y' -X POST \
     "query": {
         "bool": {
             "filter": [
-                { "exists": { "field": "image_embedding" } }
+                { "exists": { "field": "title_embedding" } }
             ]
         }
     }
@@ -27,7 +27,7 @@ from .product_title_exclusion import (
 )
 from .query_rewriter import QueryRewriter, QueryNormalizer
 from .style_intent import StyleIntentDetector, StyleIntentProfile, StyleIntentRegistry
-from .tokenization import QueryTextAnalysisCache, contains_han_text, extract_token_strings
+from .tokenization import QueryTextAnalysisCache, extract_token_strings
 from .keyword_extractor import KeywordExtractor, collect_keywords_queries
  
 logger = logging.getLogger(__name__)
@@ -289,31 +289,12 @@ class QueryParser:
     def _get_query_tokens(self, query: str) -> List[str]:
         return self._extract_tokens(self._tokenizer(query))
  
-    @staticmethod
-    def _is_ascii_latin_query(text: str) -> bool:
-        candidate = str(text or "").strip()
-        if not candidate or contains_han_text(candidate):
-            return False
-        try:
-            candidate.encode("ascii")
-        except UnicodeEncodeError:
-            return False
-        return any(ch.isalpha() for ch in candidate)
-
     def _detect_query_language(
         self,
         query_text: str,
         *,
         target_languages: Optional[List[str]] = None,
     ) -> str:
-        normalized_targets = self._normalize_language_codes(target_languages)
-        supported_languages = self._normalize_language_codes(
-            getattr(self.config.query_config, "supported_languages", None)
-        )
-        active_languages = normalized_targets or supported_languages
-        if active_languages and set(active_languages).issubset({"en", "zh"}):
-            if self._is_ascii_latin_query(query_text):
-                return "en"
         return self.language_detector.detect(query_text)
  
     def parse(
@@ -82,6 +82,18 @@ def _canonicalize_judge_label(raw: str) -> str | None:
     return None
  
  
+def _describe_request_exception(exc: requests.exceptions.RequestException) -> str:
+    if isinstance(exc, requests.exceptions.HTTPError):
+        response = getattr(exc, "response", None)
+        if response is None:
+            return str(exc)
+        body = str(getattr(response, "text", "") or "").strip()
+        if len(body) > 600:
+            body = body[:600].rstrip() + "...[truncated]"
+        return f"status={response.status_code} body={body or '<empty>'}"
+    return str(exc)
+
+
 class SearchServiceClient:
     def __init__(self, base_url: str, tenant_id: str):
         self.base_url = base_url.rstrip("/")
@@ -341,7 +353,8 @@ class DashScopeLabelClient:
             except Exception as exc:
                 last_exc = exc
                 is_request_error = isinstance(exc, requests.exceptions.RequestException)
-                if not is_request_error or attempt >= self.retry_attempts:
+                is_transient = is_request_error and self._is_transient_request_error(exc)
+                if not is_transient or attempt >= self.retry_attempts:
                     raise
                 _client_log.warning(
                     "Transient DashScope error, retrying (%s/%s): phase=%s model=%s use_batch=%s error=%s",
@@ -350,7 +363,7 @@ class DashScopeLabelClient:
                     phase,
                     self.model,
                     self.use_batch,
-                    exc,
+                    _describe_request_exception(exc),
                 )
                 time.sleep(self.retry_delay_sec)
         else:
@@ -366,6 +379,17 @@ class DashScopeLabelClient:
         )
         return content, raw
  
+    @staticmethod
+    def _is_transient_request_error(exc: requests.exceptions.RequestException) -> bool:
+        if isinstance(exc, (requests.exceptions.ConnectionError, requests.exceptions.Timeout)):
+            return True
+        if isinstance(exc, requests.exceptions.HTTPError):
+            response = getattr(exc, "response", None)
+            if response is None:
+                return True
+            return int(response.status_code) in _TRANSIENT_HTTP_STATUS_CODES
+        return False
+
     def _find_batch_line_for_custom_id(
         self,
         file_id: Optional[str],
@@ -11,6 +11,20 @@ from typing import Any, Dict, List, Sequence, Tuple
  
 from .constants import PROJECT_ROOT
  
+_LABEL_OPTION_MAX_CHARS = 40
+_LABEL_DOC_LINE_MAX_CHARS = 260
+
+
+def _truncate_text(value: Any, max_chars: int) -> str:
+    text = str(value or "").strip()
+    if max_chars <= 0:
+        return ""
+    if len(text) <= max_chars:
+        return text
+    if max_chars <= 3:
+        return text[:max_chars]
+    return text[: max_chars - 3].rstrip() + "..."
+
  
 def utc_now_iso() -> str:
     return datetime.now(timezone.utc).isoformat()
@@ -67,11 +81,7 @@ def compact_option_values(skus: Sequence[Dict[str, Any]]) -> Tuple[str, str, str
  
 def build_display_title(doc: Dict[str, Any]) -> str:
     title = doc.get("title")
-    en = pick_text(title, "en")
-    zh = pick_text(title, "zh")
-    if en and zh and en != zh:
-        return f"{en} / {zh}"
-    return en or zh
+    return pick_text(title, "en") or pick_text(title, "zh")
  
  
 def build_rerank_doc(doc: Dict[str, Any]) -> str:
@@ -82,14 +92,15 @@ def build_rerank_doc(doc: Dict[str, Any]) -> str:
 def build_label_doc_line(idx: int, doc: Dict[str, Any]) -> str:
     title = build_display_title(doc)
     option1, option2, option3 = compact_option_values(doc.get("skus") or [])
-    vendor = pick_text(doc.get("vendor"), "en")
-    category = pick_text(doc.get("category_path"), "en") or pick_text(doc.get("category_name"), "en")
     parts = [title]
     if option1:
-        parts.append(f"{option1}")
+        parts.append(_truncate_text(option1, _LABEL_OPTION_MAX_CHARS))
     if option2:
-        parts.append(f"{option2}")
-    return f"{idx}. " + " ".join(part for part in parts if part)
+        parts.append(_truncate_text(option2, _LABEL_OPTION_MAX_CHARS))
+    if option3:
+        parts.append(_truncate_text(option3, _LABEL_OPTION_MAX_CHARS))
+    line = " ".join(part for part in parts if part)
+    return _truncate_text(f"{idx}. {line}", _LABEL_DOC_LINE_MAX_CHARS)
  
  
 def compact_product_payload(doc: Dict[str, Any]) -> Dict[str, Any]:
@@ -0,0 +1,71 @@
+import pytest
+import requests
+
+from scripts.evaluation.eval_framework.clients import DashScopeLabelClient
+from scripts.evaluation.eval_framework.utils import build_label_doc_line
+
+
+def _http_error(status_code: int, body: str) -> requests.exceptions.HTTPError:
+    response = requests.Response()
+    response.status_code = status_code
+    response._content = body.encode("utf-8")
+    response.url = "https://dashscope-us.aliyuncs.com/compatible-mode/v1/chat/completions"
+    return requests.exceptions.HTTPError(f"{status_code} error", response=response)
+
+
+def test_build_label_doc_line_truncates_long_fields():
+    doc = {
+        "title": {"en": "Minimalist Top " * 40},
+        "skus": [
+            {
+                "option1_value": "Very Long Color Name " * 10,
+                "option2_value": "Very Long Size Name " * 10,
+                "option3_value": "Very Long Material Name " * 10,
+            }
+        ],
+    }
+
+    line = build_label_doc_line(1, doc)
+
+    assert line.startswith("1. ")
+    assert len(line) <= 260
+    assert line.endswith("...")
+
+
+def test_dashscope_chat_does_not_retry_non_transient_400(monkeypatch):
+    client = DashScopeLabelClient(model="qwen3.5-plus", base_url="https://example.com", api_key="test")
+    client.retry_attempts = 4
+    client.retry_delay_sec = 0
+    calls = {"count": 0}
+
+    def fake_chat_sync(prompt: str):
+        calls["count"] += 1
+        raise _http_error(400, '{"code":"InvalidParameter"}')
+
+    monkeypatch.setattr(client, "_chat_sync", fake_chat_sync)
+
+    with pytest.raises(requests.exceptions.HTTPError):
+        client._chat("prompt", phase="relevance_classify")
+
+    assert calls["count"] == 1
+
+
+def test_dashscope_chat_retries_transient_500(monkeypatch):
+    client = DashScopeLabelClient(model="qwen3.5-plus", base_url="https://example.com", api_key="test")
+    client.retry_attempts = 4
+    client.retry_delay_sec = 0
+    calls = {"count": 0}
+
+    def fake_chat_sync(prompt: str):
+        calls["count"] += 1
+        if calls["count"] < 3:
+            raise _http_error(500, '{"code":"InternalError"}')
+        return "Exact Match", '{"choices":[{"message":{"content":"Exact Match"}}]}'
+
+    monkeypatch.setattr(client, "_chat_sync", fake_chat_sync)
+
+    content, raw = client._chat("prompt", phase="relevance_classify")
+
+    assert content == "Exact Match"
+    assert "Exact Match" in raw
+    assert calls["count"] == 3
@@ -136,20 +136,24 @@ def test_parse_reuses_tokenization_across_tail_stages(monkeypatch):
     assert tokenize_calls == []
  
  
-def test_parse_fast_path_detects_ascii_query_as_english_without_lingua(monkeypatch):
+def test_parse_ascii_latin_query_uses_language_detector(monkeypatch):
     parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer)
+    detector_calls = []
+
     monkeypatch.setattr(
         parser.language_detector,
         "detect",
-        lambda text: (_ for _ in ()).throw(AssertionError("Lingua path should not be used")),
+        lambda text: detector_calls.append(text) or "es",
     )
  
     result = parser.parse(
-        "street t-shirt women",
+        "falda negra oficina",
         tenant_id="0",
         generate_vector=False,
         target_languages=["en", "zh"],
     )
  
-    assert result.detected_language == "en"
-    assert result.query_tokens == ["street", "t-shirt", "women"]
+    assert detector_calls == ["falda negra oficina"]
+    assert result.detected_language == "es"
+    assert result.translations == {"en": "falda negra oficina-en", "zh": "falda negra oficina-zh"}
+    assert result.query_tokens == ["falda", "negra", "oficina"]