diff --git a/docs/常用查询 - ES.md b/docs/常用查询 - ES.md index 459ce80..ed3927d 100644 --- a/docs/常用查询 - ES.md +++ b/docs/常用查询 - ES.md @@ -659,7 +659,7 @@ curl -u 'saas:4hOaLaf41y2VuI8y' -X POST \ "query": { "bool": { "filter": [ - { "exists": { "field": "image_embedding" } } + { "exists": { "field": "title_embedding" } } ] } } diff --git a/query/query_parser.py b/query/query_parser.py index 3a36e37..904603f 100644 --- a/query/query_parser.py +++ b/query/query_parser.py @@ -27,7 +27,7 @@ from .product_title_exclusion import ( ) from .query_rewriter import QueryRewriter, QueryNormalizer from .style_intent import StyleIntentDetector, StyleIntentProfile, StyleIntentRegistry -from .tokenization import QueryTextAnalysisCache, contains_han_text, extract_token_strings +from .tokenization import QueryTextAnalysisCache, extract_token_strings from .keyword_extractor import KeywordExtractor, collect_keywords_queries logger = logging.getLogger(__name__) @@ -289,31 +289,12 @@ class QueryParser: def _get_query_tokens(self, query: str) -> List[str]: return self._extract_tokens(self._tokenizer(query)) - @staticmethod - def _is_ascii_latin_query(text: str) -> bool: - candidate = str(text or "").strip() - if not candidate or contains_han_text(candidate): - return False - try: - candidate.encode("ascii") - except UnicodeEncodeError: - return False - return any(ch.isalpha() for ch in candidate) - def _detect_query_language( self, query_text: str, *, target_languages: Optional[List[str]] = None, ) -> str: - normalized_targets = self._normalize_language_codes(target_languages) - supported_languages = self._normalize_language_codes( - getattr(self.config.query_config, "supported_languages", None) - ) - active_languages = normalized_targets or supported_languages - if active_languages and set(active_languages).issubset({"en", "zh"}): - if self._is_ascii_latin_query(query_text): - return "en" return self.language_detector.detect(query_text) def parse( diff --git a/scripts/evaluation/eval_framework/clients.py b/scripts/evaluation/eval_framework/clients.py index 17b8196..95b230e 100644 --- a/scripts/evaluation/eval_framework/clients.py +++ b/scripts/evaluation/eval_framework/clients.py @@ -82,6 +82,18 @@ def _canonicalize_judge_label(raw: str) -> str | None: return None +def _describe_request_exception(exc: requests.exceptions.RequestException) -> str: + if isinstance(exc, requests.exceptions.HTTPError): + response = getattr(exc, "response", None) + if response is None: + return str(exc) + body = str(getattr(response, "text", "") or "").strip() + if len(body) > 600: + body = body[:600].rstrip() + "...[truncated]" + return f"status={response.status_code} body={body or ''}" + return str(exc) + + class SearchServiceClient: def __init__(self, base_url: str, tenant_id: str): self.base_url = base_url.rstrip("/") @@ -341,7 +353,8 @@ class DashScopeLabelClient: except Exception as exc: last_exc = exc is_request_error = isinstance(exc, requests.exceptions.RequestException) - if not is_request_error or attempt >= self.retry_attempts: + is_transient = is_request_error and self._is_transient_request_error(exc) + if not is_transient or attempt >= self.retry_attempts: raise _client_log.warning( "Transient DashScope error, retrying (%s/%s): phase=%s model=%s use_batch=%s error=%s", @@ -350,7 +363,7 @@ class DashScopeLabelClient: phase, self.model, self.use_batch, - exc, + _describe_request_exception(exc), ) time.sleep(self.retry_delay_sec) else: @@ -366,6 +379,17 @@ class DashScopeLabelClient: ) return content, raw + @staticmethod + def _is_transient_request_error(exc: requests.exceptions.RequestException) -> bool: + if isinstance(exc, (requests.exceptions.ConnectionError, requests.exceptions.Timeout)): + return True + if isinstance(exc, requests.exceptions.HTTPError): + response = getattr(exc, "response", None) + if response is None: + return True + return int(response.status_code) in _TRANSIENT_HTTP_STATUS_CODES + return False + def _find_batch_line_for_custom_id( self, file_id: Optional[str], diff --git a/scripts/evaluation/eval_framework/utils.py b/scripts/evaluation/eval_framework/utils.py index 7ae33a5..bdb9643 100644 --- a/scripts/evaluation/eval_framework/utils.py +++ b/scripts/evaluation/eval_framework/utils.py @@ -11,6 +11,20 @@ from typing import Any, Dict, List, Sequence, Tuple from .constants import PROJECT_ROOT +_LABEL_OPTION_MAX_CHARS = 40 +_LABEL_DOC_LINE_MAX_CHARS = 260 + + +def _truncate_text(value: Any, max_chars: int) -> str: + text = str(value or "").strip() + if max_chars <= 0: + return "" + if len(text) <= max_chars: + return text + if max_chars <= 3: + return text[:max_chars] + return text[: max_chars - 3].rstrip() + "..." + def utc_now_iso() -> str: return datetime.now(timezone.utc).isoformat() @@ -67,11 +81,7 @@ def compact_option_values(skus: Sequence[Dict[str, Any]]) -> Tuple[str, str, str def build_display_title(doc: Dict[str, Any]) -> str: title = doc.get("title") - en = pick_text(title, "en") - zh = pick_text(title, "zh") - if en and zh and en != zh: - return f"{en} / {zh}" - return en or zh + return pick_text(title, "en") or pick_text(title, "zh") def build_rerank_doc(doc: Dict[str, Any]) -> str: @@ -82,14 +92,15 @@ def build_rerank_doc(doc: Dict[str, Any]) -> str: def build_label_doc_line(idx: int, doc: Dict[str, Any]) -> str: title = build_display_title(doc) option1, option2, option3 = compact_option_values(doc.get("skus") or []) - vendor = pick_text(doc.get("vendor"), "en") - category = pick_text(doc.get("category_path"), "en") or pick_text(doc.get("category_name"), "en") parts = [title] if option1: - parts.append(f"{option1}") + parts.append(_truncate_text(option1, _LABEL_OPTION_MAX_CHARS)) if option2: - parts.append(f"{option2}") - return f"{idx}. " + " ".join(part for part in parts if part) + parts.append(_truncate_text(option2, _LABEL_OPTION_MAX_CHARS)) + if option3: + parts.append(_truncate_text(option3, _LABEL_OPTION_MAX_CHARS)) + line = " ".join(part for part in parts if part) + return _truncate_text(f"{idx}. {line}", _LABEL_DOC_LINE_MAX_CHARS) def compact_product_payload(doc: Dict[str, Any]) -> Dict[str, Any]: diff --git a/tests/test_eval_framework_clients.py b/tests/test_eval_framework_clients.py new file mode 100644 index 0000000..150fcab --- /dev/null +++ b/tests/test_eval_framework_clients.py @@ -0,0 +1,71 @@ +import pytest +import requests + +from scripts.evaluation.eval_framework.clients import DashScopeLabelClient +from scripts.evaluation.eval_framework.utils import build_label_doc_line + + +def _http_error(status_code: int, body: str) -> requests.exceptions.HTTPError: + response = requests.Response() + response.status_code = status_code + response._content = body.encode("utf-8") + response.url = "https://dashscope-us.aliyuncs.com/compatible-mode/v1/chat/completions" + return requests.exceptions.HTTPError(f"{status_code} error", response=response) + + +def test_build_label_doc_line_truncates_long_fields(): + doc = { + "title": {"en": "Minimalist Top " * 40}, + "skus": [ + { + "option1_value": "Very Long Color Name " * 10, + "option2_value": "Very Long Size Name " * 10, + "option3_value": "Very Long Material Name " * 10, + } + ], + } + + line = build_label_doc_line(1, doc) + + assert line.startswith("1. ") + assert len(line) <= 260 + assert line.endswith("...") + + +def test_dashscope_chat_does_not_retry_non_transient_400(monkeypatch): + client = DashScopeLabelClient(model="qwen3.5-plus", base_url="https://example.com", api_key="test") + client.retry_attempts = 4 + client.retry_delay_sec = 0 + calls = {"count": 0} + + def fake_chat_sync(prompt: str): + calls["count"] += 1 + raise _http_error(400, '{"code":"InvalidParameter"}') + + monkeypatch.setattr(client, "_chat_sync", fake_chat_sync) + + with pytest.raises(requests.exceptions.HTTPError): + client._chat("prompt", phase="relevance_classify") + + assert calls["count"] == 1 + + +def test_dashscope_chat_retries_transient_500(monkeypatch): + client = DashScopeLabelClient(model="qwen3.5-plus", base_url="https://example.com", api_key="test") + client.retry_attempts = 4 + client.retry_delay_sec = 0 + calls = {"count": 0} + + def fake_chat_sync(prompt: str): + calls["count"] += 1 + if calls["count"] < 3: + raise _http_error(500, '{"code":"InternalError"}') + return "Exact Match", '{"choices":[{"message":{"content":"Exact Match"}}]}' + + monkeypatch.setattr(client, "_chat_sync", fake_chat_sync) + + content, raw = client._chat("prompt", phase="relevance_classify") + + assert content == "Exact Match" + assert "Exact Match" in raw + assert calls["count"] == 3 diff --git a/tests/test_query_parser_mixed_language.py b/tests/test_query_parser_mixed_language.py index ec8e385..b615fd6 100644 --- a/tests/test_query_parser_mixed_language.py +++ b/tests/test_query_parser_mixed_language.py @@ -136,20 +136,24 @@ def test_parse_reuses_tokenization_across_tail_stages(monkeypatch): assert tokenize_calls == [] -def test_parse_fast_path_detects_ascii_query_as_english_without_lingua(monkeypatch): +def test_parse_ascii_latin_query_uses_language_detector(monkeypatch): parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer) + detector_calls = [] + monkeypatch.setattr( parser.language_detector, "detect", - lambda text: (_ for _ in ()).throw(AssertionError("Lingua path should not be used")), + lambda text: detector_calls.append(text) or "es", ) result = parser.parse( - "street t-shirt women", + "falda negra oficina", tenant_id="0", generate_vector=False, target_languages=["en", "zh"], ) - assert result.detected_language == "en" - assert result.query_tokens == ["street", "t-shirt", "women"] + assert detector_calls == ["falda negra oficina"] + assert result.detected_language == "es" + assert result.translations == {"en": "falda negra oficina-en", "zh": "falda negra oficina-zh"} + assert result.query_tokens == ["falda", "negra", "oficina"] -- libgit2 0.21.2