Commit a3734f13ec48c457670d4cd68fe3e865bc0a18fe

Authored by tangwang
1 parent a345b01f

eval任务 美国地区不支持batch调用,改为在线调用

scripts/evaluation/eval_framework/clients.py
@@ -74,6 +74,10 @@ class DashScopeLabelClient:
     """DashScope OpenAI-compatible chat: synchronous or Batch File API (JSONL job).
 
     Batch flow: https://help.aliyun.com/zh/model-studio/batch-interfaces-compatible-with-openai/
+
+    Some regional endpoints (e.g. ``dashscope-us`` compatible-mode) do not implement ``/batches``;
+    on HTTP 404 from batch calls we fall back to synchronous ``/chat/completions`` and stop using batch
+    for subsequent requests on this client.
     """
 
     def __init__(
@@ -86,7 +90,7 @@ class DashScopeLabelClient:
         batch_completion_window: str = "24h",
         batch_poll_interval_sec: float = 10.0,
         enable_thinking: bool = True,
-        use_batch: bool = True,
+        use_batch: bool = False,
     ):
         self.model = model
         self.base_url = base_url.rstrip("/")
@@ -205,9 +209,16 @@ class DashScopeLabelClient:
         return content, safe_json_dumps(row)
 
     def _chat(self, prompt: str) -> Tuple[str, str]:
-        if self.use_batch:
+        if not self.use_batch:
+            return self._chat_sync(prompt)
+        try:
             return self._chat_batch(prompt)
-        return self._chat_sync(prompt)
+        except requests.exceptions.HTTPError as e:
+            resp = getattr(e, "response", None)
+            if resp is not None and resp.status_code == 404:
+                self.use_batch = False
+                return self._chat_sync(prompt)
+            raise
 
     def _find_batch_line_for_custom_id(
         self,
scripts/evaluation/eval_framework/constants.py
@@ -37,7 +37,7 @@ DEFAULT_QUERY_FILE = _SCRIPTS_EVAL_DIR / "queries" / "queries.txt"
 # Judge LLM (eval_framework only; override via CLI --judge-model / constructor kwargs)
 DEFAULT_JUDGE_MODEL = "qwen3.5-flash"
 DEFAULT_JUDGE_ENABLE_THINKING = True
-DEFAULT_JUDGE_DASHSCOPE_BATCH = True
+DEFAULT_JUDGE_DASHSCOPE_BATCH = False
 DEFAULT_JUDGE_BATCH_COMPLETION_WINDOW = "24h"
 DEFAULT_JUDGE_BATCH_POLL_INTERVAL_SEC = 10.0
 