Commit a3734f13ec48c457670d4cd68fe3e865bc0a18fe

Authored by tangwang
1 parent a345b01f

eval任务：美国地区不支持 batch 调用，改为在线（同步）调用

scripts/evaluation/eval_framework/clients.py
... ... @@ -74,6 +74,10 @@ class DashScopeLabelClient:
74 74 """DashScope OpenAI-compatible chat: synchronous or Batch File API (JSONL job).
75 75  
76 76 Batch flow: https://help.aliyun.com/zh/model-studio/batch-interfaces-compatible-with-openai/
  77 +
  78 + Some regional endpoints (e.g. ``dashscope-us`` compatible-mode) do not implement ``/batches``;
  79 + on HTTP 404 from batch calls we fall back to synchronous ``/chat/completions`` and stop using batch
  80 + for subsequent requests on this client.
77 81 """
78 82  
79 83 def __init__(
... ... @@ -86,7 +90,7 @@ class DashScopeLabelClient:
86 90 batch_completion_window: str = "24h",
87 91 batch_poll_interval_sec: float = 10.0,
88 92 enable_thinking: bool = True,
89   - use_batch: bool = True,
  93 + use_batch: bool = False,
90 94 ):
91 95 self.model = model
92 96 self.base_url = base_url.rstrip("/")
... ... @@ -205,9 +209,16 @@ class DashScopeLabelClient:
205 209 return content, safe_json_dumps(row)
206 210  
207 211 def _chat(self, prompt: str) -> Tuple[str, str]:
208   - if self.use_batch:
  212 + if not self.use_batch:
  213 + return self._chat_sync(prompt)
  214 + try:
209 215 return self._chat_batch(prompt)
210   - return self._chat_sync(prompt)
  216 + except requests.exceptions.HTTPError as e:
  217 + resp = getattr(e, "response", None)
  218 + if resp is not None and resp.status_code == 404:
  219 + self.use_batch = False
  220 + return self._chat_sync(prompt)
  221 + raise
211 222  
212 223 def _find_batch_line_for_custom_id(
213 224 self,
... ...
scripts/evaluation/eval_framework/constants.py
... ... @@ -37,7 +37,7 @@ DEFAULT_QUERY_FILE = _SCRIPTS_EVAL_DIR / "queries" / "queries.txt"
37 37 # Judge LLM (eval_framework only; override via CLI --judge-model / constructor kwargs)
38 38 DEFAULT_JUDGE_MODEL = "qwen3.5-flash"
39 39 DEFAULT_JUDGE_ENABLE_THINKING = True
40   -DEFAULT_JUDGE_DASHSCOPE_BATCH = True
  40 +DEFAULT_JUDGE_DASHSCOPE_BATCH = False
41 41 DEFAULT_JUDGE_BATCH_COMPLETION_WINDOW = "24h"
42 42 DEFAULT_JUDGE_BATCH_POLL_INTERVAL_SEC = 10.0
43 43  
... ...