Commit a3734f13ec48c457670d4cd68fe3e865bc0a18fe

Authored by tangwang
1 parent a345b01f

eval任务 美国地区不支持batch调用,改为在线调用

scripts/evaluation/eval_framework/clients.py
@@ -74,6 +74,10 @@ class DashScopeLabelClient:
     """DashScope OpenAI-compatible chat: synchronous or Batch File API (JSONL job).
 
     Batch flow: https://help.aliyun.com/zh/model-studio/batch-interfaces-compatible-with-openai/
+
+    Some regional endpoints (e.g. ``dashscope-us`` compatible-mode) do not implement ``/batches``;
+    on HTTP 404 from batch calls we fall back to synchronous ``/chat/completions`` and stop using batch
+    for subsequent requests on this client.
     """
 
     def __init__(
@@ -86,7 +90,7 @@ class DashScopeLabelClient:
         batch_completion_window: str = "24h",
         batch_poll_interval_sec: float = 10.0,
         enable_thinking: bool = True,
-        use_batch: bool = True,
+        use_batch: bool = False,
     ):
         self.model = model
         self.base_url = base_url.rstrip("/")
@@ -205,9 +209,16 @@ class DashScopeLabelClient:
         return content, safe_json_dumps(row)
 
     def _chat(self, prompt: str) -> Tuple[str, str]:
-        if self.use_batch:
+        if not self.use_batch:
+            return self._chat_sync(prompt)
+        try:
             return self._chat_batch(prompt)
-        return self._chat_sync(prompt)
+        except requests.exceptions.HTTPError as e:
+            resp = getattr(e, "response", None)
+            if resp is not None and resp.status_code == 404:
+                self.use_batch = False
+                return self._chat_sync(prompt)
+            raise
 
     def _find_batch_line_for_custom_id(
         self,
scripts/evaluation/eval_framework/constants.py
@@ -37,7 +37,7 @@ DEFAULT_QUERY_FILE = _SCRIPTS_EVAL_DIR / "queries" / "queries.txt"
 # Judge LLM (eval_framework only; override via CLI --judge-model / constructor kwargs)
 DEFAULT_JUDGE_MODEL = "qwen3.5-flash"
 DEFAULT_JUDGE_ENABLE_THINKING = True
-DEFAULT_JUDGE_DASHSCOPE_BATCH = True
+DEFAULT_JUDGE_DASHSCOPE_BATCH = False
 DEFAULT_JUDGE_BATCH_COMPLETION_WINDOW = "24h"
 DEFAULT_JUDGE_BATCH_POLL_INTERVAL_SEC = 10.0
 