Commit a3734f13ec48c457670d4cd68fe3e865bc0a18fe
1 parent
a345b01f
eval任务 美国地区不支持batch调用,改为在线调用 (eval task: the US region endpoint does not support batch calls; switched to synchronous online calls)
Showing 2 changed files with 15 additions and 4 deletions
scripts/evaluation/eval_framework/clients.py
| @@ -74,6 +74,10 @@ class DashScopeLabelClient: | @@ -74,6 +74,10 @@ class DashScopeLabelClient: | ||
| 74 | """DashScope OpenAI-compatible chat: synchronous or Batch File API (JSONL job). | 74 | """DashScope OpenAI-compatible chat: synchronous or Batch File API (JSONL job). |
| 75 | 75 | ||
| 76 | Batch flow: https://help.aliyun.com/zh/model-studio/batch-interfaces-compatible-with-openai/ | 76 | Batch flow: https://help.aliyun.com/zh/model-studio/batch-interfaces-compatible-with-openai/ |
| 77 | + | ||
| 78 | + Some regional endpoints (e.g. ``dashscope-us`` compatible-mode) do not implement ``/batches``; | ||
| 79 | + on HTTP 404 from batch calls we fall back to synchronous ``/chat/completions`` and stop using batch | ||
| 80 | + for subsequent requests on this client. | ||
| 77 | """ | 81 | """ |
| 78 | 82 | ||
| 79 | def __init__( | 83 | def __init__( |
| @@ -86,7 +90,7 @@ class DashScopeLabelClient: | @@ -86,7 +90,7 @@ class DashScopeLabelClient: | ||
| 86 | batch_completion_window: str = "24h", | 90 | batch_completion_window: str = "24h", |
| 87 | batch_poll_interval_sec: float = 10.0, | 91 | batch_poll_interval_sec: float = 10.0, |
| 88 | enable_thinking: bool = True, | 92 | enable_thinking: bool = True, |
| 89 | - use_batch: bool = True, | 93 | + use_batch: bool = False, |
| 90 | ): | 94 | ): |
| 91 | self.model = model | 95 | self.model = model |
| 92 | self.base_url = base_url.rstrip("/") | 96 | self.base_url = base_url.rstrip("/") |
| @@ -205,9 +209,16 @@ class DashScopeLabelClient: | @@ -205,9 +209,16 @@ class DashScopeLabelClient: | ||
| 205 | return content, safe_json_dumps(row) | 209 | return content, safe_json_dumps(row) |
| 206 | 210 | ||
| 207 | def _chat(self, prompt: str) -> Tuple[str, str]: | 211 | def _chat(self, prompt: str) -> Tuple[str, str]: |
| 208 | - if self.use_batch: | 212 | + if not self.use_batch: |
| 213 | + return self._chat_sync(prompt) | ||
| 214 | + try: | ||
| 209 | return self._chat_batch(prompt) | 215 | return self._chat_batch(prompt) |
| 210 | - return self._chat_sync(prompt) | 216 | + except requests.exceptions.HTTPError as e: |
| 217 | + resp = getattr(e, "response", None) | ||
| 218 | + if resp is not None and resp.status_code == 404: | ||
| 219 | + self.use_batch = False | ||
| 220 | + return self._chat_sync(prompt) | ||
| 221 | + raise | ||
| 211 | 222 | ||
| 212 | def _find_batch_line_for_custom_id( | 223 | def _find_batch_line_for_custom_id( |
| 213 | self, | 224 | self, |
scripts/evaluation/eval_framework/constants.py
| @@ -37,7 +37,7 @@ DEFAULT_QUERY_FILE = _SCRIPTS_EVAL_DIR / "queries" / "queries.txt" | @@ -37,7 +37,7 @@ DEFAULT_QUERY_FILE = _SCRIPTS_EVAL_DIR / "queries" / "queries.txt" | ||
| 37 | # Judge LLM (eval_framework only; override via CLI --judge-model / constructor kwargs) | 37 | # Judge LLM (eval_framework only; override via CLI --judge-model / constructor kwargs) |
| 38 | DEFAULT_JUDGE_MODEL = "qwen3.5-flash" | 38 | DEFAULT_JUDGE_MODEL = "qwen3.5-flash" |
| 39 | DEFAULT_JUDGE_ENABLE_THINKING = True | 39 | DEFAULT_JUDGE_ENABLE_THINKING = True |
| 40 | -DEFAULT_JUDGE_DASHSCOPE_BATCH = True | 40 | +DEFAULT_JUDGE_DASHSCOPE_BATCH = False |
| 41 | DEFAULT_JUDGE_BATCH_COMPLETION_WINDOW = "24h" | 41 | DEFAULT_JUDGE_BATCH_COMPLETION_WINDOW = "24h" |
| 42 | DEFAULT_JUDGE_BATCH_POLL_INTERVAL_SEC = 10.0 | 42 | DEFAULT_JUDGE_BATCH_POLL_INTERVAL_SEC = 10.0 |
| 43 | 43 |