Commit a7bb846ca0d6aa2bc446efa63b8cf782cbb63b39 (1 parent: c6da6bca)
monitor
Showing 9 changed files with 213 additions and 47 deletions
Show diff stats
| ... | ... | @@ -36,7 +36,7 @@ API_BASE_URL=http://43.166.252.75:6002 |
| 36 | 36 | |
| 37 | 37 | |
| 38 | 38 | # 通用 DashScope key(翻译/内容理解等模块) |
| 39 | -DASHSCOPE_API_KEY=sk-c3b8d4db061840aa8effb748df2a997b | |
| 39 | +DASHSCOPE_API_KEY=sk-482cc3ff37a8467dab134a7a46830556 | |
| 40 | 40 | # Reranker 专用 key(按地域) |
| 41 | 41 | RERANK_DASHSCOPE_API_KEY_CN=sk-c3b8d4db061840aa8effb748df2a997b |
| 42 | 42 | RERANK_DASHSCOPE_API_KEY_US=sk-482cc3ff37a8467dab134a7a46830556 | ... | ... |
| ... | ... | @@ -0,0 +1,17 @@ |
| 1 | + | |
| 2 | +# Please enter the commit message for your changes. Lines starting | |
| 3 | +# with '#' will be ignored, and an empty message aborts the commit. | |
| 4 | +# | |
| 5 | +# On branch master | |
| 6 | +# Your branch is ahead of 'origin/master' by 3 commits. | |
| 7 | +# (use "git push" to publish your local commits) | |
| 8 | +# | |
| 9 | +# Changes to be committed: | |
| 10 | +# modified: README.md | |
| 11 | +# modified: docs/Usage-Guide.md | |
| 12 | +# modified: scripts/service_ctl.sh | |
| 13 | +# new file: status.sh | |
| 14 | +# | |
| 15 | +# Changes not staged for commit: | |
| 16 | +# modified: third-party/clip-as-service (untracked content) | |
| 17 | +# | ... | ... |
query/translator.py
| ... | ... | @@ -243,21 +243,22 @@ class Translator: |
| 243 | 243 | else: # deepl |
| 244 | 244 | result = self._translate_deepl(text, target_lang, source_lang, translation_context, prompt) |
| 245 | 245 | |
| 246 | - # If still failed, return original text with warning | |
| 246 | + # Surface translation failure to the caller instead of silently | |
| 247 | + # masquerading the source text as a successful translation. | |
| 247 | 248 | if result is None: |
| 248 | 249 | logger.warning( |
| 249 | 250 | f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " |
| 250 | - f"Source language: {source_lang or 'auto'} | Result: '{text}' | Status: Translation failed, returning original" | |
| 251 | + f"Source language: {source_lang or 'auto'} | Status: Translation failed" | |
| 251 | 252 | ) |
| 252 | - result = text | |
| 253 | 253 | else: |
| 254 | 254 | logger.info( |
| 255 | 255 | f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " |
| 256 | 256 | f"Source language: {source_lang or 'auto'} | Result: '{result}' | Status: Translation successful" |
| 257 | 257 | ) |
| 258 | 258 | |
| 259 | - # Cache result | |
| 260 | - if result and self.use_cache and self.redis_client: | |
| 259 | + # Cache only successful translations. Failed attempts must not poison | |
| 260 | + # Redis with the original text. | |
| 261 | + if result is not None and self.use_cache and self.redis_client: | |
| 261 | 262 | self._set_cached_translation_redis(text, target_lang, result, source_lang, translation_context, prompt) |
| 262 | 263 | |
| 263 | 264 | return result | ... | ... |
scripts/service_ctl.sh
| ... | ... | @@ -112,12 +112,68 @@ monitor_targets_file() { |
| 112 | 112 | echo "${LOG_DIR}/service-monitor.targets" |
| 113 | 113 | } |
| 114 | 114 | |
| 115 | +sync_monitor_daemon_state() { | |
| 116 | + local pf | |
| 117 | + pf="$(monitor_pid_file)" | |
| 118 | + local tf | |
| 119 | + tf="$(monitor_targets_file)" | |
| 120 | + | |
| 121 | + if [ ! -f "${pf}" ]; then | |
| 122 | + return 1 | |
| 123 | + fi | |
| 124 | + | |
| 125 | + local pid | |
| 126 | + pid="$(cat "${pf}" 2>/dev/null || true)" | |
| 127 | + if [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then | |
| 128 | + return 0 | |
| 129 | + fi | |
| 130 | + | |
| 131 | + rm -f "${pf}" "${tf}" | |
| 132 | + return 1 | |
| 133 | +} | |
| 134 | + | |
| 115 | 135 | monitor_current_targets() { |
| 116 | 136 | if [ -f "$(monitor_targets_file)" ]; then |
| 117 | 137 | cat "$(monitor_targets_file)" 2>/dev/null || true |
| 118 | 138 | fi |
| 119 | 139 | } |
| 120 | 140 | |
| 141 | +merge_targets() { | |
| 142 | + local base="${1:-}" | |
| 143 | + local extra="${2:-}" | |
| 144 | + local merged="" | |
| 145 | + merged="$(normalize_targets "${base} ${extra}")" | |
| 146 | + if [ -n "${merged}" ]; then | |
| 147 | + merged="$(apply_target_order monitor "${merged}")" | |
| 148 | + fi | |
| 149 | + echo "${merged}" | |
| 150 | +} | |
| 151 | + | |
| 152 | +subtract_targets() { | |
| 153 | + local base="${1:-}" | |
| 154 | + local remove="${2:-}" | |
| 155 | + local out="" | |
| 156 | + local svc | |
| 157 | + declare -A removed=() | |
| 158 | + | |
| 159 | + for svc in ${remove}; do | |
| 160 | + removed["${svc}"]=1 | |
| 161 | + done | |
| 162 | + | |
| 163 | + for svc in ${base}; do | |
| 164 | + if [ "${removed[${svc}]:-0}" != "1" ]; then | |
| 165 | + out="${out} ${svc}" | |
| 166 | + fi | |
| 167 | + done | |
| 168 | + | |
| 169 | + out="${out# }" | |
| 170 | + if [ -n "${out}" ]; then | |
| 171 | + out="$(normalize_targets "${out}")" | |
| 172 | + out="$(apply_target_order monitor "${out}")" | |
| 173 | + fi | |
| 174 | + echo "${out}" | |
| 175 | +} | |
| 176 | + | |
| 121 | 177 | monitor_log_event() { |
| 122 | 178 | local service="$1" |
| 123 | 179 | local level="$2" |
| ... | ... | @@ -125,7 +181,11 @@ monitor_log_event() { |
| 125 | 181 | local ts line |
| 126 | 182 | ts="$(date '+%F %T')" |
| 127 | 183 | line="[${ts}] [${level}] [${service}] ${message}" |
| 128 | - echo "${line}" | tee -a "$(monitor_log_file)" | |
| 184 | + if [ -t 1 ]; then | |
| 185 | + echo "${line}" | tee -a "$(monitor_log_file)" | |
| 186 | + else | |
| 187 | + echo "${line}" >> "$(monitor_log_file)" | |
| 188 | + fi | |
| 129 | 189 | } |
| 130 | 190 | |
| 131 | 191 | require_positive_int() { |
| ... | ... | @@ -156,9 +216,13 @@ service_healthy_now() { |
| 156 | 216 | |
| 157 | 217 | port="$(get_port "${service}")" |
| 158 | 218 | path="$(health_path_for_service "${service}")" |
| 159 | - if [ -z "${port}" ] || [ -z "${path}" ]; then | |
| 219 | + if [ -z "${port}" ]; then | |
| 160 | 220 | return 1 |
| 161 | 221 | fi |
| 222 | + if [ -z "${path}" ]; then | |
| 223 | + is_running_by_pid "${service}" || is_running_by_port "${service}" | |
| 224 | + return | |
| 225 | + fi | |
| 162 | 226 | if ! is_running_by_port "${service}"; then |
| 163 | 227 | return 1 |
| 164 | 228 | fi |
| ... | ... | @@ -228,6 +292,17 @@ monitor_services() { |
| 228 | 292 | |
| 229 | 293 | touch "$(monitor_log_file)" |
| 230 | 294 | |
| 295 | + if [ "${MONITOR_DAEMON:-0}" = "1" ]; then | |
| 296 | + echo "$$" > "$(monitor_pid_file)" | |
| 297 | + echo "${targets}" > "$(monitor_targets_file)" | |
| 298 | + trap ' | |
| 299 | + current_pid="$(cat "$(monitor_pid_file)" 2>/dev/null || true)" | |
| 300 | + if [ "${current_pid}" = "$$" ]; then | |
| 301 | + rm -f "$(monitor_pid_file)" "$(monitor_targets_file)" | |
| 302 | + fi | |
| 303 | + ' EXIT | |
| 304 | + fi | |
| 305 | + | |
| 231 | 306 | declare -A fail_streak=() |
| 232 | 307 | declare -A last_restart_epoch=() |
| 233 | 308 | declare -A restart_history=() |
| ... | ... | @@ -295,14 +370,7 @@ monitor_services() { |
| 295 | 370 | } |
| 296 | 371 | |
| 297 | 372 | is_monitor_daemon_running() { |
| 298 | - local pf | |
| 299 | - pf="$(monitor_pid_file)" | |
| 300 | - if [ ! -f "${pf}" ]; then | |
| 301 | - return 1 | |
| 302 | - fi | |
| 303 | - local pid | |
| 304 | - pid="$(cat "${pf}" 2>/dev/null || true)" | |
| 305 | - [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null | |
| 373 | + sync_monitor_daemon_state | |
| 306 | 374 | } |
| 307 | 375 | |
| 308 | 376 | stop_monitor_daemon() { |
| ... | ... | @@ -348,9 +416,15 @@ start_monitor_daemon() { |
| 348 | 416 | fi |
| 349 | 417 | |
| 350 | 418 | echo "${targets}" > "${tf}" |
| 351 | - nohup "${PROJECT_ROOT}/scripts/service_ctl.sh" monitor ${targets} >/dev/null 2>&1 & | |
| 419 | + MONITOR_DAEMON=1 nohup "${PROJECT_ROOT}/scripts/service_ctl.sh" monitor ${targets} >> "$(monitor_log_file)" 2>&1 & | |
| 352 | 420 | local pid=$! |
| 353 | 421 | echo "${pid}" > "${pf}" |
| 422 | + sleep 1 | |
| 423 | + if ! kill -0 "${pid}" 2>/dev/null; then | |
| 424 | + rm -f "${pf}" "${tf}" | |
| 425 | + echo "[error] monitor daemon failed to stay alive, inspect $(monitor_log_file)" >&2 | |
| 426 | + return 1 | |
| 427 | + fi | |
| 354 | 428 | echo "[ok] monitor daemon started (pid=${pid}, targets=[${targets}], log=$(monitor_log_file))" |
| 355 | 429 | } |
| 356 | 430 | |
| ... | ... | @@ -378,7 +452,11 @@ is_running_by_pid() { |
| 378 | 452 | fi |
| 379 | 453 | local pid |
| 380 | 454 | pid="$(cat "${pf}" 2>/dev/null || true)" |
| 381 | - [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null | |
| 455 | + if [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then | |
| 456 | + return 0 | |
| 457 | + fi | |
| 458 | + rm -f "${pf}" | |
| 459 | + return 1 | |
| 382 | 460 | } |
| 383 | 461 | |
| 384 | 462 | is_running_by_port() { |
| ... | ... | @@ -403,6 +481,34 @@ get_cnclip_flow_device() { |
| 403 | 481 | sed -n "s/^[[:space:]]*device:[[:space:]]*'\\([^']*\\)'.*/\\1/p" "${flow_file}" | head -n 1 |
| 404 | 482 | } |
| 405 | 483 | |
| 484 | +start_health_retries_for_service() { | |
| 485 | + local service="$1" | |
| 486 | + case "${service}" in | |
| 487 | + reranker) echo 90 ;; | |
| 488 | + *) echo 30 ;; | |
| 489 | + esac | |
| 490 | +} | |
| 491 | + | |
| 492 | +wait_for_startup_health() { | |
| 493 | + local service="$1" | |
| 494 | + local pid="$2" | |
| 495 | + local lf="$3" | |
| 496 | + local retries | |
| 497 | + retries="$(start_health_retries_for_service "${service}")" | |
| 498 | + | |
| 499 | + if wait_for_health "${service}" "${retries}"; then | |
| 500 | + if wait_for_stable_health "${service}" 5 1; then | |
| 501 | + echo "[ok] ${service} healthy (pid=${pid}, log=${lf})" | |
| 502 | + return 0 | |
| 503 | + fi | |
| 504 | + echo "[error] ${service} became unavailable right after startup, inspect ${lf}" >&2 | |
| 505 | + return 1 | |
| 506 | + fi | |
| 507 | + | |
| 508 | + echo "[error] ${service} health check timeout, inspect ${lf}" >&2 | |
| 509 | + return 1 | |
| 510 | +} | |
| 511 | + | |
| 406 | 512 | start_one() { |
| 407 | 513 | local service="$1" |
| 408 | 514 | cd "${PROJECT_ROOT}" |
| ... | ... | @@ -466,37 +572,12 @@ start_one() { |
| 466 | 572 | return 1 |
| 467 | 573 | fi |
| 468 | 574 | ;; |
| 469 | - backend|indexer|frontend|embedding|translator) | |
| 470 | - echo "[start] ${service}" | |
| 471 | - nohup "${cmd}" > >("${PROJECT_ROOT}/scripts/daily_log_router.sh" "${service}" "${LOG_DIR}" "${LOG_RETENTION_DAYS}") 2>&1 & | |
| 472 | - local pid=$! | |
| 473 | - echo "${pid}" > "${pf}" | |
| 474 | - local retries=30 | |
| 475 | - if wait_for_health "${service}" "${retries}"; then | |
| 476 | - echo "[ok] ${service} healthy (pid=${pid}, log=${lf})" | |
| 477 | - else | |
| 478 | - echo "[error] ${service} health check timeout, inspect ${lf}" >&2 | |
| 479 | - return 1 | |
| 480 | - fi | |
| 481 | - ;; | |
| 482 | - reranker) | |
| 575 | + backend|indexer|frontend|embedding|translator|reranker) | |
| 483 | 576 | echo "[start] ${service}" |
| 484 | - # Start reranker directly so pid file points to the script process that | |
| 485 | - # will exec uvicorn, avoiding extra shell wrapper lifecycle edge-cases. | |
| 486 | 577 | nohup "${cmd}" > >("${PROJECT_ROOT}/scripts/daily_log_router.sh" "${service}" "${LOG_DIR}" "${LOG_RETENTION_DAYS}") 2>&1 & |
| 487 | 578 | local pid=$! |
| 488 | 579 | echo "${pid}" > "${pf}" |
| 489 | - if wait_for_health "${service}" 90; then | |
| 490 | - if wait_for_stable_health "${service}" 5 1; then | |
| 491 | - echo "[ok] ${service} healthy (pid=${pid}, log=${lf})" | |
| 492 | - else | |
| 493 | - echo "[error] ${service} became unavailable right after startup, inspect ${lf}" >&2 | |
| 494 | - return 1 | |
| 495 | - fi | |
| 496 | - else | |
| 497 | - echo "[error] ${service} health check timeout, inspect ${lf}" >&2 | |
| 498 | - return 1 | |
| 499 | - fi | |
| 580 | + wait_for_startup_health "${service}" "${pid}" "${lf}" | |
| 500 | 581 | ;; |
| 501 | 582 | *) |
| 502 | 583 | echo "[warn] ${service} unsupported start path" |
| ... | ... | @@ -797,6 +878,7 @@ main() { |
| 797 | 878 | local targets="" |
| 798 | 879 | local monitor_was_running=0 |
| 799 | 880 | local monitor_prev_targets="" |
| 881 | + local auto_monitor_on_start="${SERVICE_CTL_AUTO_MONITOR_ON_START:-1}" | |
| 800 | 882 | |
| 801 | 883 | case "${action}" in |
| 802 | 884 | monitor-stop|monitor-status) |
| ... | ... | @@ -834,11 +916,21 @@ main() { |
| 834 | 916 | for svc in ${targets}; do |
| 835 | 917 | start_one "${svc}" |
| 836 | 918 | done |
| 919 | + if [ "${auto_monitor_on_start}" = "1" ]; then | |
| 920 | + start_monitor_daemon "$(merge_targets "$(monitor_current_targets)" "${targets}")" | |
| 921 | + fi | |
| 837 | 922 | ;; |
| 838 | 923 | stop) |
| 839 | 924 | if is_monitor_daemon_running; then |
| 840 | - echo "[info] stopping monitor daemon before manual stop" | |
| 841 | - stop_monitor_daemon | |
| 925 | + local remaining_targets | |
| 926 | + remaining_targets="$(subtract_targets "$(monitor_current_targets)" "${targets}")" | |
| 927 | + if [ -n "${remaining_targets}" ]; then | |
| 928 | + echo "[info] updating monitor daemon targets -> [${remaining_targets}]" | |
| 929 | + start_monitor_daemon "${remaining_targets}" | |
| 930 | + else | |
| 931 | + echo "[info] stopping monitor daemon before manual stop" | |
| 932 | + stop_monitor_daemon | |
| 933 | + fi | |
| 842 | 934 | fi |
| 843 | 935 | for svc in ${targets}; do |
| 844 | 936 | stop_one "${svc}" |
| ... | ... | @@ -864,6 +956,8 @@ main() { |
| 864 | 956 | monitor_prev_targets="$(apply_target_order monitor "${monitor_prev_targets}")" |
| 865 | 957 | [ -z "${monitor_prev_targets}" ] && monitor_prev_targets="${targets}" |
| 866 | 958 | start_monitor_daemon "${monitor_prev_targets}" |
| 959 | + elif [ "${auto_monitor_on_start}" = "1" ]; then | |
| 960 | + start_monitor_daemon "$(merge_targets "$(monitor_current_targets)" "${targets}")" | |
| 867 | 961 | fi |
| 868 | 962 | ;; |
| 869 | 963 | status) | ... | ... |
tests/ci/test_service_api_contracts.py
| ... | ... | @@ -578,6 +578,14 @@ class _FakeTranslator: |
| 578 | 578 | return f"{text}-{target_lang}" |
| 579 | 579 | |
| 580 | 580 | |
| 581 | +class _FailingTranslator: | |
| 582 | + model = "qwen" | |
| 583 | + use_cache = True | |
| 584 | + | |
| 585 | + def translate(self, text: str, target_lang: str, source_lang: str | None = None, prompt: str | None = None): | |
| 586 | + return None | |
| 587 | + | |
| 588 | + | |
| 581 | 589 | @pytest.fixture |
| 582 | 590 | def translator_client(monkeypatch): |
| 583 | 591 | import api.translator_app as translator_app |
| ... | ... | @@ -598,6 +606,22 @@ def test_translator_api_contract(translator_client: TestClient): |
| 598 | 606 | assert response.json()["translated_text"] == "商品名称-en" |
| 599 | 607 | |
| 600 | 608 | |
| 609 | +def test_translator_api_failure_returns_500(monkeypatch): | |
| 610 | + import api.translator_app as translator_app | |
| 611 | + | |
| 612 | + translator_app.app.router.on_startup.clear() | |
| 613 | + monkeypatch.setattr(translator_app, "get_translator", lambda model="qwen": _FailingTranslator()) | |
| 614 | + | |
| 615 | + with TestClient(translator_app.app) as client: | |
| 616 | + response = client.post( | |
| 617 | + "/translate", | |
| 618 | + json={"text": "商品名称", "target_lang": "en", "source_lang": "zh"}, | |
| 619 | + ) | |
| 620 | + | |
| 621 | + assert response.status_code == 500 | |
| 622 | + assert response.json()["detail"] == "Translation failed" | |
| 623 | + | |
| 624 | + | |
| 601 | 625 | def test_translator_health_contract(translator_client: TestClient): |
| 602 | 626 | response = translator_client.get("/health") |
| 603 | 627 | assert response.status_code == 200 | ... | ... |
| ... | ... | @@ -0,0 +1,30 @@ |
| 1 | +from query.translator import Translator | |
| 2 | + | |
| 3 | + | |
| 4 | +class _RecordingRedis: | |
| 5 | + def __init__(self): | |
| 6 | + self.setex_calls = [] | |
| 7 | + | |
| 8 | + def setex(self, key, ttl, value): | |
| 9 | + self.setex_calls.append((key, ttl, value)) | |
| 10 | + | |
| 11 | + | |
| 12 | +def test_translate_failure_returns_none_and_skips_cache(monkeypatch): | |
| 13 | + translator = Translator(model="qwen", api_key="dummy-key", use_cache=False) | |
| 14 | + fake_redis = _RecordingRedis() | |
| 15 | + translator.use_cache = True | |
| 16 | + translator.redis_client = fake_redis | |
| 17 | + translator.cache_prefix = "trans" | |
| 18 | + translator.expire_seconds = 60 | |
| 19 | + | |
| 20 | + monkeypatch.setattr(translator, "_translate_qwen", lambda *args, **kwargs: None) | |
| 21 | + | |
| 22 | + result = translator.translate( | |
| 23 | + text="商品标题", | |
| 24 | + target_lang="en", | |
| 25 | + source_lang="zh", | |
| 26 | + prompt="translate for product search", | |
| 27 | + ) | |
| 28 | + | |
| 29 | + assert result is None | |
| 30 | + assert fake_redis.setex_calls == [] | ... | ... |