From a7bb846ca0d6aa2bc446efa63b8cf782cbb63b39 Mon Sep 17 00:00:00 2001 From: tangwang Date: Fri, 13 Mar 2026 12:08:20 +0800 Subject: [PATCH] monitor --- .env | 2 +- @ | 17 +++++++++++++++++ activate.sh | 0 query/translator.py | 11 ++++++----- scripts/create_venv.sh | 0 scripts/service_ctl.sh | 176 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------------------- status.sh | 0 tests/ci/test_service_api_contracts.py | 24 ++++++++++++++++++++++++ tests/test_translator_failure_semantics.py | 30 ++++++++++++++++++++++++++++++ 9 files changed, 213 insertions(+), 47 deletions(-) create mode 100644 @ mode change 100644 => 100755 activate.sh mode change 100644 => 100755 scripts/create_venv.sh mode change 100644 => 100755 status.sh create mode 100644 tests/test_translator_failure_semantics.py diff --git a/.env b/.env index 03604e1..4f5e97e 100644 --- a/.env +++ b/.env @@ -36,7 +36,7 @@ API_BASE_URL=http://43.166.252.75:6002 # 通用 DashScope key(翻译/内容理解等模块) -DASHSCOPE_API_KEY=sk-c3b8d4db061840aa8effb748df2a997b +DASHSCOPE_API_KEY=sk-482cc3ff37a8467dab134a7a46830556 # Reranker 专用 key(按地域) RERANK_DASHSCOPE_API_KEY_CN=sk-c3b8d4db061840aa8effb748df2a997b RERANK_DASHSCOPE_API_KEY_US=sk-482cc3ff37a8467dab134a7a46830556 diff --git a/@ b/@ new file mode 100644 index 0000000..5de1e4a --- /dev/null +++ b/@ @@ -0,0 +1,17 @@ + +# Please enter the commit message for your changes. Lines starting +# with '#' will be ignored, and an empty message aborts the commit. +# +# On branch master +# Your branch is ahead of 'origin/master' by 3 commits. +# (use "git push" to publish your local commits) +# +# Changes to be committed: +# modified: README.md +# modified: docs/Usage-Guide.md +# modified: scripts/service_ctl.sh +# new file: status.sh +# +# Changes not staged for commit: +# modified: third-party/clip-as-service (untracked content) +# diff --git a/activate.sh b/activate.sh old mode 100644 new mode 100755 index 3afad50..3afad50 --- a/activate.sh +++ b/activate.sh diff --git a/query/translator.py b/query/translator.py index 2f94019..1ad90e5 100644 --- a/query/translator.py +++ b/query/translator.py @@ -243,21 +243,22 @@ class Translator: else: # deepl result = self._translate_deepl(text, target_lang, source_lang, translation_context, prompt) - # If still failed, return original text with warning + # Surface translation failure to the caller instead of silently + # masquerading the source text as a successful translation. if result is None: logger.warning( f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " - f"Source language: {source_lang or 'auto'} | Result: '{text}' | Status: Translation failed, returning original" + f"Source language: {source_lang or 'auto'} | Status: Translation failed" ) - result = text else: logger.info( f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " f"Source language: {source_lang or 'auto'} | Result: '{result}' | Status: Translation successful" ) - # Cache result - if result and self.use_cache and self.redis_client: + # Cache only successful translations. Failed attempts must not poison + # Redis with the original text. + if result is not None and self.use_cache and self.redis_client: self._set_cached_translation_redis(text, target_lang, result, source_lang, translation_context, prompt) return result diff --git a/scripts/create_venv.sh b/scripts/create_venv.sh old mode 100644 new mode 100755 index 0e878da..0e878da --- a/scripts/create_venv.sh +++ b/scripts/create_venv.sh diff --git a/scripts/service_ctl.sh b/scripts/service_ctl.sh index 55ce57d..054047c 100755 --- a/scripts/service_ctl.sh +++ b/scripts/service_ctl.sh @@ -112,12 +112,68 @@ monitor_targets_file() { echo "${LOG_DIR}/service-monitor.targets" } +sync_monitor_daemon_state() { + local pf + pf="$(monitor_pid_file)" + local tf + tf="$(monitor_targets_file)" + + if [ ! -f "${pf}" ]; then + return 1 + fi + + local pid + pid="$(cat "${pf}" 2>/dev/null || true)" + if [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then + return 0 + fi + + rm -f "${pf}" "${tf}" + return 1 +} + monitor_current_targets() { if [ -f "$(monitor_targets_file)" ]; then cat "$(monitor_targets_file)" 2>/dev/null || true fi } +merge_targets() { + local base="${1:-}" + local extra="${2:-}" + local merged="" + merged="$(normalize_targets "${base} ${extra}")" + if [ -n "${merged}" ]; then + merged="$(apply_target_order monitor "${merged}")" + fi + echo "${merged}" +} + +subtract_targets() { + local base="${1:-}" + local remove="${2:-}" + local out="" + local svc + declare -A removed=() + + for svc in ${remove}; do + removed["${svc}"]=1 + done + + for svc in ${base}; do + if [ "${removed[${svc}]:-0}" != "1" ]; then + out="${out} ${svc}" + fi + done + + out="${out# }" + if [ -n "${out}" ]; then + out="$(normalize_targets "${out}")" + out="$(apply_target_order monitor "${out}")" + fi + echo "${out}" +} + monitor_log_event() { local service="$1" local level="$2" @@ -125,7 +181,11 @@ monitor_log_event() { local ts line ts="$(date '+%F %T')" line="[${ts}] [${level}] [${service}] ${message}" - echo "${line}" | tee -a "$(monitor_log_file)" + if [ -t 1 ]; then + echo "${line}" | tee -a "$(monitor_log_file)" + else + echo "${line}" >> "$(monitor_log_file)" + fi } require_positive_int() { @@ -156,9 +216,13 @@ service_healthy_now() { port="$(get_port "${service}")" path="$(health_path_for_service "${service}")" - if [ -z "${port}" ] || [ -z "${path}" ]; then + if [ -z "${port}" ]; then return 1 fi + if [ -z "${path}" ]; then + is_running_by_pid "${service}" || is_running_by_port "${service}" + return + fi if ! is_running_by_port "${service}"; then return 1 fi @@ -228,6 +292,17 @@ monitor_services() { touch "$(monitor_log_file)" + if [ "${MONITOR_DAEMON:-0}" = "1" ]; then + echo "$$" > "$(monitor_pid_file)" + echo "${targets}" > "$(monitor_targets_file)" + trap ' + current_pid="$(cat "$(monitor_pid_file)" 2>/dev/null || true)" + if [ "${current_pid}" = "$$" ]; then + rm -f "$(monitor_pid_file)" "$(monitor_targets_file)" + fi + ' EXIT + fi + declare -A fail_streak=() declare -A last_restart_epoch=() declare -A restart_history=() @@ -295,14 +370,7 @@ monitor_services() { } is_monitor_daemon_running() { - local pf - pf="$(monitor_pid_file)" - if [ ! -f "${pf}" ]; then - return 1 - fi - local pid - pid="$(cat "${pf}" 2>/dev/null || true)" - [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null + sync_monitor_daemon_state } stop_monitor_daemon() { @@ -348,9 +416,15 @@ start_monitor_daemon() { fi echo "${targets}" > "${tf}" - nohup "${PROJECT_ROOT}/scripts/service_ctl.sh" monitor ${targets} >/dev/null 2>&1 & + MONITOR_DAEMON=1 nohup "${PROJECT_ROOT}/scripts/service_ctl.sh" monitor ${targets} >> "$(monitor_log_file)" 2>&1 & local pid=$! echo "${pid}" > "${pf}" + sleep 1 + if ! kill -0 "${pid}" 2>/dev/null; then + rm -f "${pf}" "${tf}" + echo "[error] monitor daemon failed to stay alive, inspect $(monitor_log_file)" >&2 + return 1 + fi echo "[ok] monitor daemon started (pid=${pid}, targets=[${targets}], log=$(monitor_log_file))" } @@ -378,7 +452,11 @@ is_running_by_pid() { fi local pid pid="$(cat "${pf}" 2>/dev/null || true)" - [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null + if [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then + return 0 + fi + rm -f "${pf}" + return 1 } is_running_by_port() { @@ -403,6 +481,34 @@ get_cnclip_flow_device() { sed -n "s/^[[:space:]]*device:[[:space:]]*'\\([^']*\\)'.*/\\1/p" "${flow_file}" | head -n 1 } +start_health_retries_for_service() { + local service="$1" + case "${service}" in + reranker) echo 90 ;; + *) echo 30 ;; + esac +} + +wait_for_startup_health() { + local service="$1" + local pid="$2" + local lf="$3" + local retries + retries="$(start_health_retries_for_service "${service}")" + + if wait_for_health "${service}" "${retries}"; then + if wait_for_stable_health "${service}" 5 1; then + echo "[ok] ${service} healthy (pid=${pid}, log=${lf})" + return 0 + fi + echo "[error] ${service} became unavailable right after startup, inspect ${lf}" >&2 + return 1 + fi + + echo "[error] ${service} health check timeout, inspect ${lf}" >&2 + return 1 +} + start_one() { local service="$1" cd "${PROJECT_ROOT}" @@ -466,37 +572,12 @@ start_one() { return 1 fi ;; - backend|indexer|frontend|embedding|translator) - echo "[start] ${service}" - nohup "${cmd}" > >("${PROJECT_ROOT}/scripts/daily_log_router.sh" "${service}" "${LOG_DIR}" "${LOG_RETENTION_DAYS}") 2>&1 & - local pid=$! - echo "${pid}" > "${pf}" - local retries=30 - if wait_for_health "${service}" "${retries}"; then - echo "[ok] ${service} healthy (pid=${pid}, log=${lf})" - else - echo "[error] ${service} health check timeout, inspect ${lf}" >&2 - return 1 - fi - ;; - reranker) + backend|indexer|frontend|embedding|translator|reranker) echo "[start] ${service}" - # Start reranker directly so pid file points to the script process that - # will exec uvicorn, avoiding extra shell wrapper lifecycle edge-cases. nohup "${cmd}" > >("${PROJECT_ROOT}/scripts/daily_log_router.sh" "${service}" "${LOG_DIR}" "${LOG_RETENTION_DAYS}") 2>&1 & local pid=$! echo "${pid}" > "${pf}" - if wait_for_health "${service}" 90; then - if wait_for_stable_health "${service}" 5 1; then - echo "[ok] ${service} healthy (pid=${pid}, log=${lf})" - else - echo "[error] ${service} became unavailable right after startup, inspect ${lf}" >&2 - return 1 - fi - else - echo "[error] ${service} health check timeout, inspect ${lf}" >&2 - return 1 - fi + wait_for_startup_health "${service}" "${pid}" "${lf}" ;; *) echo "[warn] ${service} unsupported start path" @@ -797,6 +878,7 @@ main() { local targets="" local monitor_was_running=0 local monitor_prev_targets="" + local auto_monitor_on_start="${SERVICE_CTL_AUTO_MONITOR_ON_START:-1}" case "${action}" in monitor-stop|monitor-status) @@ -834,11 +916,21 @@ main() { for svc in ${targets}; do start_one "${svc}" done + if [ "${auto_monitor_on_start}" = "1" ]; then + start_monitor_daemon "$(merge_targets "$(monitor_current_targets)" "${targets}")" + fi ;; stop) if is_monitor_daemon_running; then - echo "[info] stopping monitor daemon before manual stop" - stop_monitor_daemon + local remaining_targets + remaining_targets="$(subtract_targets "$(monitor_current_targets)" "${targets}")" + if [ -n "${remaining_targets}" ]; then + echo "[info] updating monitor daemon targets -> [${remaining_targets}]" + start_monitor_daemon "${remaining_targets}" + else + echo "[info] stopping monitor daemon before manual stop" + stop_monitor_daemon + fi fi for svc in ${targets}; do stop_one "${svc}" @@ -864,6 +956,8 @@ main() { monitor_prev_targets="$(apply_target_order monitor "${monitor_prev_targets}")" [ -z "${monitor_prev_targets}" ] && monitor_prev_targets="${targets}" start_monitor_daemon "${monitor_prev_targets}" + elif [ "${auto_monitor_on_start}" = "1" ]; then + start_monitor_daemon "$(merge_targets "$(monitor_current_targets)" "${targets}")" fi ;; status) diff --git a/status.sh b/status.sh old mode 100644 new mode 100755 index 38d1920..38d1920 --- a/status.sh +++ b/status.sh diff --git a/tests/ci/test_service_api_contracts.py b/tests/ci/test_service_api_contracts.py index e6f05af..a7488b1 100644 --- a/tests/ci/test_service_api_contracts.py +++ b/tests/ci/test_service_api_contracts.py @@ -578,6 +578,14 @@ class _FakeTranslator: return f"{text}-{target_lang}" +class _FailingTranslator: + model = "qwen" + use_cache = True + + def translate(self, text: str, target_lang: str, source_lang: str | None = None, prompt: str | None = None): + return None + + @pytest.fixture def translator_client(monkeypatch): import api.translator_app as translator_app @@ -598,6 +606,22 @@ def test_translator_api_contract(translator_client: TestClient): assert response.json()["translated_text"] == "商品名称-en" +def test_translator_api_failure_returns_500(monkeypatch): + import api.translator_app as translator_app + + translator_app.app.router.on_startup.clear() + monkeypatch.setattr(translator_app, "get_translator", lambda model="qwen": _FailingTranslator()) + + with TestClient(translator_app.app) as client: + response = client.post( + "/translate", + json={"text": "商品名称", "target_lang": "en", "source_lang": "zh"}, + ) + + assert response.status_code == 500 + assert response.json()["detail"] == "Translation failed" + + def test_translator_health_contract(translator_client: TestClient): response = translator_client.get("/health") assert response.status_code == 200 diff --git a/tests/test_translator_failure_semantics.py b/tests/test_translator_failure_semantics.py new file mode 100644 index 0000000..286468c --- /dev/null +++ b/tests/test_translator_failure_semantics.py @@ -0,0 +1,30 @@ +from query.translator import Translator + + +class _RecordingRedis: + def __init__(self): + self.setex_calls = [] + + def setex(self, key, ttl, value): + self.setex_calls.append((key, ttl, value)) + + +def test_translate_failure_returns_none_and_skips_cache(monkeypatch): + translator = Translator(model="qwen", api_key="dummy-key", use_cache=False) + fake_redis = _RecordingRedis() + translator.use_cache = True + translator.redis_client = fake_redis + translator.cache_prefix = "trans" + translator.expire_seconds = 60 + + monkeypatch.setattr(translator, "_translate_qwen", lambda *args, **kwargs: None) + + result = translator.translate( + text="商品标题", + target_lang="en", + source_lang="zh", + prompt="translate for product search", + ) + + assert result is None + assert fake_redis.setex_calls == [] -- libgit2 0.21.2