Commit a7bb846ca0d6aa2bc446efa63b8cf782cbb63b39

Authored by tangwang
1 parent c6da6bca

monitor

... ... @@ -36,7 +36,7 @@ API_BASE_URL=http://43.166.252.75:6002
36 36  
37 37  
38 38 # 通用 DashScope key(翻译/内容理解等模块)
39   -DASHSCOPE_API_KEY=sk-c3b8d4db061840aa8effb748df2a997b
  39 +DASHSCOPE_API_KEY=sk-482cc3ff37a8467dab134a7a46830556
40 40 # Reranker 专用 key(按地域)
41 41 RERANK_DASHSCOPE_API_KEY_CN=sk-c3b8d4db061840aa8effb748df2a997b
42 42 RERANK_DASHSCOPE_API_KEY_US=sk-482cc3ff37a8467dab134a7a46830556
... ...
@ 0 → 100644
... ... @@ -0,0 +1,17 @@
  1 +
  2 +# Please enter the commit message for your changes. Lines starting
  3 +# with '#' will be ignored, and an empty message aborts the commit.
  4 +#
  5 +# On branch master
  6 +# Your branch is ahead of 'origin/master' by 3 commits.
  7 +# (use "git push" to publish your local commits)
  8 +#
  9 +# Changes to be committed:
  10 +# modified: README.md
  11 +# modified: docs/Usage-Guide.md
  12 +# modified: scripts/service_ctl.sh
  13 +# new file: status.sh
  14 +#
  15 +# Changes not staged for commit:
  16 +# modified: third-party/clip-as-service (untracked content)
  17 +#
... ...
activate.sh 100644 → 100755
query/translator.py
... ... @@ -243,21 +243,22 @@ class Translator:
243 243 else: # deepl
244 244 result = self._translate_deepl(text, target_lang, source_lang, translation_context, prompt)
245 245  
246   - # If still failed, return original text with warning
  246 + # Surface translation failure to the caller instead of silently
  247 + # masquerading the source text as a successful translation.
247 248 if result is None:
248 249 logger.warning(
249 250 f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | "
250   - f"Source language: {source_lang or 'auto'} | Result: '{text}' | Status: Translation failed, returning original"
  251 + f"Source language: {source_lang or 'auto'} | Status: Translation failed"
251 252 )
252   - result = text
253 253 else:
254 254 logger.info(
255 255 f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | "
256 256 f"Source language: {source_lang or 'auto'} | Result: '{result}' | Status: Translation successful"
257 257 )
258 258  
259   - # Cache result
260   - if result and self.use_cache and self.redis_client:
  259 + # Cache only successful translations. Failed attempts must not poison
  260 + # Redis with the original text.
  261 + if result is not None and self.use_cache and self.redis_client:
261 262 self._set_cached_translation_redis(text, target_lang, result, source_lang, translation_context, prompt)
262 263  
263 264 return result
... ...
scripts/create_venv.sh 100644 → 100755
scripts/service_ctl.sh
... ... @@ -112,12 +112,68 @@ monitor_targets_file() {
112 112 echo "${LOG_DIR}/service-monitor.targets"
113 113 }
114 114  
# Reconcile the monitor daemon's on-disk state with the live process table.
#
# Returns:
#   0 - the pid file exists and the pid it holds answers `kill -0`.
#   1 - the pid file is missing, or it is empty / names a process that does
#       not answer `kill -0`. In the latter case both the pid file and the
#       targets file are removed so later callers do not read stale state.
sync_monitor_daemon_state() {
  local pf
  pf="$(monitor_pid_file)"
  local tf
  tf="$(monitor_targets_file)"

  # No pid file: nothing to reconcile, and nothing is cleaned up.
  if [ ! -f "${pf}" ]; then
    return 1
  fi

  local pid
  # `|| true` keeps an unreadable pid file from aborting the caller; it is
  # then handled below like an empty pid.
  pid="$(cat "${pf}" 2>/dev/null || true)"
  if [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then
    return 0
  fi

  # Stale state: drop both files together so pid and targets never disagree.
  rm -f "${pf}" "${tf}"
  return 1
}
  134 +
# Print the contents of the monitor targets file, if that file exists.
#
# Prints nothing when the file is absent. Read errors are silenced and do not
# produce a non-zero status.
monitor_current_targets() {
  local tf
  # Resolve the path once so the existence test and the read use the same file.
  tf="$(monitor_targets_file)"
  if [ -f "${tf}" ]; then
    cat "${tf}" 2>/dev/null || true
  fi
}
120 140  
# Combine two whitespace-separated target lists into a single list.
#
# Arguments:
#   $1 - base target list (optional, defaults to empty)
#   $2 - extra target list (optional, defaults to empty)
# Output:
#   The combined list after `normalize_targets`, then (when non-empty) after
#   `apply_target_order monitor`. Prints an empty line when nothing remains.
# NOTE(review): de-duplication is presumably done by normalize_targets, which
# is defined elsewhere in the script - confirm.
merge_targets() {
  local base="${1:-}"
  local extra="${2:-}"
  local merged=""
  merged="$(normalize_targets "${base} ${extra}")"
  # Ordering is skipped for an empty list so apply_target_order never sees "".
  if [ -n "${merged}" ]; then
    merged="$(apply_target_order monitor "${merged}")"
  fi
  echo "${merged}"
}
  151 +
# Remove every target named in one list from another list.
#
# Arguments:
#   $1 - base target list, whitespace-separated (optional, defaults to empty)
#   $2 - targets to remove, whitespace-separated (optional, defaults to empty)
# Output:
#   The surviving targets, passed through `normalize_targets` and
#   `apply_target_order monitor` when at least one survives; an empty line
#   otherwise.
subtract_targets() {
  local base="${1:-}"
  local remove="${2:-}"
  local out=""
  local svc
  # Associative array used as a set of names to drop.
  declare -A removed=()

  # Word splitting on ${remove} / ${base} is intentional: both are
  # whitespace-separated lists of service names.
  for svc in ${remove}; do
    removed["${svc}"]=1
  done

  for svc in ${base}; do
    if [ "${removed[${svc}]:-0}" != "1" ]; then
      out="${out} ${svc}"
    fi
  done

  # Strip the single leading space introduced by the first append.
  out="${out# }"
  if [ -n "${out}" ]; then
    out="$(normalize_targets "${out}")"
    out="$(apply_target_order monitor "${out}")"
  fi
  echo "${out}"
}
  176 +
121 177 monitor_log_event() {
122 178 local service="$1"
123 179 local level="$2"
... ... @@ -125,7 +181,11 @@ monitor_log_event() {
125 181 local ts line
126 182 ts="$(date '+%F %T')"
127 183 line="[${ts}] [${level}] [${service}] ${message}"
128   - echo "${line}" | tee -a "$(monitor_log_file)"
  184 + if [ -t 1 ]; then
  185 + echo "${line}" | tee -a "$(monitor_log_file)"
  186 + else
  187 + echo "${line}" >> "$(monitor_log_file)"
  188 + fi
129 189 }
130 190  
131 191 require_positive_int() {
... ... @@ -156,9 +216,13 @@ service_healthy_now() {
156 216  
157 217 port="$(get_port "${service}")"
158 218 path="$(health_path_for_service "${service}")"
159   - if [ -z "${port}" ] || [ -z "${path}" ]; then
  219 + if [ -z "${port}" ]; then
160 220 return 1
161 221 fi
  222 + if [ -z "${path}" ]; then
  223 + is_running_by_pid "${service}" || is_running_by_port "${service}"
  224 + return
  225 + fi
162 226 if ! is_running_by_port "${service}"; then
163 227 return 1
164 228 fi
... ... @@ -228,6 +292,17 @@ monitor_services() {
228 292  
229 293 touch "$(monitor_log_file)"
230 294  
  295 + if [ "${MONITOR_DAEMON:-0}" = "1" ]; then
  296 + echo "$$" > "$(monitor_pid_file)"
  297 + echo "${targets}" > "$(monitor_targets_file)"
  298 + trap '
  299 + current_pid="$(cat "$(monitor_pid_file)" 2>/dev/null || true)"
  300 + if [ "${current_pid}" = "$$" ]; then
  301 + rm -f "$(monitor_pid_file)" "$(monitor_targets_file)"
  302 + fi
  303 + ' EXIT
  304 + fi
  305 +
231 306 declare -A fail_streak=()
232 307 declare -A last_restart_epoch=()
233 308 declare -A restart_history=()
... ... @@ -295,14 +370,7 @@ monitor_services() {
295 370 }
296 371  
297 372 is_monitor_daemon_running() {
298   - local pf
299   - pf="$(monitor_pid_file)"
300   - if [ ! -f "${pf}" ]; then
301   - return 1
302   - fi
303   - local pid
304   - pid="$(cat "${pf}" 2>/dev/null || true)"
305   - [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null
  373 + sync_monitor_daemon_state
306 374 }
307 375  
308 376 stop_monitor_daemon() {
... ... @@ -348,9 +416,15 @@ start_monitor_daemon() {
348 416 fi
349 417  
350 418 echo "${targets}" > "${tf}"
351   - nohup "${PROJECT_ROOT}/scripts/service_ctl.sh" monitor ${targets} >/dev/null 2>&1 &
  419 + MONITOR_DAEMON=1 nohup "${PROJECT_ROOT}/scripts/service_ctl.sh" monitor ${targets} >> "$(monitor_log_file)" 2>&1 &
352 420 local pid=$!
353 421 echo "${pid}" > "${pf}"
  422 + sleep 1
  423 + if ! kill -0 "${pid}" 2>/dev/null; then
  424 + rm -f "${pf}" "${tf}"
  425 + echo "[error] monitor daemon failed to stay alive, inspect $(monitor_log_file)" >&2
  426 + return 1
  427 + fi
354 428 echo "[ok] monitor daemon started (pid=${pid}, targets=[${targets}], log=$(monitor_log_file))"
355 429 }
356 430  
... ... @@ -378,7 +452,11 @@ is_running_by_pid() {
378 452 fi
379 453 local pid
380 454 pid="$(cat "${pf}" 2>/dev/null || true)"
381   - [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null
  455 + if [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then
  456 + return 0
  457 + fi
  458 + rm -f "${pf}"
  459 + return 1
382 460 }
383 461  
384 462 is_running_by_port() {
... ... @@ -403,6 +481,34 @@ get_cnclip_flow_device() {
403 481 sed -n "s/^[[:space:]]*device:[[:space:]]*'\\([^']*\\)'.*/\\1/p" "${flow_file}" | head -n 1
404 482 }
405 483  
# Print the number of health-check retries to allow while a service starts.
#
# Arguments:
#   $1 - service name
# Output:
#   90 for "reranker", 30 for every other service name.
start_health_retries_for_service() {
  local service="$1"
  case "${service}" in
    reranker) echo 90 ;;
    *) echo 30 ;;
  esac
}
  491 +
# Wait for a freshly started service to become healthy and stay healthy.
#
# Arguments:
#   $1 - service name
#   $2 - pid of the started process (used only in the success message)
#   $3 - log file path (used only in messages)
# Returns:
#   0 after `wait_for_health` and then `wait_for_stable_health <svc> 5 1`
#   both succeed; 1 otherwise, with an explanatory line on stderr.
wait_for_startup_health() {
  local service="$1"
  local pid="$2"
  local lf="$3"
  local retries
  retries="$(start_health_retries_for_service "${service}")"

  # Phase 1: the service never reported healthy within the retry budget.
  if ! wait_for_health "${service}" "${retries}"; then
    echo "[error] ${service} health check timeout, inspect ${lf}" >&2
    return 1
  fi

  # Phase 2: it reported healthy once but did not hold that state.
  if ! wait_for_stable_health "${service}" 5 1; then
    echo "[error] ${service} became unavailable right after startup, inspect ${lf}" >&2
    return 1
  fi

  echo "[ok] ${service} healthy (pid=${pid}, log=${lf})"
  return 0
}
  511 +
406 512 start_one() {
407 513 local service="$1"
408 514 cd "${PROJECT_ROOT}"
... ... @@ -466,37 +572,12 @@ start_one() {
466 572 return 1
467 573 fi
468 574 ;;
469   - backend|indexer|frontend|embedding|translator)
470   - echo "[start] ${service}"
471   - nohup "${cmd}" > >("${PROJECT_ROOT}/scripts/daily_log_router.sh" "${service}" "${LOG_DIR}" "${LOG_RETENTION_DAYS}") 2>&1 &
472   - local pid=$!
473   - echo "${pid}" > "${pf}"
474   - local retries=30
475   - if wait_for_health "${service}" "${retries}"; then
476   - echo "[ok] ${service} healthy (pid=${pid}, log=${lf})"
477   - else
478   - echo "[error] ${service} health check timeout, inspect ${lf}" >&2
479   - return 1
480   - fi
481   - ;;
482   - reranker)
  575 + backend|indexer|frontend|embedding|translator|reranker)
483 576 echo "[start] ${service}"
484   - # Start reranker directly so pid file points to the script process that
485   - # will exec uvicorn, avoiding extra shell wrapper lifecycle edge-cases.
486 577 nohup "${cmd}" > >("${PROJECT_ROOT}/scripts/daily_log_router.sh" "${service}" "${LOG_DIR}" "${LOG_RETENTION_DAYS}") 2>&1 &
487 578 local pid=$!
488 579 echo "${pid}" > "${pf}"
489   - if wait_for_health "${service}" 90; then
490   - if wait_for_stable_health "${service}" 5 1; then
491   - echo "[ok] ${service} healthy (pid=${pid}, log=${lf})"
492   - else
493   - echo "[error] ${service} became unavailable right after startup, inspect ${lf}" >&2
494   - return 1
495   - fi
496   - else
497   - echo "[error] ${service} health check timeout, inspect ${lf}" >&2
498   - return 1
499   - fi
  580 + wait_for_startup_health "${service}" "${pid}" "${lf}"
500 581 ;;
501 582 *)
502 583 echo "[warn] ${service} unsupported start path"
... ... @@ -797,6 +878,7 @@ main() {
797 878 local targets=""
798 879 local monitor_was_running=0
799 880 local monitor_prev_targets=""
  881 + local auto_monitor_on_start="${SERVICE_CTL_AUTO_MONITOR_ON_START:-1}"
800 882  
801 883 case "${action}" in
802 884 monitor-stop|monitor-status)
... ... @@ -834,11 +916,21 @@ main() {
834 916 for svc in ${targets}; do
835 917 start_one "${svc}"
836 918 done
  919 + if [ "${auto_monitor_on_start}" = "1" ]; then
  920 + start_monitor_daemon "$(merge_targets "$(monitor_current_targets)" "${targets}")"
  921 + fi
837 922 ;;
838 923 stop)
839 924 if is_monitor_daemon_running; then
840   - echo "[info] stopping monitor daemon before manual stop"
841   - stop_monitor_daemon
  925 + local remaining_targets
  926 + remaining_targets="$(subtract_targets "$(monitor_current_targets)" "${targets}")"
  927 + if [ -n "${remaining_targets}" ]; then
  928 + echo "[info] updating monitor daemon targets -> [${remaining_targets}]"
  929 + start_monitor_daemon "${remaining_targets}"
  930 + else
  931 + echo "[info] stopping monitor daemon before manual stop"
  932 + stop_monitor_daemon
  933 + fi
842 934 fi
843 935 for svc in ${targets}; do
844 936 stop_one "${svc}"
... ... @@ -864,6 +956,8 @@ main() {
864 956 monitor_prev_targets="$(apply_target_order monitor "${monitor_prev_targets}")"
865 957 [ -z "${monitor_prev_targets}" ] && monitor_prev_targets="${targets}"
866 958 start_monitor_daemon "${monitor_prev_targets}"
  959 + elif [ "${auto_monitor_on_start}" = "1" ]; then
  960 + start_monitor_daemon "$(merge_targets "$(monitor_current_targets)" "${targets}")"
867 961 fi
868 962 ;;
869 963 status)
... ...
status.sh 100644 → 100755
tests/ci/test_service_api_contracts.py
... ... @@ -578,6 +578,14 @@ class _FakeTranslator:
578 578 return f"{text}-{target_lang}"
579 579  
580 580  
  581 +class _FailingTranslator:
  582 + model = "qwen"
  583 + use_cache = True
  584 +
  585 + def translate(self, text: str, target_lang: str, source_lang: str | None = None, prompt: str | None = None):
  586 + return None
  587 +
  588 +
581 589 @pytest.fixture
582 590 def translator_client(monkeypatch):
583 591 import api.translator_app as translator_app
... ... @@ -598,6 +606,22 @@ def test_translator_api_contract(translator_client: TestClient):
598 606 assert response.json()["translated_text"] == "商品名称-en"
599 607  
600 608  
def test_translator_api_failure_returns_500(monkeypatch):
    """POST /translate answers 500 / "Translation failed" when the translator returns None."""
    import api.translator_app as translator_app

    # Swap in an empty startup-handler list for this test only. Unlike
    # ``on_startup.clear()``, monkeypatch restores the original list at
    # teardown, so the shared app object is not left permanently modified.
    monkeypatch.setattr(translator_app.app.router, "on_startup", [])
    monkeypatch.setattr(translator_app, "get_translator", lambda model="qwen": _FailingTranslator())

    with TestClient(translator_app.app) as client:
        response = client.post(
            "/translate",
            json={"text": "商品名称", "target_lang": "en", "source_lang": "zh"},
        )

    assert response.status_code == 500
    assert response.json()["detail"] == "Translation failed"
  623 +
  624 +
601 625 def test_translator_health_contract(translator_client: TestClient):
602 626 response = translator_client.get("/health")
603 627 assert response.status_code == 200
... ...
tests/test_translator_failure_semantics.py 0 → 100644
... ... @@ -0,0 +1,30 @@
  1 +from query.translator import Translator
  2 +
  3 +
  4 +class _RecordingRedis:
  5 + def __init__(self):
  6 + self.setex_calls = []
  7 +
  8 + def setex(self, key, ttl, value):
  9 + self.setex_calls.append((key, ttl, value))
  10 +
  11 +
def test_translate_failure_returns_none_and_skips_cache(monkeypatch):
    """A failed backend call must yield ``None`` and must not write to the cache."""
    # NOTE(review): built with use_cache=False and then switched on by hand,
    # presumably so the constructor does not try to reach a real Redis - confirm.
    translator = Translator(model="qwen", api_key="dummy-key", use_cache=False)
    fake_redis = _RecordingRedis()
    translator.use_cache = True
    translator.redis_client = fake_redis
    translator.cache_prefix = "trans"
    translator.expire_seconds = 60

    # Force the qwen backend to report failure.
    monkeypatch.setattr(translator, "_translate_qwen", lambda *args, **kwargs: None)

    result = translator.translate(
        text="商品标题",
        target_lang="en",
        source_lang="zh",
        prompt="translate for product search",
    )

    assert result is None
    # No setex call means the failure was not cached.
    assert fake_redis.setex_calls == []
... ...