Commit a7bb846ca0d6aa2bc446efa63b8cf782cbb63b39

Authored by tangwang
1 parent c6da6bca

monitor

@@ -36,7 +36,7 @@ API_BASE_URL=http://43.166.252.75:6002 @@ -36,7 +36,7 @@ API_BASE_URL=http://43.166.252.75:6002
36 36
37 37
38 # 通用 DashScope key(翻译/内容理解等模块) 38 # 通用 DashScope key(翻译/内容理解等模块)
39 -DASHSCOPE_API_KEY=sk-c3b8d4db061840aa8effb748df2a997b 39 +DASHSCOPE_API_KEY=sk-482cc3ff37a8467dab134a7a46830556
40 # Reranker 专用 key(按地域) 40 # Reranker 专用 key(按地域)
41 RERANK_DASHSCOPE_API_KEY_CN=sk-c3b8d4db061840aa8effb748df2a997b 41 RERANK_DASHSCOPE_API_KEY_CN=sk-c3b8d4db061840aa8effb748df2a997b
42 RERANK_DASHSCOPE_API_KEY_US=sk-482cc3ff37a8467dab134a7a46830556 42 RERANK_DASHSCOPE_API_KEY_US=sk-482cc3ff37a8467dab134a7a46830556
@@ -0,0 +1,17 @@ @@ -0,0 +1,17 @@
  1 +
  2 +# Please enter the commit message for your changes. Lines starting
  3 +# with '#' will be ignored, and an empty message aborts the commit.
  4 +#
  5 +# On branch master
  6 +# Your branch is ahead of 'origin/master' by 3 commits.
  7 +# (use "git push" to publish your local commits)
  8 +#
  9 +# Changes to be committed:
  10 +# modified: README.md
  11 +# modified: docs/Usage-Guide.md
  12 +# modified: scripts/service_ctl.sh
  13 +# new file: status.sh
  14 +#
  15 +# Changes not staged for commit:
  16 +# modified: third-party/clip-as-service (untracked content)
  17 +#
activate.sh 100644 → 100755
query/translator.py
@@ -243,21 +243,22 @@ class Translator: @@ -243,21 +243,22 @@ class Translator:
243 else: # deepl 243 else: # deepl
244 result = self._translate_deepl(text, target_lang, source_lang, translation_context, prompt) 244 result = self._translate_deepl(text, target_lang, source_lang, translation_context, prompt)
245 245
246 - # If still failed, return original text with warning 246 + # Surface translation failure to the caller instead of silently
  247 + # masquerading the source text as a successful translation.
247 if result is None: 248 if result is None:
248 logger.warning( 249 logger.warning(
249 f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " 250 f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | "
250 - f"Source language: {source_lang or 'auto'} | Result: '{text}' | Status: Translation failed, returning original" 251 + f"Source language: {source_lang or 'auto'} | Status: Translation failed"
251 ) 252 )
252 - result = text  
253 else: 253 else:
254 logger.info( 254 logger.info(
255 f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " 255 f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | "
256 f"Source language: {source_lang or 'auto'} | Result: '{result}' | Status: Translation successful" 256 f"Source language: {source_lang or 'auto'} | Result: '{result}' | Status: Translation successful"
257 ) 257 )
258 258
259 - # Cache result  
260 - if result and self.use_cache and self.redis_client: 259 + # Cache only successful translations. Failed attempts must not poison
  260 + # Redis with the original text.
  261 + if result is not None and self.use_cache and self.redis_client:
261 self._set_cached_translation_redis(text, target_lang, result, source_lang, translation_context, prompt) 262 self._set_cached_translation_redis(text, target_lang, result, source_lang, translation_context, prompt)
262 263
263 return result 264 return result
scripts/create_venv.sh 100644 → 100755
scripts/service_ctl.sh
@@ -112,12 +112,68 @@ monitor_targets_file() { @@ -112,12 +112,68 @@ monitor_targets_file() {
112 echo "${LOG_DIR}/service-monitor.targets" 112 echo "${LOG_DIR}/service-monitor.targets"
113 } 113 }
114 114
# Report whether the monitor daemon recorded in the pid file is still alive.
#
# Returns 0 when the pid file holds a pid that answers `kill -0`.
# Returns 1 when the pid file does not exist, or when the recorded pid is
# empty or cannot be signalled; in that second case both the pid file and the
# targets file are removed so stale state does not linger.
sync_monitor_daemon_state() {
  local pid_path targets_path recorded_pid
  pid_path="$(monitor_pid_file)"
  targets_path="$(monitor_targets_file)"

  # No pid file: nothing to reconcile, the targets file is left untouched.
  [ -f "${pid_path}" ] || return 1

  recorded_pid="$(cat "${pid_path}" 2>/dev/null || true)"
  if [ -z "${recorded_pid}" ] || ! kill -0 "${recorded_pid}" 2>/dev/null; then
    rm -f "${pid_path}" "${targets_path}"
    return 1
  fi

  return 0
}
  134 +
# Print the contents of the monitor targets file, if it exists.
# Prints nothing when the file is absent; read errors are suppressed and the
# function still returns 0.
monitor_current_targets() {
  local targets_path
  targets_path="$(monitor_targets_file)"
  [ -f "${targets_path}" ] || return 0
  cat "${targets_path}" 2>/dev/null || true
}
120 140
# Combine two whitespace-separated target lists into one.
#
# $1 - base target list (optional)
# $2 - extra target list (optional)
#
# The joined list is passed through normalize_targets; when the result is
# non-empty it is additionally passed through `apply_target_order monitor`.
# The final list (possibly empty) is written to stdout.
merge_targets() {
  local combined
  combined="$(normalize_targets "${1:-} ${2:-}")"

  if [ -z "${combined}" ]; then
    # Nothing survived normalization: emit the empty line and stop here.
    echo "${combined}"
    return 0
  fi

  combined="$(apply_target_order monitor "${combined}")"
  echo "${combined}"
}
  151 +
# Print the targets of $1 that do not appear in $2.
#
# $1 - base target list, whitespace-separated (optional)
# $2 - targets to remove, whitespace-separated (optional)
#
# A non-empty remainder is passed through normalize_targets and then
# `apply_target_order monitor` before being written to stdout; an empty
# remainder is printed as an empty line.
subtract_targets() {
  local base="${1:-}"
  local remove="${2:-}"
  local name
  local kept=""
  declare -A drop=()

  # Unquoted expansion is deliberate: the lists are split on whitespace.
  for name in ${remove}; do
    drop["${name}"]=1
  done

  for name in ${base}; do
    if [ "${drop[${name}]:-0}" = "1" ]; then
      continue
    fi
    kept="${kept} ${name}"
  done

  # Strip the single leading space introduced by the first append.
  kept="${kept# }"
  if [ -z "${kept}" ]; then
    echo "${kept}"
    return 0
  fi

  kept="$(normalize_targets "${kept}")"
  kept="$(apply_target_order monitor "${kept}")"
  echo "${kept}"
}
  176 +
121 monitor_log_event() { 177 monitor_log_event() {
122 local service="$1" 178 local service="$1"
123 local level="$2" 179 local level="$2"
@@ -125,7 +181,11 @@ monitor_log_event() { @@ -125,7 +181,11 @@ monitor_log_event() {
125 local ts line 181 local ts line
126 ts="$(date '+%F %T')" 182 ts="$(date '+%F %T')"
127 line="[${ts}] [${level}] [${service}] ${message}" 183 line="[${ts}] [${level}] [${service}] ${message}"
128 - echo "${line}" | tee -a "$(monitor_log_file)" 184 + if [ -t 1 ]; then
  185 + echo "${line}" | tee -a "$(monitor_log_file)"
  186 + else
  187 + echo "${line}" >> "$(monitor_log_file)"
  188 + fi
129 } 189 }
130 190
131 require_positive_int() { 191 require_positive_int() {
@@ -156,9 +216,13 @@ service_healthy_now() { @@ -156,9 +216,13 @@ service_healthy_now() {
156 216
157 port="$(get_port "${service}")" 217 port="$(get_port "${service}")"
158 path="$(health_path_for_service "${service}")" 218 path="$(health_path_for_service "${service}")"
159 - if [ -z "${port}" ] || [ -z "${path}" ]; then 219 + if [ -z "${port}" ]; then
160 return 1 220 return 1
161 fi 221 fi
  222 + if [ -z "${path}" ]; then
  223 + is_running_by_pid "${service}" || is_running_by_port "${service}"
  224 + return
  225 + fi
162 if ! is_running_by_port "${service}"; then 226 if ! is_running_by_port "${service}"; then
163 return 1 227 return 1
164 fi 228 fi
@@ -228,6 +292,17 @@ monitor_services() { @@ -228,6 +292,17 @@ monitor_services() {
228 292
229 touch "$(monitor_log_file)" 293 touch "$(monitor_log_file)"
230 294
  295 + if [ "${MONITOR_DAEMON:-0}" = "1" ]; then
  296 + echo "$$" > "$(monitor_pid_file)"
  297 + echo "${targets}" > "$(monitor_targets_file)"
  298 + trap '
  299 + current_pid="$(cat "$(monitor_pid_file)" 2>/dev/null || true)"
  300 + if [ "${current_pid}" = "$$" ]; then
  301 + rm -f "$(monitor_pid_file)" "$(monitor_targets_file)"
  302 + fi
  303 + ' EXIT
  304 + fi
  305 +
231 declare -A fail_streak=() 306 declare -A fail_streak=()
232 declare -A last_restart_epoch=() 307 declare -A last_restart_epoch=()
233 declare -A restart_history=() 308 declare -A restart_history=()
@@ -295,14 +370,7 @@ monitor_services() { @@ -295,14 +370,7 @@ monitor_services() {
295 } 370 }
296 371
# Succeed (exit 0) only when the monitor daemon is alive.
# The check is delegated to sync_monitor_daemon_state and its exit status is
# returned unchanged.
is_monitor_daemon_running() {
  sync_monitor_daemon_state
  return $?
}
307 375
308 stop_monitor_daemon() { 376 stop_monitor_daemon() {
@@ -348,9 +416,15 @@ start_monitor_daemon() { @@ -348,9 +416,15 @@ start_monitor_daemon() {
348 fi 416 fi
349 417
350 echo "${targets}" > "${tf}" 418 echo "${targets}" > "${tf}"
351 - nohup "${PROJECT_ROOT}/scripts/service_ctl.sh" monitor ${targets} >/dev/null 2>&1 & 419 + MONITOR_DAEMON=1 nohup "${PROJECT_ROOT}/scripts/service_ctl.sh" monitor ${targets} >> "$(monitor_log_file)" 2>&1 &
352 local pid=$! 420 local pid=$!
353 echo "${pid}" > "${pf}" 421 echo "${pid}" > "${pf}"
  422 + sleep 1
  423 + if ! kill -0 "${pid}" 2>/dev/null; then
  424 + rm -f "${pf}" "${tf}"
  425 + echo "[error] monitor daemon failed to stay alive, inspect $(monitor_log_file)" >&2
  426 + return 1
  427 + fi
354 echo "[ok] monitor daemon started (pid=${pid}, targets=[${targets}], log=$(monitor_log_file))" 428 echo "[ok] monitor daemon started (pid=${pid}, targets=[${targets}], log=$(monitor_log_file))"
355 } 429 }
356 430
@@ -378,7 +452,11 @@ is_running_by_pid() { @@ -378,7 +452,11 @@ is_running_by_pid() {
378 fi 452 fi
379 local pid 453 local pid
380 pid="$(cat "${pf}" 2>/dev/null || true)" 454 pid="$(cat "${pf}" 2>/dev/null || true)"
381 - [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null 455 + if [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then
  456 + return 0
  457 + fi
  458 + rm -f "${pf}"
  459 + return 1
382 } 460 }
383 461
384 is_running_by_port() { 462 is_running_by_port() {
@@ -403,6 +481,34 @@ get_cnclip_flow_device() { @@ -403,6 +481,34 @@ get_cnclip_flow_device() {
403 sed -n "s/^[[:space:]]*device:[[:space:]]*'\\([^']*\\)'.*/\\1/p" "${flow_file}" | head -n 1 481 sed -n "s/^[[:space:]]*device:[[:space:]]*'\\([^']*\\)'.*/\\1/p" "${flow_file}" | head -n 1
404 } 482 }
405 483
# Print the number of startup health-check retries for a service.
#
# $1 - service name
#
# Prints 90 for "reranker" and 30 for every other name.
start_health_retries_for_service() {
  local service="$1"
  if [ "${service}" = "reranker" ]; then
    echo 90
  else
    echo 30
  fi
}
  491 +
# Wait for a freshly started service to become healthy and stay healthy.
#
# $1 - service name
# $2 - pid of the started process (used only in the success message)
# $3 - log file path (used only in the messages)
#
# Returns 0 and prints an "[ok]" line on stdout when wait_for_health and then
# wait_for_stable_health both succeed. Otherwise prints an "[error]" line on
# stderr and returns 1.
wait_for_startup_health() {
  local service="$1"
  local pid="$2"
  local lf="$3"
  local max_attempts
  max_attempts="$(start_health_retries_for_service "${service}")"

  if ! wait_for_health "${service}" "${max_attempts}"; then
    echo "[error] ${service} health check timeout, inspect ${lf}" >&2
    return 1
  fi

  # NOTE(review): the "5 1" arguments are presumably a check count and an
  # interval in seconds; confirm against wait_for_stable_health.
  if ! wait_for_stable_health "${service}" 5 1; then
    echo "[error] ${service} became unavailable right after startup, inspect ${lf}" >&2
    return 1
  fi

  echo "[ok] ${service} healthy (pid=${pid}, log=${lf})"
  return 0
}
  511 +
406 start_one() { 512 start_one() {
407 local service="$1" 513 local service="$1"
408 cd "${PROJECT_ROOT}" 514 cd "${PROJECT_ROOT}"
@@ -466,37 +572,12 @@ start_one() { @@ -466,37 +572,12 @@ start_one() {
466 return 1 572 return 1
467 fi 573 fi
468 ;; 574 ;;
469 - backend|indexer|frontend|embedding|translator)  
470 - echo "[start] ${service}"  
471 - nohup "${cmd}" > >("${PROJECT_ROOT}/scripts/daily_log_router.sh" "${service}" "${LOG_DIR}" "${LOG_RETENTION_DAYS}") 2>&1 &  
472 - local pid=$!  
473 - echo "${pid}" > "${pf}"  
474 - local retries=30  
475 - if wait_for_health "${service}" "${retries}"; then  
476 - echo "[ok] ${service} healthy (pid=${pid}, log=${lf})"  
477 - else  
478 - echo "[error] ${service} health check timeout, inspect ${lf}" >&2  
479 - return 1  
480 - fi  
481 - ;;  
482 - reranker) 575 + backend|indexer|frontend|embedding|translator|reranker)
483 echo "[start] ${service}" 576 echo "[start] ${service}"
484 - # Start reranker directly so pid file points to the script process that  
485 - # will exec uvicorn, avoiding extra shell wrapper lifecycle edge-cases.  
486 nohup "${cmd}" > >("${PROJECT_ROOT}/scripts/daily_log_router.sh" "${service}" "${LOG_DIR}" "${LOG_RETENTION_DAYS}") 2>&1 & 577 nohup "${cmd}" > >("${PROJECT_ROOT}/scripts/daily_log_router.sh" "${service}" "${LOG_DIR}" "${LOG_RETENTION_DAYS}") 2>&1 &
487 local pid=$! 578 local pid=$!
488 echo "${pid}" > "${pf}" 579 echo "${pid}" > "${pf}"
489 - if wait_for_health "${service}" 90; then  
490 - if wait_for_stable_health "${service}" 5 1; then  
491 - echo "[ok] ${service} healthy (pid=${pid}, log=${lf})"  
492 - else  
493 - echo "[error] ${service} became unavailable right after startup, inspect ${lf}" >&2  
494 - return 1  
495 - fi  
496 - else  
497 - echo "[error] ${service} health check timeout, inspect ${lf}" >&2  
498 - return 1  
499 - fi 580 + wait_for_startup_health "${service}" "${pid}" "${lf}"
500 ;; 581 ;;
501 *) 582 *)
502 echo "[warn] ${service} unsupported start path" 583 echo "[warn] ${service} unsupported start path"
@@ -797,6 +878,7 @@ main() { @@ -797,6 +878,7 @@ main() {
797 local targets="" 878 local targets=""
798 local monitor_was_running=0 879 local monitor_was_running=0
799 local monitor_prev_targets="" 880 local monitor_prev_targets=""
  881 + local auto_monitor_on_start="${SERVICE_CTL_AUTO_MONITOR_ON_START:-1}"
800 882
801 case "${action}" in 883 case "${action}" in
802 monitor-stop|monitor-status) 884 monitor-stop|monitor-status)
@@ -834,11 +916,21 @@ main() { @@ -834,11 +916,21 @@ main() {
834 for svc in ${targets}; do 916 for svc in ${targets}; do
835 start_one "${svc}" 917 start_one "${svc}"
836 done 918 done
  919 + if [ "${auto_monitor_on_start}" = "1" ]; then
  920 + start_monitor_daemon "$(merge_targets "$(monitor_current_targets)" "${targets}")"
  921 + fi
837 ;; 922 ;;
838 stop) 923 stop)
839 if is_monitor_daemon_running; then 924 if is_monitor_daemon_running; then
840 - echo "[info] stopping monitor daemon before manual stop"  
841 - stop_monitor_daemon 925 + local remaining_targets
  926 + remaining_targets="$(subtract_targets "$(monitor_current_targets)" "${targets}")"
  927 + if [ -n "${remaining_targets}" ]; then
  928 + echo "[info] updating monitor daemon targets -> [${remaining_targets}]"
  929 + start_monitor_daemon "${remaining_targets}"
  930 + else
  931 + echo "[info] stopping monitor daemon before manual stop"
  932 + stop_monitor_daemon
  933 + fi
842 fi 934 fi
843 for svc in ${targets}; do 935 for svc in ${targets}; do
844 stop_one "${svc}" 936 stop_one "${svc}"
@@ -864,6 +956,8 @@ main() { @@ -864,6 +956,8 @@ main() {
864 monitor_prev_targets="$(apply_target_order monitor "${monitor_prev_targets}")" 956 monitor_prev_targets="$(apply_target_order monitor "${monitor_prev_targets}")"
865 [ -z "${monitor_prev_targets}" ] && monitor_prev_targets="${targets}" 957 [ -z "${monitor_prev_targets}" ] && monitor_prev_targets="${targets}"
866 start_monitor_daemon "${monitor_prev_targets}" 958 start_monitor_daemon "${monitor_prev_targets}"
  959 + elif [ "${auto_monitor_on_start}" = "1" ]; then
  960 + start_monitor_daemon "$(merge_targets "$(monitor_current_targets)" "${targets}")"
867 fi 961 fi
868 ;; 962 ;;
869 status) 963 status)
status.sh 100644 → 100755
tests/ci/test_service_api_contracts.py
@@ -578,6 +578,14 @@ class _FakeTranslator: @@ -578,6 +578,14 @@ class _FakeTranslator:
578 return f"{text}-{target_lang}" 578 return f"{text}-{target_lang}"
579 579
580 580
  581 +class _FailingTranslator:
  582 + model = "qwen"
  583 + use_cache = True
  584 +
  585 + def translate(self, text: str, target_lang: str, source_lang: str | None = None, prompt: str | None = None):
  586 + return None
  587 +
  588 +
581 @pytest.fixture 589 @pytest.fixture
582 def translator_client(monkeypatch): 590 def translator_client(monkeypatch):
583 import api.translator_app as translator_app 591 import api.translator_app as translator_app
@@ -598,6 +606,22 @@ def test_translator_api_contract(translator_client: TestClient): @@ -598,6 +606,22 @@ def test_translator_api_contract(translator_client: TestClient):
598 assert response.json()["translated_text"] == "商品名称-en" 606 assert response.json()["translated_text"] == "商品名称-en"
599 607
600 608
def test_translator_api_failure_returns_500(monkeypatch):
    """POST /translate answers 500 "Translation failed" when translate() returns None."""
    import api.translator_app as translator_app

    # Clear the registered startup handlers before TestClient starts the app.
    translator_app.app.router.on_startup.clear()
    monkeypatch.setattr(translator_app, "get_translator", lambda model="qwen": _FailingTranslator())

    payload = {"text": "商品名称", "target_lang": "en", "source_lang": "zh"}
    with TestClient(translator_app.app) as client:
        response = client.post("/translate", json=payload)

    assert response.status_code == 500
    assert response.json()["detail"] == "Translation failed"
  623 +
  624 +
601 def test_translator_health_contract(translator_client: TestClient): 625 def test_translator_health_contract(translator_client: TestClient):
602 response = translator_client.get("/health") 626 response = translator_client.get("/health")
603 assert response.status_code == 200 627 assert response.status_code == 200
tests/test_translator_failure_semantics.py 0 → 100644
@@ -0,0 +1,30 @@ @@ -0,0 +1,30 @@
  1 +from query.translator import Translator
  2 +
  3 +
  4 +class _RecordingRedis:
  5 + def __init__(self):
  6 + self.setex_calls = []
  7 +
  8 + def setex(self, key, ttl, value):
  9 + self.setex_calls.append((key, ttl, value))
  10 +
  11 +
def test_translate_failure_returns_none_and_skips_cache(monkeypatch):
    """A failed backend call makes translate() return None and write nothing to the cache."""
    # Built with use_cache=False, then the cache attributes are set by hand.
    # NOTE(review): presumably this avoids a real Redis connection in __init__ - confirm.
    translator = Translator(model="qwen", api_key="dummy-key", use_cache=False)
    recorder = _RecordingRedis()
    translator.use_cache = True
    translator.redis_client = recorder
    translator.cache_prefix = "trans"
    translator.expire_seconds = 60

    def _always_fail(*args, **kwargs):
        return None

    monkeypatch.setattr(translator, "_translate_qwen", _always_fail)

    outcome = translator.translate(
        text="商品标题",
        target_lang="en",
        source_lang="zh",
        prompt="translate for product search",
    )

    assert outcome is None
    assert recorder.setex_calls == []