Commit 28e57bb16712929459a9d1eb43c2800a94ef2d8b

Authored by tangwang
1 parent af7ee060

日志体系优化

README.md
... ... @@ -29,7 +29,7 @@ source activate.sh
29 29 ./run.sh
30 30  
31 31 # 可选:附加能力服务(按需开启)
32   -./scripts/service_ctl.sh start embedding translator reranker tei cnclip
  32 +./scripts/service_ctl.sh start tei cnclip embedding translator reranker
33 33  
34 34 # 查看状态
35 35 ./scripts/service_ctl.sh status
... ...
api/app.py
... ... @@ -11,6 +11,7 @@ import logging
11 11 import time
12 12 import argparse
13 13 import uvicorn
  14 +from logging.handlers import TimedRotatingFileHandler
14 15 from collections import defaultdict, deque
15 16 from typing import Optional
16 17 from fastapi import FastAPI, Request, HTTPException
... ... @@ -23,18 +24,60 @@ from slowapi import Limiter, _rate_limit_exceeded_handler
23 24 from slowapi.util import get_remote_address
24 25 from slowapi.errors import RateLimitExceeded
25 26  
26   -# Configure logging with better formatting
  27 +# Configure backend logging
27 28 import pathlib
28   -log_dir = pathlib.Path('logs')
29   -log_dir.mkdir(exist_ok=True)
30   -logging.basicConfig(
31   - level=logging.INFO,
32   - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
33   - handlers=[
34   - logging.StreamHandler(),
35   - logging.FileHandler(log_dir / 'api.log', mode='a', encoding='utf-8')
36   - ]
37   -)
  29 +
  30 +
  31 +def configure_backend_logging() -> None:
  32 + log_dir = pathlib.Path("logs")
  33 + log_dir.mkdir(exist_ok=True)
  34 + log_level = os.getenv("LOG_LEVEL", "INFO").upper()
  35 + numeric_level = getattr(logging, log_level, logging.INFO)
  36 +
  37 + default_formatter = logging.Formatter(
  38 + "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
  39 + )
  40 +
  41 + root_logger = logging.getLogger()
  42 + root_logger.setLevel(numeric_level)
  43 + root_logger.handlers.clear()
  44 +
  45 + console_handler = logging.StreamHandler()
  46 + console_handler.setLevel(numeric_level)
  47 + console_handler.setFormatter(default_formatter)
  48 + root_logger.addHandler(console_handler)
  49 +
  50 + backend_handler = TimedRotatingFileHandler(
  51 + filename=log_dir / "api.log",
  52 + when="midnight",
  53 + interval=1,
  54 + backupCount=30,
  55 + encoding="utf-8",
  56 + )
  57 + backend_handler.setLevel(numeric_level)
  58 + backend_handler.setFormatter(default_formatter)
  59 + root_logger.addHandler(backend_handler)
  60 +
  61 + verbose_logger = logging.getLogger("backend.verbose")
  62 + verbose_logger.setLevel(numeric_level)
  63 + verbose_logger.handlers.clear()
  64 + verbose_logger.propagate = False
  65 +
  66 + verbose_handler = TimedRotatingFileHandler(
  67 + filename=log_dir / "backend_verbose.log",
  68 + when="midnight",
  69 + interval=1,
  70 + backupCount=30,
  71 + encoding="utf-8",
  72 + )
  73 + verbose_handler.setLevel(numeric_level)
  74 + verbose_handler.setFormatter(
  75 + logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
  76 + )
  77 + verbose_logger.addHandler(verbose_handler)
  78 +
  79 +
  80 +configure_backend_logging()
38 81 logger = logging.getLogger(__name__)
39 82  
40 83 # Initialize rate limiter
... ...
api/routes/search.py
... ... @@ -5,6 +5,9 @@ Search API routes.
5 5 from fastapi import APIRouter, HTTPException, Query, Request
6 6 from typing import Optional
7 7 import uuid
  8 +import hashlib
  9 +import json
  10 +import logging
8 11  
9 12 from ..models import (
10 13 SearchRequest,
... ... @@ -17,6 +20,15 @@ from ..models import (
17 20 from context.request_context import create_request_context, set_current_request_context, clear_current_request_context
18 21  
19 22 router = APIRouter(prefix="/search", tags=["search"])
  23 +backend_verbose_logger = logging.getLogger("backend.verbose")
  24 +
  25 +
  26 +def _log_backend_verbose(payload: dict) -> None:
  27 + if not backend_verbose_logger.handlers:
  28 + return
  29 + backend_verbose_logger.info(
  30 + json.dumps(payload, ensure_ascii=False, separators=(",", ":"))
  31 + )
20 32  
21 33  
22 34 def extract_request_info(request: Request) -> tuple[str, str]:
... ... @@ -136,15 +148,32 @@ async def search(request: SearchRequest, http_request: Request):
136 148 debug_info=result.debug_info
137 149 )
138 150  
139   - # Log complete response JSON
  151 + response_payload = response.model_dump(mode="json")
  152 + response_json = json.dumps(response_payload, ensure_ascii=False, separators=(",", ":"))
  153 + response_digest = hashlib.sha256(response_json.encode("utf-8")).hexdigest()[:16]
  154 + max_score = float(response.max_score or 0.0)
  155 +
140 156 context.logger.info(
141   - "Search response | "
142   - f"Total results: {response.total} | "
143   - f"Max score: {response.max_score:.4f} | "
144   - f"Time: {response.took_ms}ms | "
145   - f"Response: {response.model_dump_json()}",
  157 + "Search response | Total results: %s | Max score: %.4f | Time: %sms | payload_size: %s chars | digest: %s",
  158 + response.total,
  159 + max_score,
  160 + response.took_ms,
  161 + len(response_json),
  162 + response_digest,
146 163 extra={'reqid': context.reqid, 'uid': context.uid}
147 164 )
  165 + _log_backend_verbose({
  166 + "event": "search_response",
  167 + "reqid": context.reqid,
  168 + "uid": context.uid,
  169 + "tenant_id": tenant_id,
  170 + "total_results": response.total,
  171 + "max_score": max_score,
  172 + "took_ms": response.took_ms,
  173 + "payload_size_chars": len(response_json),
  174 + "sha256_16": response_digest,
  175 + "response": response_payload,
  176 + })
148 177  
149 178 return response
150 179  
... ... @@ -233,15 +262,32 @@ async def search_by_image(request: ImageSearchRequest, http_request: Request):
233 262 performance_info=performance_summary
234 263 )
235 264  
236   - # Log complete response JSON
  265 + response_payload = response.model_dump(mode="json")
  266 + response_json = json.dumps(response_payload, ensure_ascii=False, separators=(",", ":"))
  267 + response_digest = hashlib.sha256(response_json.encode("utf-8")).hexdigest()[:16]
  268 + max_score = float(response.max_score or 0.0)
  269 +
237 270 context.logger.info(
238   - "Image search response | "
239   - f"Total results: {response.total} | "
240   - f"Max score: {response.max_score:.4f} | "
241   - f"Time: {response.took_ms}ms | "
242   - f"Response: {response.model_dump_json()}",
  271 + "Image search response | Total results: %s | Max score: %.4f | Time: %sms | payload_size: %s chars | digest: %s",
  272 + response.total,
  273 + max_score,
  274 + response.took_ms,
  275 + len(response_json),
  276 + response_digest,
243 277 extra={'reqid': context.reqid, 'uid': context.uid}
244 278 )
  279 + _log_backend_verbose({
  280 + "event": "image_search_response",
  281 + "reqid": context.reqid,
  282 + "uid": context.uid,
  283 + "tenant_id": tenant_id,
  284 + "total_results": response.total,
  285 + "max_score": max_score,
  286 + "took_ms": response.took_ms,
  287 + "payload_size_chars": len(response_json),
  288 + "sha256_16": response_digest,
  289 + "response": response_payload,
  290 + })
245 291  
246 292 return response
247 293  
... ...
docs/QUICKSTART.md
... ... @@ -510,16 +510,11 @@ curl http://localhost:6007/health
510 510  
511 511 ### 5.2 常看日志
512 512  
513   -- `logs/backend.log`
514   -- `logs/indexer.log`
515   -- `logs/frontend.log`
516   -- `logs/embedding.log`
517   -- `logs/translator.log`
518   -- `logs/reranker.log`
519   -- `logs/tei.log`
520   -- `logs/cnclip.log`
521   -- `logs/search_engine.log`
522   -- `logs/errors.log`
  513 +- `logs/<service>-YYYY-MM-DD.log`(`service_ctl.sh` 按天写入的真实文件)
  514 +- `logs/<service>.log`(指向当天文件的软链,推荐 `tail -F`)
  515 +- `logs/api.log`(backend 进程内日志,按天轮转)
  516 +- `logs/backend_verbose.log`(backend 大对象详细日志,按天轮转)
  517 +- `logs/indexer.log`(索引结构化 JSON 日志,按天轮转)
523 518  
524 519 ### 5.3 常用排障命令
525 520  
... ...
docs/TEI_SERVICE说明文档.md
... ... @@ -152,7 +152,7 @@ curl -sS -X POST "http://127.0.0.1:6005/embed/text" \
152 152 启动全套(含 TEI):
153 153  
154 154 ```bash
155   -TEI_DEVICE=cuda ./scripts/service_ctl.sh start embedding translator reranker tei cnclip
  155 +TEI_DEVICE=cuda ./scripts/service_ctl.sh start tei cnclip embedding translator reranker
156 156 ```
157 157  
158 158 仅启动 TEI:
... ...
docs/Usage-Guide.md
... ... @@ -368,31 +368,23 @@ RERANKER_PORT=6007
368 368  
369 369 日志文件存储在 `logs/` 目录下:
370 370  
371   -- `logs/backend.log` - 后端服务日志
372   -- `logs/indexer.log` - 索引服务日志
373   -- `logs/frontend.log` - 前端服务日志
374   -- `logs/embedding.log` - 向量服务日志(可选)
375   -- `logs/translator.log` - 翻译服务日志(可选)
376   -- `logs/reranker.log` - 重排服务日志(可选)
377   -- `logs/tei.log` - TEI 启停日志(可选)
378   -- `logs/cnclip.log` - CN-CLIP 启停日志(可选)
379   -- `logs/search_engine.log` - 应用主日志(按天轮转)
380   -- `logs/errors.log` - 错误日志(按天轮转)
  371 +- `logs/<service>-YYYY-MM-DD.log` - `service_ctl.sh` 统一管理的按天日志文件(真实文件)
  372 +- `logs/<service>.log` - 指向当天日志文件的软链(兼容原有命令)
  373 +- `logs/api.log` - backend 进程内日志(按天轮转)
  374 +- `logs/backend_verbose.log` - backend 详细大对象日志(按天轮转,含 `Search response` / `ES query built` 完整内容)
  375 +- `logs/indexer.log` - 索引结构化日志(JSON,按天轮转)
381 376  
382 377 ### 查看实时日志
383 378  
384 379 ```bash
385   -# 查看后端日志
386   -tail -f logs/backend.log
  380 +# 查看后端日志(建议 -F,跨天自动跟随新文件)
  381 +tail -F logs/backend.log
387 382  
388 383 # 查看前端日志
389   -tail -f logs/frontend.log
  384 +tail -F logs/frontend.log
390 385  
391   -# 查看应用主日志
392   -tail -f logs/search_engine.log
393   -
394   -# 查看错误日志
395   -tail -f logs/errors.log
  386 +# 查看 backend 详细大对象日志
  387 +tail -F logs/backend_verbose.log
396 388 ```
397 389  
398 390 ### 日志级别
... ... @@ -406,7 +398,7 @@ LOG_LEVEL=DEBUG # DEBUG, INFO, WARNING, ERROR, CRITICAL
406 398  
407 399 ### 日志轮转
408 400  
409   -日志文件按天自动轮转,保留30天的历史日志
  401 +日志文件按天自动轮转,默认保留 30 天历史日志(可通过 `LOG_RETENTION_DAYS` 调整 `service_ctl.sh` 管理日志的保留天数)
410 402  
411 403 ---
412 404  
... ...
embeddings/server.py
... ... @@ -56,6 +56,29 @@ _TEXT_MICROBATCH_WINDOW_SEC = max(
56 56 _TEXT_REQUEST_TIMEOUT_SEC = max(
57 57 1.0, float(os.getenv("TEXT_REQUEST_TIMEOUT_SEC", "30"))
58 58 )
  59 +_LOG_PREVIEW_COUNT = max(1, int(os.getenv("EMBEDDING_LOG_PREVIEW_COUNT", "3")))
  60 +_LOG_TEXT_PREVIEW_CHARS = max(32, int(os.getenv("EMBEDDING_LOG_TEXT_PREVIEW_CHARS", "120")))
  61 +_LOG_IMAGE_PREVIEW_CHARS = max(32, int(os.getenv("EMBEDDING_LOG_IMAGE_PREVIEW_CHARS", "180")))
  62 +
  63 +
  64 +def _compact_preview(text: str, max_chars: int) -> str:
  65 + compact = " ".join((text or "").split())
  66 + if len(compact) <= max_chars:
  67 + return compact
  68 + return compact[:max_chars] + "..."
  69 +
  70 +
  71 +def _preview_inputs(items: List[str], max_items: int, max_chars: int) -> List[Dict[str, Any]]:
  72 + previews: List[Dict[str, Any]] = []
  73 + for idx, item in enumerate(items[:max_items]):
  74 + previews.append(
  75 + {
  76 + "idx": idx,
  77 + "len": len(item),
  78 + "preview": _compact_preview(item, max_chars),
  79 + }
  80 + )
  81 + return previews
59 82  
60 83  
61 84 def _encode_local_st(texts: List[str], normalize_embeddings: bool) -> Any:
... ... @@ -292,6 +315,14 @@ def embed_text(texts: List[str], normalize: Optional[bool] = None) -> List[Optio
292 315 raise HTTPException(status_code=400, detail=f"Invalid text at index {i}: empty string")
293 316 normalized.append(s)
294 317  
  318 + logger.info(
  319 + "embed_text request | backend=%s inputs=%d normalize=%s preview=%s",
  320 + _text_backend_name,
  321 + len(normalized),
  322 + effective_normalize,
  323 + _preview_inputs(normalized, _LOG_PREVIEW_COUNT, _LOG_TEXT_PREVIEW_CHARS),
  324 + )
  325 +
295 326 t0 = time.perf_counter()
296 327 try:
297 328 # local_st backend uses in-process torch model, keep serialized encode for safety;
... ... @@ -301,10 +332,11 @@ def embed_text(texts: List[str], normalize: Optional[bool] = None) -&gt; List[Optio
301 332 out = [_encode_single_text_with_microbatch(normalized[0], normalize=effective_normalize)]
302 333 elapsed_ms = (time.perf_counter() - t0) * 1000.0
303 334 logger.info(
304   - "embed_text done | backend=%s mode=microbatch-single inputs=%d normalize=%s elapsed_ms=%.2f",
  335 + "embed_text done | backend=%s mode=microbatch-single inputs=%d normalize=%s dim=%d elapsed_ms=%.2f",
305 336 _text_backend_name,
306 337 len(normalized),
307 338 effective_normalize,
  339 + len(out[0]) if out and out[0] is not None else 0,
308 340 elapsed_ms,
309 341 )
310 342 return out
... ... @@ -335,10 +367,11 @@ def embed_text(texts: List[str], normalize: Optional[bool] = None) -&gt; List[Optio
335 367 out.append(vec)
336 368 elapsed_ms = (time.perf_counter() - t0) * 1000.0
337 369 logger.info(
338   - "embed_text done | backend=%s inputs=%d normalize=%s elapsed_ms=%.2f",
  370 + "embed_text done | backend=%s inputs=%d normalize=%s dim=%d elapsed_ms=%.2f",
339 371 _text_backend_name,
340 372 len(normalized),
341 373 effective_normalize,
  374 + len(out[0]) if out and out[0] is not None else 0,
342 375 elapsed_ms,
343 376 )
344 377 return out
... ... @@ -358,6 +391,14 @@ def embed_image(images: List[str], normalize: Optional[bool] = None) -> List[Opt
358 391 raise HTTPException(status_code=400, detail=f"Invalid image at index {i}: empty URL/path")
359 392 urls.append(s)
360 393  
  394 + logger.info(
  395 + "embed_image request | inputs=%d normalize=%s preview=%s",
  396 + len(urls),
  397 + effective_normalize,
  398 + _preview_inputs(urls, _LOG_PREVIEW_COUNT, _LOG_IMAGE_PREVIEW_CHARS),
  399 + )
  400 +
  401 + t0 = time.perf_counter()
361 402 with _image_encode_lock:
362 403 vectors = _image_model.encode_image_urls(
363 404 urls,
... ... @@ -375,4 +416,12 @@ def embed_image(images: List[str], normalize: Optional[bool] = None) -> List[Opt
375 416 if out_vec is None:
376 417 raise RuntimeError(f"Image model returned empty embedding for index {i}")
377 418 out.append(out_vec)
  419 + elapsed_ms = (time.perf_counter() - t0) * 1000.0
  420 + logger.info(
  421 + "embed_image done | inputs=%d normalize=%s dim=%d elapsed_ms=%.2f",
  422 + len(urls),
  423 + effective_normalize,
  424 + len(out[0]) if out and out[0] is not None else 0,
  425 + elapsed_ms,
  426 + )
378 427 return out
... ...
reranker/server.py
... ... @@ -9,6 +9,7 @@ Backend selected via config: services.rerank.backend (bge | qwen3_vllm), env RER
9 9 """
10 10  
11 11 import logging
  12 +import os
12 13 import time
13 14 from typing import Any, Dict, List, Optional
14 15  
... ... @@ -29,6 +30,28 @@ app = FastAPI(title="saas-search Reranker Service", version="1.0.0")
29 30  
30 31 _reranker: Optional[RerankBackendProtocol] = None
31 32 _backend_name: str = ""
  33 +_LOG_DOC_PREVIEW_COUNT = max(1, int(os.getenv("RERANK_LOG_DOC_PREVIEW_COUNT", "3")))
  34 +_LOG_TEXT_PREVIEW_CHARS = max(32, int(os.getenv("RERANK_LOG_TEXT_PREVIEW_CHARS", "180")))
  35 +
  36 +
  37 +def _compact_preview(text: str, max_chars: int) -> str:
  38 + compact = " ".join((text or "").split())
  39 + if len(compact) <= max_chars:
  40 + return compact
  41 + return compact[:max_chars] + "..."
  42 +
  43 +
  44 +def _preview_docs(docs: List[str], max_items: int, max_chars: int) -> List[Dict[str, Any]]:
  45 + previews: List[Dict[str, Any]] = []
  46 + for idx, doc in enumerate(docs[:max_items]):
  47 + previews.append(
  48 + {
  49 + "idx": idx,
  50 + "len": len(doc),
  51 + "preview": _compact_preview(doc, max_chars),
  52 + }
  53 + )
  54 + return previews
32 55  
33 56  
34 57 class RerankRequest(BaseModel):
... ... @@ -100,19 +123,25 @@ def rerank(request: RerankRequest) -> RerankResponse:
100 123  
101 124 start_ts = time.time()
102 125 logger.info(
103   - "Rerank request | docs=%d normalize=%s",
  126 + "Rerank request | docs=%d normalize=%s query_len=%d query=%r doc_preview=%s",
104 127 len(request.docs),
105 128 normalize,
  129 + len(query),
  130 + _compact_preview(query, _LOG_TEXT_PREVIEW_CHARS),
  131 + _preview_docs(request.docs, _LOG_DOC_PREVIEW_COUNT, _LOG_TEXT_PREVIEW_CHARS),
106 132 )
107 133 scores, meta = _reranker.score_with_meta(query, request.docs, normalize=normalize)
108 134 meta = dict(meta)
109 135 meta.update({"service_elapsed_ms": round((time.time() - start_ts) * 1000.0, 3)})
  136 + score_preview = [round(float(s), 6) for s in scores[:_LOG_DOC_PREVIEW_COUNT]]
110 137 logger.info(
111   - "Rerank done | docs=%d unique=%s dedup=%s elapsed_ms=%s",
  138 + "Rerank done | docs=%d unique=%s dedup=%s elapsed_ms=%s query=%r score_preview=%s",
112 139 meta.get("input_docs"),
113 140 meta.get("unique_docs"),
114 141 meta.get("dedup_ratio"),
115 142 meta.get("service_elapsed_ms"),
  143 + _compact_preview(query, _LOG_TEXT_PREVIEW_CHARS),
  144 + score_preview,
116 145 )
117 146  
118 147 return RerankResponse(scores=scores, meta=meta)
... ...
scripts/daily_log_router.sh 0 → 100755
... ... @@ -0,0 +1,56 @@
  1 +#!/bin/bash
  2 +#
  3 +# Route incoming log stream into per-day files.
  4 +#
  5 +# Usage:
  6 +# command 2>&1 | ./scripts/daily_log_router.sh <service> <log_dir> [retention_days]
  7 +#
  8 +
  9 +set -euo pipefail
  10 +
  11 +if [ "$#" -lt 2 ]; then
  12 + echo "Usage: $0 <service> <log_dir> [retention_days]" >&2
  13 + exit 1
  14 +fi
  15 +
  16 +SERVICE_NAME="$1"
  17 +LOG_DIR="$2"
  18 +RETENTION_DAYS="${3:-30}"
  19 +
  20 +mkdir -p "${LOG_DIR}"
  21 +
  22 +awk -v dir="${LOG_DIR}" -v service="${SERVICE_NAME}" -v retention_days="${RETENTION_DAYS}" '
  23 +function rotate_file(day) {
  24 + return sprintf("%s/%s-%s.log", dir, service, day)
  25 +}
  26 +
  27 +function update_symlink(day) {
  28 + cmd = sprintf("ln -sfn \"%s-%s.log\" \"%s/%s.log\"", service, day, dir, service)
  29 + system(cmd)
  30 +}
  31 +
  32 +function cleanup_old_logs() {
  33 + cmd = sprintf("find \"%s\" -maxdepth 1 -type f -name \"%s-*.log\" -mtime +%d -delete >/dev/null 2>&1", dir, service, retention_days)
  34 + system(cmd)
  35 +}
  36 +
  37 +{
  38 + day = strftime("%Y-%m-%d")
  39 + target = rotate_file(day)
  40 +
  41 + if (target != current_target) {
  42 + update_symlink(day)
  43 + cleanup_old_logs()
  44 + current_target = target
  45 + }
  46 +
  47 + print >> current_target
  48 + fflush(current_target)
  49 +}
  50 +
  51 +END {
  52 + if (current_target != "") {
  53 + close(current_target)
  54 + }
  55 +}
  56 +'
... ...
scripts/service_ctl.sh
... ... @@ -8,6 +8,7 @@ set -euo pipefail
8 8  
9 9 PROJECT_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
10 10 LOG_DIR="${PROJECT_ROOT}/logs"
  11 +LOG_RETENTION_DAYS="${LOG_RETENTION_DAYS:-30}"
11 12  
12 13 mkdir -p "${LOG_DIR}"
13 14  
... ... @@ -46,6 +47,16 @@ log_file() {
46 47 echo "${LOG_DIR}/${service}.log"
47 48 }
48 49  
  50 +prepare_daily_log_target() {
  51 + local service="$1"
  52 + local day
  53 + local today_file
  54 + day="$(date +%F)"
  55 + today_file="${LOG_DIR}/${service}-${day}.log"
  56 + touch "${today_file}"
  57 + ln -sfn "$(basename "${today_file}")" "$(log_file "${service}")"
  58 +}
  59 +
49 60 service_start_cmd() {
50 61 local service="$1"
51 62 case "${service}" in
... ... @@ -176,6 +187,7 @@ start_one() {
176 187 local pf lf
177 188 pf="$(pid_file "${service}")"
178 189 lf="$(log_file "${service}")"
  190 + prepare_daily_log_target "${service}"
179 191  
180 192 if [ "${service}" != "tei" ]; then
181 193 if is_running_by_pid "${service}" || is_running_by_port "${service}"; then
... ... @@ -203,12 +215,12 @@ start_one() {
203 215 cnclip|tei)
204 216 echo "[start] ${service} (managed by native script)"
205 217 if [ "${service}" = "cnclip" ]; then
206   - if ! CNCLIP_DEVICE="${CNCLIP_DEVICE:-cuda}" bash -lc "${cmd}" >> "${lf}" 2>&1; then
  218 + if ! CNCLIP_DEVICE="${CNCLIP_DEVICE:-cuda}" "${cmd}" > >("${PROJECT_ROOT}/scripts/daily_log_router.sh" "${service}" "${LOG_DIR}" "${LOG_RETENTION_DAYS}") 2>&1; then
207 219 echo "[error] ${service} start script failed, inspect ${lf}" >&2
208 220 return 1
209 221 fi
210 222 else
211   - if ! bash -lc "${cmd}" >> "${lf}" 2>&1; then
  223 + if ! "${cmd}" > >("${PROJECT_ROOT}/scripts/daily_log_router.sh" "${service}" "${LOG_DIR}" "${LOG_RETENTION_DAYS}") 2>&1; then
212 224 echo "[error] ${service} start script failed, inspect ${lf}" >&2
213 225 return 1
214 226 fi
... ... @@ -229,7 +241,7 @@ start_one() {
229 241 ;;
230 242 backend|indexer|frontend|embedding|translator)
231 243 echo "[start] ${service}"
232   - nohup bash -lc "${cmd}" > "${lf}" 2>&1 &
  244 + nohup "${cmd}" > >("${PROJECT_ROOT}/scripts/daily_log_router.sh" "${service}" "${LOG_DIR}" "${LOG_RETENTION_DAYS}") 2>&1 &
233 245 local pid=$!
234 246 echo "${pid}" > "${pf}"
235 247 local retries=30
... ... @@ -244,7 +256,7 @@ start_one() {
244 256 echo "[start] ${service}"
245 257 # Start reranker directly so pid file points to the script process that
246 258 # will exec uvicorn, avoiding extra shell wrapper lifecycle edge-cases.
247   - nohup "${cmd}" > "${lf}" 2>&1 &
  259 + nohup "${cmd}" > >("${PROJECT_ROOT}/scripts/daily_log_router.sh" "${service}" "${LOG_DIR}" "${LOG_RETENTION_DAYS}") 2>&1 &
248 260 local pid=$!
249 261 echo "${pid}" > "${pf}"
250 262 if wait_for_health "${service}" 90; then
... ... @@ -408,9 +420,12 @@ Default target set (when no service provided):
408 420 status -> all known services
409 421  
410 422 Optional service startup:
411   - ./scripts/service_ctl.sh start embedding translator reranker tei cnclip
  423 + ./scripts/service_ctl.sh start tei cnclip embedding translator reranker
412 424 TEI_DEVICE=cuda|cpu ./scripts/service_ctl.sh start tei
413 425 CNCLIP_DEVICE=cuda|cpu ./scripts/service_ctl.sh start cnclip
  426 +
  427 +Log retention:
  428 + LOG_RETENTION_DAYS=30 ./scripts/service_ctl.sh start
414 429 EOF
415 430 }
416 431  
... ...
scripts/start.sh
... ... @@ -12,11 +12,11 @@ echo "saas-search 服务启动"
12 12 echo "========================================"
13 13 echo "默认启动核心服务: backend/indexer/frontend"
14 14 echo "可选服务请显式指定:"
15   -echo " ./scripts/service_ctl.sh start embedding translator reranker tei cnclip"
  15 +echo " ./scripts/service_ctl.sh start tei cnclip embedding translator reranker"
16 16 echo
17 17  
18 18 ./scripts/service_ctl.sh start
19 19  
20 20 echo
21 21 echo "当前服务状态:"
22   -./scripts/service_ctl.sh status backend indexer frontend embedding translator reranker tei cnclip
  22 +./scripts/service_ctl.sh status backend indexer frontend tei cnclip embedding translator reranker
... ...
scripts/start_cnclip_service.sh
... ... @@ -43,7 +43,9 @@ PROJECT_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
43 43 CLIP_SERVER_DIR="${PROJECT_ROOT}/third-party/clip-as-service/server"
44 44 LOG_DIR="${PROJECT_ROOT}/logs"
45 45 PID_FILE="${LOG_DIR}/cnclip.pid"
46   -LOG_FILE="${LOG_DIR}/cnclip.log"
  46 +LOG_LINK="${LOG_DIR}/cnclip.log"
  47 +LOG_FILE="${LOG_DIR}/cnclip-$(date +%F).log"
  48 +LOG_ROUTER_SCRIPT="${PROJECT_ROOT}/scripts/daily_log_router.sh"
47 49  
48 50 # 帮助信息
49 51 show_help() {
... ... @@ -128,6 +130,11 @@ fi
128 130  
129 131 # 创建日志目录
130 132 mkdir -p "${LOG_DIR}"
  133 +if [ ! -x "${LOG_ROUTER_SCRIPT}" ]; then
  134 + echo -e "${RED}错误: 日志路由脚本不存在或不可执行: ${LOG_ROUTER_SCRIPT}${NC}"
  135 + exit 1
  136 +fi
  137 +ln -sfn "$(basename "${LOG_FILE}")" "${LOG_LINK}"
131 138  
132 139 # 检查是否已经有服务在运行
133 140 if [ -f "${PID_FILE}" ]; then
... ... @@ -208,7 +215,7 @@ echo " 服务端口: ${PORT}"
208 215 echo " 协议: gRPC (默认,官方推荐)"
209 216 echo " 其他参数: 使用官方默认值"
210 217 echo " 副本数: ${REPLICAS}"
211   -echo " 日志文件: ${LOG_FILE}"
  218 +echo " 日志文件: ${LOG_LINK}"
212 219 echo ""
213 220  
214 221 # 副本数显存警告
... ... @@ -263,9 +270,9 @@ EOF
263 270  
264 271 echo -e "${GREEN}✓ 已生成配置文件: ${TEMP_FLOW_FILE}${NC}"
265 272  
266   -# 使用 nohup 在后台启动服务
  273 +# 使用 nohup 在后台启动服务(日志按天分流)
267 274 cd "${CLIP_SERVER_DIR}"
268   -nohup python -m clip_server "${TEMP_FLOW_FILE}" > "${LOG_FILE}" 2>&1 &
  275 +nohup bash -lc "exec python -m clip_server \"${TEMP_FLOW_FILE}\" > >(\"${LOG_ROUTER_SCRIPT}\" \"cnclip\" \"${LOG_DIR}\" \"${LOG_RETENTION_DAYS:-30}\") 2>&1" >/dev/null 2>&1 &
269 276  
270 277 # 保存 PID
271 278 SERVICE_PID=$!
... ... @@ -293,7 +300,7 @@ if ps -p ${SERVICE_PID} > /dev/null 2>&1; then
293 300 echo -e " r = c.encode(['测试文本'])"
294 301 echo ""
295 302 echo -e "查看日志:"
296   - echo -e " tail -f ${LOG_FILE}"
  303 + echo -e " tail -F ${LOG_LINK}"
297 304 echo ""
298 305 echo -e "停止服务:"
299 306 echo -e " ./scripts/stop_cnclip_service.sh"
... ... @@ -303,7 +310,7 @@ if ps -p ${SERVICE_PID} &gt; /dev/null 2&gt;&amp;1; then
303 310 echo -e "${YELLOW}等待模型加载完成(约30-60秒)...${NC}"
304 311 sleep 30
305 312 echo -e "${GREEN}✓ 服务已启动,请查看日志确认模型是否加载完成${NC}"
306   - echo -e "${YELLOW}查看日志: tail -f ${LOG_FILE}${NC}"
  313 + echo -e "${YELLOW}查看日志: tail -F ${LOG_LINK}${NC}"
307 314  
308 315 else
309 316 echo -e "${RED}========================================${NC}"
... ... @@ -311,7 +318,7 @@ else
311 318 echo -e "${RED}========================================${NC}"
312 319 echo ""
313 320 echo -e "请查看日志获取详细错误信息:"
314   - echo -e " tail -f ${LOG_FILE}"
  321 + echo -e " tail -F ${LOG_LINK}"
315 322 echo ""
316 323 rm -f "${PID_FILE}"
317 324 exit 1
... ...
search/searcher.py
... ... @@ -8,6 +8,7 @@ from typing import Dict, Any, List, Optional, Union
8 8 import os
9 9 import time, json
10 10 import logging
  11 +import hashlib
11 12  
12 13 from utils.es_client import ESClient
13 14 from query import QueryParser, ParsedQuery
... ... @@ -21,6 +22,15 @@ from api.result_formatter import ResultFormatter
21 22 from indexer.mapping_generator import get_tenant_index_name
22 23  
23 24 logger = logging.getLogger(__name__)
  25 +backend_verbose_logger = logging.getLogger("backend.verbose")
  26 +
  27 +
  28 +def _log_backend_verbose(payload: Dict[str, Any]) -> None:
  29 + if not backend_verbose_logger.handlers:
  30 + return
  31 + backend_verbose_logger.info(
  32 + json.dumps(payload, ensure_ascii=False, separators=(",", ":"))
  33 + )
24 34  
25 35  
26 36 class SearchResult:
... ... @@ -320,16 +330,33 @@ class Searcher:
320 330 context.store_intermediate_result('es_query', es_query)
321 331 context.store_intermediate_result('es_body_for_search', body_for_es)
322 332  
323   - # Serialize ES query as a compact JSON string (no spaces or newlines)
324   - es_query_compact = json.dumps(es_query, ensure_ascii=False, separators=(',', ':'))
  333 + # Serialize ES query to compute a compact size + stable digest for correlation
  334 + es_query_compact = json.dumps(es_query, ensure_ascii=False, separators=(",", ":"))
  335 + es_query_digest = hashlib.sha256(es_query_compact.encode("utf-8")).hexdigest()[:16]
  336 + knn_enabled = bool(enable_embedding and parsed_query.query_vector is not None)
  337 + vector_dims = int(len(parsed_query.query_vector)) if parsed_query.query_vector is not None else 0
325 338  
326 339 context.logger.info(
327   - f"ES query built | size: {len(es_query_compact)} chars | "
328   - f"KNN: {'yes' if enable_embedding and parsed_query.query_vector is not None else 'no'} | "
329   - f"facets: {'yes' if facets else 'no'} | "
330   - f"query: {es_query_compact}",
  340 + "ES query built | size: %s chars | digest: %s | KNN: %s | vector_dims: %s | facets: %s",
  341 + len(es_query_compact),
  342 + es_query_digest,
  343 + "yes" if knn_enabled else "no",
  344 + vector_dims,
  345 + "yes" if facets else "no",
331 346 extra={'reqid': context.reqid, 'uid': context.uid}
332 347 )
  348 + _log_backend_verbose({
  349 + "event": "es_query_built",
  350 + "reqid": context.reqid,
  351 + "uid": context.uid,
  352 + "tenant_id": tenant_id,
  353 + "size_chars": len(es_query_compact),
  354 + "sha256_16": es_query_digest,
  355 + "knn_enabled": knn_enabled,
  356 + "vector_dims": vector_dims,
  357 + "has_facets": bool(facets),
  358 + "query": es_query,
  359 + })
333 360 except Exception as e:
334 361 context.set_error(e)
335 362 context.logger.error(
... ...