Commit 28e57bb16712929459a9d1eb43c2800a94ef2d8b

Authored by tangwang
1 parent af7ee060

日志体系优化

@@ -29,7 +29,7 @@ source activate.sh @@ -29,7 +29,7 @@ source activate.sh
29 ./run.sh 29 ./run.sh
30 30
31 # 可选:附加能力服务(按需开启) 31 # 可选:附加能力服务(按需开启)
32 -./scripts/service_ctl.sh start embedding translator reranker tei cnclip 32 +./scripts/service_ctl.sh start tei cnclip embedding translator reranker
33 33
34 # 查看状态 34 # 查看状态
35 ./scripts/service_ctl.sh status 35 ./scripts/service_ctl.sh status
@@ -11,6 +11,7 @@ import logging @@ -11,6 +11,7 @@ import logging
11 import time 11 import time
12 import argparse 12 import argparse
13 import uvicorn 13 import uvicorn
  14 +from logging.handlers import TimedRotatingFileHandler
14 from collections import defaultdict, deque 15 from collections import defaultdict, deque
15 from typing import Optional 16 from typing import Optional
16 from fastapi import FastAPI, Request, HTTPException 17 from fastapi import FastAPI, Request, HTTPException
@@ -23,18 +24,60 @@ from slowapi import Limiter, _rate_limit_exceeded_handler @@ -23,18 +24,60 @@ from slowapi import Limiter, _rate_limit_exceeded_handler
23 from slowapi.util import get_remote_address 24 from slowapi.util import get_remote_address
24 from slowapi.errors import RateLimitExceeded 25 from slowapi.errors import RateLimitExceeded
25 26
26 -# Configure logging with better formatting 27 +# Configure backend logging
27 import pathlib 28 import pathlib
28 -log_dir = pathlib.Path('logs')  
29 -log_dir.mkdir(exist_ok=True)  
30 -logging.basicConfig(  
31 - level=logging.INFO,  
32 - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',  
33 - handlers=[  
34 - logging.StreamHandler(),  
35 - logging.FileHandler(log_dir / 'api.log', mode='a', encoding='utf-8')  
36 - ]  
37 -) 29 +
  30 +
  31 +def configure_backend_logging() -> None:
  32 + log_dir = pathlib.Path("logs")
  33 + log_dir.mkdir(exist_ok=True)
  34 + log_level = os.getenv("LOG_LEVEL", "INFO").upper()
  35 + numeric_level = getattr(logging, log_level, logging.INFO)
  36 +
  37 + default_formatter = logging.Formatter(
  38 + "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
  39 + )
  40 +
  41 + root_logger = logging.getLogger()
  42 + root_logger.setLevel(numeric_level)
  43 + root_logger.handlers.clear()
  44 +
  45 + console_handler = logging.StreamHandler()
  46 + console_handler.setLevel(numeric_level)
  47 + console_handler.setFormatter(default_formatter)
  48 + root_logger.addHandler(console_handler)
  49 +
  50 + backend_handler = TimedRotatingFileHandler(
  51 + filename=log_dir / "api.log",
  52 + when="midnight",
  53 + interval=1,
  54 + backupCount=30,
  55 + encoding="utf-8",
  56 + )
  57 + backend_handler.setLevel(numeric_level)
  58 + backend_handler.setFormatter(default_formatter)
  59 + root_logger.addHandler(backend_handler)
  60 +
  61 + verbose_logger = logging.getLogger("backend.verbose")
  62 + verbose_logger.setLevel(numeric_level)
  63 + verbose_logger.handlers.clear()
  64 + verbose_logger.propagate = False
  65 +
  66 + verbose_handler = TimedRotatingFileHandler(
  67 + filename=log_dir / "backend_verbose.log",
  68 + when="midnight",
  69 + interval=1,
  70 + backupCount=30,
  71 + encoding="utf-8",
  72 + )
  73 + verbose_handler.setLevel(numeric_level)
  74 + verbose_handler.setFormatter(
  75 + logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
  76 + )
  77 + verbose_logger.addHandler(verbose_handler)
  78 +
  79 +
  80 +configure_backend_logging()
38 logger = logging.getLogger(__name__) 81 logger = logging.getLogger(__name__)
39 82
40 # Initialize rate limiter 83 # Initialize rate limiter
api/routes/search.py
@@ -5,6 +5,9 @@ Search API routes. @@ -5,6 +5,9 @@ Search API routes.
5 from fastapi import APIRouter, HTTPException, Query, Request 5 from fastapi import APIRouter, HTTPException, Query, Request
6 from typing import Optional 6 from typing import Optional
7 import uuid 7 import uuid
  8 +import hashlib
  9 +import json
  10 +import logging
8 11
9 from ..models import ( 12 from ..models import (
10 SearchRequest, 13 SearchRequest,
@@ -17,6 +20,15 @@ from ..models import ( @@ -17,6 +20,15 @@ from ..models import (
17 from context.request_context import create_request_context, set_current_request_context, clear_current_request_context 20 from context.request_context import create_request_context, set_current_request_context, clear_current_request_context
18 21
19 router = APIRouter(prefix="/search", tags=["search"]) 22 router = APIRouter(prefix="/search", tags=["search"])
  23 +backend_verbose_logger = logging.getLogger("backend.verbose")
  24 +
  25 +
  26 +def _log_backend_verbose(payload: dict) -> None:
  27 + if not backend_verbose_logger.handlers:
  28 + return
  29 + backend_verbose_logger.info(
  30 + json.dumps(payload, ensure_ascii=False, separators=(",", ":"))
  31 + )
20 32
21 33
22 def extract_request_info(request: Request) -> tuple[str, str]: 34 def extract_request_info(request: Request) -> tuple[str, str]:
@@ -136,15 +148,32 @@ async def search(request: SearchRequest, http_request: Request): @@ -136,15 +148,32 @@ async def search(request: SearchRequest, http_request: Request):
136 debug_info=result.debug_info 148 debug_info=result.debug_info
137 ) 149 )
138 150
139 - # Log complete response JSON 151 + response_payload = response.model_dump(mode="json")
  152 + response_json = json.dumps(response_payload, ensure_ascii=False, separators=(",", ":"))
  153 + response_digest = hashlib.sha256(response_json.encode("utf-8")).hexdigest()[:16]
  154 + max_score = float(response.max_score or 0.0)
  155 +
140 context.logger.info( 156 context.logger.info(
141 - "Search response | "  
142 - f"Total results: {response.total} | "  
143 - f"Max score: {response.max_score:.4f} | "  
144 - f"Time: {response.took_ms}ms | "  
145 - f"Response: {response.model_dump_json()}", 157 + "Search response | Total results: %s | Max score: %.4f | Time: %sms | payload_size: %s chars | digest: %s",
  158 + response.total,
  159 + max_score,
  160 + response.took_ms,
  161 + len(response_json),
  162 + response_digest,
146 extra={'reqid': context.reqid, 'uid': context.uid} 163 extra={'reqid': context.reqid, 'uid': context.uid}
147 ) 164 )
  165 + _log_backend_verbose({
  166 + "event": "search_response",
  167 + "reqid": context.reqid,
  168 + "uid": context.uid,
  169 + "tenant_id": tenant_id,
  170 + "total_results": response.total,
  171 + "max_score": max_score,
  172 + "took_ms": response.took_ms,
  173 + "payload_size_chars": len(response_json),
  174 + "sha256_16": response_digest,
  175 + "response": response_payload,
  176 + })
148 177
149 return response 178 return response
150 179
@@ -233,15 +262,32 @@ async def search_by_image(request: ImageSearchRequest, http_request: Request): @@ -233,15 +262,32 @@ async def search_by_image(request: ImageSearchRequest, http_request: Request):
233 performance_info=performance_summary 262 performance_info=performance_summary
234 ) 263 )
235 264
236 - # Log complete response JSON 265 + response_payload = response.model_dump(mode="json")
  266 + response_json = json.dumps(response_payload, ensure_ascii=False, separators=(",", ":"))
  267 + response_digest = hashlib.sha256(response_json.encode("utf-8")).hexdigest()[:16]
  268 + max_score = float(response.max_score or 0.0)
  269 +
237 context.logger.info( 270 context.logger.info(
238 - "Image search response | "  
239 - f"Total results: {response.total} | "  
240 - f"Max score: {response.max_score:.4f} | "  
241 - f"Time: {response.took_ms}ms | "  
242 - f"Response: {response.model_dump_json()}", 271 + "Image search response | Total results: %s | Max score: %.4f | Time: %sms | payload_size: %s chars | digest: %s",
  272 + response.total,
  273 + max_score,
  274 + response.took_ms,
  275 + len(response_json),
  276 + response_digest,
243 extra={'reqid': context.reqid, 'uid': context.uid} 277 extra={'reqid': context.reqid, 'uid': context.uid}
244 ) 278 )
  279 + _log_backend_verbose({
  280 + "event": "image_search_response",
  281 + "reqid": context.reqid,
  282 + "uid": context.uid,
  283 + "tenant_id": tenant_id,
  284 + "total_results": response.total,
  285 + "max_score": max_score,
  286 + "took_ms": response.took_ms,
  287 + "payload_size_chars": len(response_json),
  288 + "sha256_16": response_digest,
  289 + "response": response_payload,
  290 + })
245 291
246 return response 292 return response
247 293
docs/QUICKSTART.md
@@ -510,16 +510,11 @@ curl http://localhost:6007/health @@ -510,16 +510,11 @@ curl http://localhost:6007/health
510 510
511 ### 5.2 常看日志 511 ### 5.2 常看日志
512 512
513 -- `logs/backend.log`  
514 -- `logs/indexer.log`  
515 -- `logs/frontend.log`  
516 -- `logs/embedding.log`  
517 -- `logs/translator.log`  
518 -- `logs/reranker.log`  
519 -- `logs/tei.log`  
520 -- `logs/cnclip.log`  
521 -- `logs/search_engine.log`  
522 -- `logs/errors.log` 513 +- `logs/<service>-YYYY-MM-DD.log`(`service_ctl.sh` 按天写入的真实文件)
  514 +- `logs/<service>.log`(指向当天文件的软链,推荐 `tail -F`)
  515 +- `logs/api.log`(backend 进程内日志,按天轮转)
  516 +- `logs/backend_verbose.log`(backend 大对象详细日志,按天轮转)
  517 +- `logs/indexer.log`(索引结构化 JSON 日志,按天轮转)
523 518
524 ### 5.3 常用排障命令 519 ### 5.3 常用排障命令
525 520
docs/TEI_SERVICE说明文档.md
@@ -152,7 +152,7 @@ curl -sS -X POST "http://127.0.0.1:6005/embed/text" \ @@ -152,7 +152,7 @@
152 启动全套(含 TEI): 152 启动全套(含 TEI):
153 153
154 ```bash 154 ```bash
155 -TEI_DEVICE=cuda ./scripts/service_ctl.sh start embedding translator reranker tei cnclip 155 +TEI_DEVICE=cuda ./scripts/service_ctl.sh start tei cnclip embedding translator reranker
156 ``` 156 ```
157 157
158 仅启动 TEI: 158 仅启动 TEI:
docs/Usage-Guide.md
@@ -368,31 +368,23 @@ RERANKER_PORT=6007 @@ -368,31 +368,23 @@ RERANKER_PORT=6007
368 368
369 日志文件存储在 `logs/` 目录下: 369 日志文件存储在 `logs/` 目录下:
370 370
371 -- `logs/backend.log` - 后端服务日志  
372 -- `logs/indexer.log` - 索引服务日志  
373 -- `logs/frontend.log` - 前端服务日志  
374 -- `logs/embedding.log` - 向量服务日志(可选)  
375 -- `logs/translator.log` - 翻译服务日志(可选)  
376 -- `logs/reranker.log` - 重排服务日志(可选)  
377 -- `logs/tei.log` - TEI 启停日志(可选)  
378 -- `logs/cnclip.log` - CN-CLIP 启停日志(可选)  
379 -- `logs/search_engine.log` - 应用主日志(按天轮转)  
380 -- `logs/errors.log` - 错误日志(按天轮转) 371 +- `logs/<service>-YYYY-MM-DD.log` - `service_ctl.sh` 统一管理的按天日志文件(真实文件)
  372 +- `logs/<service>.log` - 指向当天日志文件的软链(兼容原有命令)
  373 +- `logs/api.log` - backend 进程内日志(按天轮转)
  374 +- `logs/backend_verbose.log` - backend 详细大对象日志(按天轮转,含 `Search response` / `ES query built` 完整内容)
  375 +- `logs/indexer.log` - 索引结构化日志(JSON,按天轮转)
381 376
382 ### 查看实时日志 377 ### 查看实时日志
383 378
384 ```bash 379 ```bash
385 -# 查看后端日志  
386 -tail -f logs/backend.log 380 +# 查看后端日志(建议 -F,跨天自动跟随新文件)
  381 +tail -F logs/backend.log
387 382
388 # 查看前端日志 383 # 查看前端日志
389 -tail -f logs/frontend.log 384 +tail -F logs/frontend.log
390 385
391 -# 查看应用主日志  
392 -tail -f logs/search_engine.log  
393 -  
394 -# 查看错误日志  
395 -tail -f logs/errors.log 386 +# 查看 backend 详细大对象日志
  387 +tail -F logs/backend_verbose.log
396 ``` 388 ```
397 389
398 ### 日志级别 390 ### 日志级别
@@ -406,7 +398,7 @@ LOG_LEVEL=DEBUG # DEBUG, INFO, WARNING, ERROR, CRITICAL @@ -406,7 +398,7 @@ LOG_LEVEL=DEBUG # DEBUG, INFO, WARNING, ERROR, CRITICAL
406 398
407 ### 日志轮转 399 ### 日志轮转
408 400
409 -日志文件按天自动轮转,保留30天的历史日志 401 +日志文件按天自动轮转,默认保留 30 天历史日志(可通过 `LOG_RETENTION_DAYS` 调整 `service_ctl.sh` 管理日志的保留天数)
410 402
411 --- 403 ---
412 404
embeddings/server.py
@@ -56,6 +56,29 @@ _TEXT_MICROBATCH_WINDOW_SEC = max( @@ -56,6 +56,29 @@ _TEXT_MICROBATCH_WINDOW_SEC = max(
56 _TEXT_REQUEST_TIMEOUT_SEC = max( 56 _TEXT_REQUEST_TIMEOUT_SEC = max(
57 1.0, float(os.getenv("TEXT_REQUEST_TIMEOUT_SEC", "30")) 57 1.0, float(os.getenv("TEXT_REQUEST_TIMEOUT_SEC", "30"))
58 ) 58 )
  59 +_LOG_PREVIEW_COUNT = max(1, int(os.getenv("EMBEDDING_LOG_PREVIEW_COUNT", "3")))
  60 +_LOG_TEXT_PREVIEW_CHARS = max(32, int(os.getenv("EMBEDDING_LOG_TEXT_PREVIEW_CHARS", "120")))
  61 +_LOG_IMAGE_PREVIEW_CHARS = max(32, int(os.getenv("EMBEDDING_LOG_IMAGE_PREVIEW_CHARS", "180")))
  62 +
  63 +
  64 +def _compact_preview(text: str, max_chars: int) -> str:
  65 + compact = " ".join((text or "").split())
  66 + if len(compact) <= max_chars:
  67 + return compact
  68 + return compact[:max_chars] + "..."
  69 +
  70 +
  71 +def _preview_inputs(items: List[str], max_items: int, max_chars: int) -> List[Dict[str, Any]]:
  72 + previews: List[Dict[str, Any]] = []
  73 + for idx, item in enumerate(items[:max_items]):
  74 + previews.append(
  75 + {
  76 + "idx": idx,
  77 + "len": len(item),
  78 + "preview": _compact_preview(item, max_chars),
  79 + }
  80 + )
  81 + return previews
59 82
60 83
61 def _encode_local_st(texts: List[str], normalize_embeddings: bool) -> Any: 84 def _encode_local_st(texts: List[str], normalize_embeddings: bool) -> Any:
@@ -292,6 +315,14 @@ def embed_text(texts: List[str], normalize: Optional[bool] = None) -> List[Optio @@ -292,6 +315,14 @@
292 raise HTTPException(status_code=400, detail=f"Invalid text at index {i}: empty string") 315 raise HTTPException(status_code=400, detail=f"Invalid text at index {i}: empty string")
293 normalized.append(s) 316 normalized.append(s)
294 317
  318 + logger.info(
  319 + "embed_text request | backend=%s inputs=%d normalize=%s preview=%s",
  320 + _text_backend_name,
  321 + len(normalized),
  322 + effective_normalize,
  323 + _preview_inputs(normalized, _LOG_PREVIEW_COUNT, _LOG_TEXT_PREVIEW_CHARS),
  324 + )
  325 +
295 t0 = time.perf_counter() 326 t0 = time.perf_counter()
296 try: 327 try:
297 # local_st backend uses in-process torch model, keep serialized encode for safety; 328 # local_st backend uses in-process torch model, keep serialized encode for safety;
@@ -301,10 +332,11 @@ def embed_text(texts: List[str], normalize: Optional[bool] = None) -&gt; List[Optio @@ -301,10 +332,11 @@ def embed_text(texts: List[str], normalize: Optional[bool] = None) -&gt; List[Optio
301 out = [_encode_single_text_with_microbatch(normalized[0], normalize=effective_normalize)] 332 out = [_encode_single_text_with_microbatch(normalized[0], normalize=effective_normalize)]
302 elapsed_ms = (time.perf_counter() - t0) * 1000.0 333 elapsed_ms = (time.perf_counter() - t0) * 1000.0
303 logger.info( 334 logger.info(
304 - "embed_text done | backend=%s mode=microbatch-single inputs=%d normalize=%s elapsed_ms=%.2f", 335 + "embed_text done | backend=%s mode=microbatch-single inputs=%d normalize=%s dim=%d elapsed_ms=%.2f",
305 _text_backend_name, 336 _text_backend_name,
306 len(normalized), 337 len(normalized),
307 effective_normalize, 338 effective_normalize,
  339 + len(out[0]) if out and out[0] is not None else 0,
308 elapsed_ms, 340 elapsed_ms,
309 ) 341 )
310 return out 342 return out
@@ -335,10 +367,11 @@ def embed_text(texts: List[str], normalize: Optional[bool] = None) -&gt; List[Optio @@ -335,10 +367,11 @@ def embed_text(texts: List[str], normalize: Optional[bool] = None) -&gt; List[Optio
335 out.append(vec) 367 out.append(vec)
336 elapsed_ms = (time.perf_counter() - t0) * 1000.0 368 elapsed_ms = (time.perf_counter() - t0) * 1000.0
337 logger.info( 369 logger.info(
338 - "embed_text done | backend=%s inputs=%d normalize=%s elapsed_ms=%.2f", 370 + "embed_text done | backend=%s inputs=%d normalize=%s dim=%d elapsed_ms=%.2f",
339 _text_backend_name, 371 _text_backend_name,
340 len(normalized), 372 len(normalized),
341 effective_normalize, 373 effective_normalize,
  374 + len(out[0]) if out and out[0] is not None else 0,
342 elapsed_ms, 375 elapsed_ms,
343 ) 376 )
344 return out 377 return out
@@ -358,6 +391,14 @@ def embed_image(images: List[str], normalize: Optional[bool] = None) -> List[Opt @@ -358,6 +391,14 @@
358 raise HTTPException(status_code=400, detail=f"Invalid image at index {i}: empty URL/path") 391 raise HTTPException(status_code=400, detail=f"Invalid image at index {i}: empty URL/path")
359 urls.append(s) 392 urls.append(s)
360 393
  394 + logger.info(
  395 + "embed_image request | inputs=%d normalize=%s preview=%s",
  396 + len(urls),
  397 + effective_normalize,
  398 + _preview_inputs(urls, _LOG_PREVIEW_COUNT, _LOG_IMAGE_PREVIEW_CHARS),
  399 + )
  400 +
  401 + t0 = time.perf_counter()
361 with _image_encode_lock: 402 with _image_encode_lock:
362 vectors = _image_model.encode_image_urls( 403 vectors = _image_model.encode_image_urls(
363 urls, 404 urls,
@@ -375,4 +416,12 @@ def embed_image(images: List[str], normalize: Optional[bool] = None) -&gt; List[Opt @@ -375,4 +416,12 @@ def embed_image(images: List[str], normalize: Optional[bool] = None) -&gt; List[Opt
375 if out_vec is None: 416 if out_vec is None:
376 raise RuntimeError(f"Image model returned empty embedding for index {i}") 417 raise RuntimeError(f"Image model returned empty embedding for index {i}")
377 out.append(out_vec) 418 out.append(out_vec)
  419 + elapsed_ms = (time.perf_counter() - t0) * 1000.0
  420 + logger.info(
  421 + "embed_image done | inputs=%d normalize=%s dim=%d elapsed_ms=%.2f",
  422 + len(urls),
  423 + effective_normalize,
  424 + len(out[0]) if out and out[0] is not None else 0,
  425 + elapsed_ms,
  426 + )
378 return out 427 return out
reranker/server.py
@@ -9,6 +9,7 @@ Backend selected via config: services.rerank.backend (bge | qwen3_vllm), env RER @@ -9,6 +9,7 @@ Backend selected via config: services.rerank.backend (bge | qwen3_vllm), env RER
9 """ 9 """
10 10
11 import logging 11 import logging
  12 +import os
12 import time 13 import time
13 from typing import Any, Dict, List, Optional 14 from typing import Any, Dict, List, Optional
14 15
@@ -29,6 +30,28 @@ app = FastAPI(title="saas-search Reranker Service", version="1.0.0") @@ -29,6 +30,28 @@
29 30
30 _reranker: Optional[RerankBackendProtocol] = None 31 _reranker: Optional[RerankBackendProtocol] = None
31 _backend_name: str = "" 32 _backend_name: str = ""
  33 +_LOG_DOC_PREVIEW_COUNT = max(1, int(os.getenv("RERANK_LOG_DOC_PREVIEW_COUNT", "3")))
  34 +_LOG_TEXT_PREVIEW_CHARS = max(32, int(os.getenv("RERANK_LOG_TEXT_PREVIEW_CHARS", "180")))
  35 +
  36 +
  37 +def _compact_preview(text: str, max_chars: int) -> str:
  38 + compact = " ".join((text or "").split())
  39 + if len(compact) <= max_chars:
  40 + return compact
  41 + return compact[:max_chars] + "..."
  42 +
  43 +
  44 +def _preview_docs(docs: List[str], max_items: int, max_chars: int) -> List[Dict[str, Any]]:
  45 + previews: List[Dict[str, Any]] = []
  46 + for idx, doc in enumerate(docs[:max_items]):
  47 + previews.append(
  48 + {
  49 + "idx": idx,
  50 + "len": len(doc),
  51 + "preview": _compact_preview(doc, max_chars),
  52 + }
  53 + )
  54 + return previews
32 55
33 56
34 class RerankRequest(BaseModel): 57 class RerankRequest(BaseModel):
@@ -100,19 +123,25 @@ def rerank(request: RerankRequest) -> RerankResponse: @@ -100,19 +123,25 @@
100 123
101 start_ts = time.time() 124 start_ts = time.time()
102 logger.info( 125 logger.info(
103 - "Rerank request | docs=%d normalize=%s", 126 + "Rerank request | docs=%d normalize=%s query_len=%d query=%r doc_preview=%s",
104 len(request.docs), 127 len(request.docs),
105 normalize, 128 normalize,
  129 + len(query),
  130 + _compact_preview(query, _LOG_TEXT_PREVIEW_CHARS),
  131 + _preview_docs(request.docs, _LOG_DOC_PREVIEW_COUNT, _LOG_TEXT_PREVIEW_CHARS),
106 ) 132 )
107 scores, meta = _reranker.score_with_meta(query, request.docs, normalize=normalize) 133 scores, meta = _reranker.score_with_meta(query, request.docs, normalize=normalize)
108 meta = dict(meta) 134 meta = dict(meta)
109 meta.update({"service_elapsed_ms": round((time.time() - start_ts) * 1000.0, 3)}) 135 meta.update({"service_elapsed_ms": round((time.time() - start_ts) * 1000.0, 3)})
  136 + score_preview = [round(float(s), 6) for s in scores[:_LOG_DOC_PREVIEW_COUNT]]
110 logger.info( 137 logger.info(
111 - "Rerank done | docs=%d unique=%s dedup=%s elapsed_ms=%s", 138 + "Rerank done | docs=%d unique=%s dedup=%s elapsed_ms=%s query=%r score_preview=%s",
112 meta.get("input_docs"), 139 meta.get("input_docs"),
113 meta.get("unique_docs"), 140 meta.get("unique_docs"),
114 meta.get("dedup_ratio"), 141 meta.get("dedup_ratio"),
115 meta.get("service_elapsed_ms"), 142 meta.get("service_elapsed_ms"),
  143 + _compact_preview(query, _LOG_TEXT_PREVIEW_CHARS),
  144 + score_preview,
116 ) 145 )
117 146
118 return RerankResponse(scores=scores, meta=meta) 147 return RerankResponse(scores=scores, meta=meta)
scripts/daily_log_router.sh 0 → 100755
@@ -0,0 +1,56 @@ @@ -0,0 +1,56 @@
#!/bin/bash
#
# Route an incoming log stream (stdin) into per-day files:
#   <log_dir>/<service>-YYYY-MM-DD.log
# keeping <log_dir>/<service>.log as a symlink to the current day's file
# and pruning dated files older than the retention window.
#
# Usage:
#   command 2>&1 | ./scripts/daily_log_router.sh <service> <log_dir> [retention_days]
#

set -euo pipefail

if [ "$#" -lt 2 ]; then
    echo "Usage: $0 <service> <log_dir> [retention_days]" >&2
    exit 1
fi

SERVICE_NAME="$1"
LOG_DIR="$2"
RETENTION_DAYS="${3:-30}"

mkdir -p "${LOG_DIR}"

# NOTE(review): strftime() is a gawk extension, not POSIX awk -- confirm the
# deployment hosts ship gawk (busybox awk also provides it).
awk -v dir="${LOG_DIR}" -v service="${SERVICE_NAME}" -v retention_days="${RETENTION_DAYS}" '
# Path of the dated log file for a given day.
function rotate_file(day) {
    return sprintf("%s/%s-%s.log", dir, service, day)
}

# Repoint <service>.log at the dated file (relative target, ln -sfn replaces).
function update_symlink(day) {
    cmd = sprintf("ln -sfn \"%s-%s.log\" \"%s/%s.log\"", service, day, dir, service)
    system(cmd)
}

# Best-effort pruning of dated files older than the retention window.
function cleanup_old_logs() {
    cmd = sprintf("find \"%s\" -maxdepth 1 -type f -name \"%s-*.log\" -mtime +%d -delete >/dev/null 2>&1", dir, service, retention_days)
    system(cmd)
}

{
    day = strftime("%Y-%m-%d")
    target = rotate_file(day)

    if (target != current_target) {
        # Day changed (or first line seen): close the previous output file
        # so its descriptor is released before switching, then refresh the
        # symlink and prune expired files.
        if (current_target != "") {
            close(current_target)
        }
        update_symlink(day)
        cleanup_old_logs()
        current_target = target
    }

    print >> current_target
    fflush(current_target)
}

END {
    if (current_target != "") {
        close(current_target)
    }
}
'
scripts/service_ctl.sh
@@ -8,6 +8,7 @@ set -euo pipefail @@ -8,6 +8,7 @@ set -euo pipefail
8 8
9 PROJECT_ROOT="$(cd "$(dirname "$0")/.." && pwd)" 9 PROJECT_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
10 LOG_DIR="${PROJECT_ROOT}/logs" 10 LOG_DIR="${PROJECT_ROOT}/logs"
  11 +LOG_RETENTION_DAYS="${LOG_RETENTION_DAYS:-30}"
11 12
12 mkdir -p "${LOG_DIR}" 13 mkdir -p "${LOG_DIR}"
13 14
@@ -46,6 +47,16 @@ log_file() { @@ -46,6 +47,16 @@ log_file() {
46 echo "${LOG_DIR}/${service}.log" 47 echo "${LOG_DIR}/${service}.log"
47 } 48 }
48 49
# Ensure today's dated log file exists and repoint the stable
# <service>.log symlink at it (so `tail -F logs/<service>.log` follows).
prepare_daily_log_target() {
  local service="$1"
  local dated_file
  dated_file="${LOG_DIR}/${service}-$(date +%F).log"
  touch "${dated_file}"
  ln -sfn "$(basename "${dated_file}")" "$(log_file "${service}")"
}
  59 +
49 service_start_cmd() { 60 service_start_cmd() {
50 local service="$1" 61 local service="$1"
51 case "${service}" in 62 case "${service}" in
@@ -176,6 +187,7 @@ start_one() { @@ -176,6 +187,7 @@ start_one() {
176 local pf lf 187 local pf lf
177 pf="$(pid_file "${service}")" 188 pf="$(pid_file "${service}")"
178 lf="$(log_file "${service}")" 189 lf="$(log_file "${service}")"
  190 + prepare_daily_log_target "${service}"
179 191
180 if [ "${service}" != "tei" ]; then 192 if [ "${service}" != "tei" ]; then
181 if is_running_by_pid "${service}" || is_running_by_port "${service}"; then 193 if is_running_by_pid "${service}" || is_running_by_port "${service}"; then
@@ -203,12 +215,12 @@ start_one() { @@ -203,12 +215,12 @@ start_one() {
203 cnclip|tei) 215 cnclip|tei)
204 echo "[start] ${service} (managed by native script)" 216 echo "[start] ${service} (managed by native script)"
205 if [ "${service}" = "cnclip" ]; then 217 if [ "${service}" = "cnclip" ]; then
206 - if ! CNCLIP_DEVICE="${CNCLIP_DEVICE:-cuda}" bash -lc "${cmd}" >> "${lf}" 2>&1; then 218 + if ! CNCLIP_DEVICE="${CNCLIP_DEVICE:-cuda}" "${cmd}" > >("${PROJECT_ROOT}/scripts/daily_log_router.sh" "${service}" "${LOG_DIR}" "${LOG_RETENTION_DAYS}") 2>&1; then
207 echo "[error] ${service} start script failed, inspect ${lf}" >&2 219 echo "[error] ${service} start script failed, inspect ${lf}" >&2
208 return 1 220 return 1
209 fi 221 fi
210 else 222 else
211 - if ! bash -lc "${cmd}" >> "${lf}" 2>&1; then 223 + if ! "${cmd}" > >("${PROJECT_ROOT}/scripts/daily_log_router.sh" "${service}" "${LOG_DIR}" "${LOG_RETENTION_DAYS}") 2>&1; then
212 echo "[error] ${service} start script failed, inspect ${lf}" >&2 224 echo "[error] ${service} start script failed, inspect ${lf}" >&2
213 return 1 225 return 1
214 fi 226 fi
@@ -229,7 +241,7 @@ start_one() { @@ -229,7 +241,7 @@ start_one() {
229 ;; 241 ;;
230 backend|indexer|frontend|embedding|translator) 242 backend|indexer|frontend|embedding|translator)
231 echo "[start] ${service}" 243 echo "[start] ${service}"
232 - nohup bash -lc "${cmd}" > "${lf}" 2>&1 & 244 + nohup "${cmd}" > >("${PROJECT_ROOT}/scripts/daily_log_router.sh" "${service}" "${LOG_DIR}" "${LOG_RETENTION_DAYS}") 2>&1 &
233 local pid=$! 245 local pid=$!
234 echo "${pid}" > "${pf}" 246 echo "${pid}" > "${pf}"
235 local retries=30 247 local retries=30
@@ -244,7 +256,7 @@ start_one() { @@ -244,7 +256,7 @@ start_one() {
244 echo "[start] ${service}" 256 echo "[start] ${service}"
245 # Start reranker directly so pid file points to the script process that 257 # Start reranker directly so pid file points to the script process that
246 # will exec uvicorn, avoiding extra shell wrapper lifecycle edge-cases. 258 # will exec uvicorn, avoiding extra shell wrapper lifecycle edge-cases.
247 - nohup "${cmd}" > "${lf}" 2>&1 & 259 + nohup "${cmd}" > >("${PROJECT_ROOT}/scripts/daily_log_router.sh" "${service}" "${LOG_DIR}" "${LOG_RETENTION_DAYS}") 2>&1 &
248 local pid=$! 260 local pid=$!
249 echo "${pid}" > "${pf}" 261 echo "${pid}" > "${pf}"
250 if wait_for_health "${service}" 90; then 262 if wait_for_health "${service}" 90; then
@@ -408,9 +420,12 @@ Default target set (when no service provided): @@ -408,9 +420,12 @@ Default target set (when no service provided):
408 status -> all known services 420 status -> all known services
409 421
410 Optional service startup: 422 Optional service startup:
411 - ./scripts/service_ctl.sh start embedding translator reranker tei cnclip 423 + ./scripts/service_ctl.sh start tei cnclip embedding translator reranker
412 TEI_DEVICE=cuda|cpu ./scripts/service_ctl.sh start tei 424 TEI_DEVICE=cuda|cpu ./scripts/service_ctl.sh start tei
413 CNCLIP_DEVICE=cuda|cpu ./scripts/service_ctl.sh start cnclip 425 CNCLIP_DEVICE=cuda|cpu ./scripts/service_ctl.sh start cnclip
  426 +
  427 +Log retention:
  428 + LOG_RETENTION_DAYS=30 ./scripts/service_ctl.sh start
414 EOF 429 EOF
415 } 430 }
416 431
@@ -12,11 +12,11 @@ echo "saas-search 服务启动" @@ -12,11 +12,11 @@
12 echo "========================================" 12 echo "========================================"
13 echo "默认启动核心服务: backend/indexer/frontend" 13 echo "默认启动核心服务: backend/indexer/frontend"
14 echo "可选服务请显式指定:" 14 echo "可选服务请显式指定:"
15 -echo " ./scripts/service_ctl.sh start embedding translator reranker tei cnclip" 15 +echo " ./scripts/service_ctl.sh start tei cnclip embedding translator reranker"
16 echo 16 echo
17 17
18 ./scripts/service_ctl.sh start 18 ./scripts/service_ctl.sh start
19 19
20 echo 20 echo
21 echo "当前服务状态:" 21 echo "当前服务状态:"
22 -./scripts/service_ctl.sh status backend indexer frontend embedding translator reranker tei cnclip 22 +./scripts/service_ctl.sh status backend indexer frontend tei cnclip embedding translator reranker
scripts/start_cnclip_service.sh
@@ -43,7 +43,9 @@ PROJECT_ROOT="$(cd "$(dirname "$0")/.." && pwd)" @@ -43,7 +43,9 @@
43 CLIP_SERVER_DIR="${PROJECT_ROOT}/third-party/clip-as-service/server" 43 CLIP_SERVER_DIR="${PROJECT_ROOT}/third-party/clip-as-service/server"
44 LOG_DIR="${PROJECT_ROOT}/logs" 44 LOG_DIR="${PROJECT_ROOT}/logs"
45 PID_FILE="${LOG_DIR}/cnclip.pid" 45 PID_FILE="${LOG_DIR}/cnclip.pid"
46 -LOG_FILE="${LOG_DIR}/cnclip.log" 46 +LOG_LINK="${LOG_DIR}/cnclip.log"
  47 +LOG_FILE="${LOG_DIR}/cnclip-$(date +%F).log"
  48 +LOG_ROUTER_SCRIPT="${PROJECT_ROOT}/scripts/daily_log_router.sh"
47 49
48 # 帮助信息 50 # 帮助信息
49 show_help() { 51 show_help() {
@@ -128,6 +130,11 @@ fi @@ -128,6 +130,11 @@ fi
128 130
129 # 创建日志目录 131 # 创建日志目录
130 mkdir -p "${LOG_DIR}" 132 mkdir -p "${LOG_DIR}"
  133 +if [ ! -x "${LOG_ROUTER_SCRIPT}" ]; then
  134 + echo -e "${RED}错误: 日志路由脚本不存在或不可执行: ${LOG_ROUTER_SCRIPT}${NC}"
  135 + exit 1
  136 +fi
  137 +ln -sfn "$(basename "${LOG_FILE}")" "${LOG_LINK}"
131 138
132 # 检查是否已经有服务在运行 139 # 检查是否已经有服务在运行
133 if [ -f "${PID_FILE}" ]; then 140 if [ -f "${PID_FILE}" ]; then
@@ -208,7 +215,7 @@ echo &quot; 服务端口: ${PORT}&quot; @@ -208,7 +215,7 @@ echo &quot; 服务端口: ${PORT}&quot;
208 echo " 协议: gRPC (默认,官方推荐)" 215 echo " 协议: gRPC (默认,官方推荐)"
209 echo " 其他参数: 使用官方默认值" 216 echo " 其他参数: 使用官方默认值"
210 echo " 副本数: ${REPLICAS}" 217 echo " 副本数: ${REPLICAS}"
211 -echo " 日志文件: ${LOG_FILE}" 218 +echo " 日志文件: ${LOG_LINK}"
212 echo "" 219 echo ""
213 220
214 # 副本数显存警告 221 # 副本数显存警告
@@ -263,9 +270,9 @@ EOF @@ -263,9 +270,9 @@ EOF
263 270
264 echo -e "${GREEN}✓ 已生成配置文件: ${TEMP_FLOW_FILE}${NC}" 271 echo -e "${GREEN}✓ 已生成配置文件: ${TEMP_FLOW_FILE}${NC}"
265 272
266 -# 使用 nohup 在后台启动服务 273 +# 使用 nohup 在后台启动服务(日志按天分流)
267 cd "${CLIP_SERVER_DIR}" 274 cd "${CLIP_SERVER_DIR}"
268 -nohup python -m clip_server "${TEMP_FLOW_FILE}" > "${LOG_FILE}" 2>&1 & 275 +nohup bash -lc "exec python -m clip_server \"${TEMP_FLOW_FILE}\" > >(\"${LOG_ROUTER_SCRIPT}\" \"cnclip\" \"${LOG_DIR}\" \"${LOG_RETENTION_DAYS:-30}\") 2>&1" >/dev/null 2>&1 &
269 276
270 # 保存 PID 277 # 保存 PID
271 SERVICE_PID=$! 278 SERVICE_PID=$!
@@ -293,7 +300,7 @@ if ps -p ${SERVICE_PID} > /dev/null 2>&1; then @@ -293,7 +300,7 @@
293 echo -e " r = c.encode(['测试文本'])" 300 echo -e " r = c.encode(['测试文本'])"
294 echo "" 301 echo ""
295 echo -e "查看日志:" 302 echo -e "查看日志:"
296 - echo -e " tail -f ${LOG_FILE}" 303 + echo -e " tail -F ${LOG_LINK}"
297 echo "" 304 echo ""
298 echo -e "停止服务:" 305 echo -e "停止服务:"
299 echo -e " ./scripts/stop_cnclip_service.sh" 306 echo -e " ./scripts/stop_cnclip_service.sh"
@@ -303,7 +310,7 @@ if ps -p ${SERVICE_PID} &gt; /dev/null 2&gt;&amp;1; then @@ -303,7 +310,7 @@ if ps -p ${SERVICE_PID} &gt; /dev/null 2&gt;&amp;1; then
303 echo -e "${YELLOW}等待模型加载完成(约30-60秒)...${NC}" 310 echo -e "${YELLOW}等待模型加载完成(约30-60秒)...${NC}"
304 sleep 30 311 sleep 30
305 echo -e "${GREEN}✓ 服务已启动,请查看日志确认模型是否加载完成${NC}" 312 echo -e "${GREEN}✓ 服务已启动,请查看日志确认模型是否加载完成${NC}"
306 - echo -e "${YELLOW}查看日志: tail -f ${LOG_FILE}${NC}" 313 + echo -e "${YELLOW}查看日志: tail -F ${LOG_LINK}${NC}"
307 314
308 else 315 else
309 echo -e "${RED}========================================${NC}" 316 echo -e "${RED}========================================${NC}"
@@ -311,7 +318,7 @@ else @@ -311,7 +318,7 @@ else
311 echo -e "${RED}========================================${NC}" 318 echo -e "${RED}========================================${NC}"
312 echo "" 319 echo ""
313 echo -e "请查看日志获取详细错误信息:" 320 echo -e "请查看日志获取详细错误信息:"
314 - echo -e " tail -f ${LOG_FILE}" 321 + echo -e " tail -F ${LOG_LINK}"
315 echo "" 322 echo ""
316 rm -f "${PID_FILE}" 323 rm -f "${PID_FILE}"
317 exit 1 324 exit 1
search/searcher.py
@@ -8,6 +8,7 @@ from typing import Dict, Any, List, Optional, Union @@ -8,6 +8,7 @@ from typing import Dict, Any, List, Optional, Union
8 import os 8 import os
9 import time, json 9 import time, json
10 import logging 10 import logging
  11 +import hashlib
11 12
12 from utils.es_client import ESClient 13 from utils.es_client import ESClient
13 from query import QueryParser, ParsedQuery 14 from query import QueryParser, ParsedQuery
@@ -21,6 +22,15 @@ from api.result_formatter import ResultFormatter @@ -21,6 +22,15 @@ from api.result_formatter import ResultFormatter
21 from indexer.mapping_generator import get_tenant_index_name 22 from indexer.mapping_generator import get_tenant_index_name
22 23
23 logger = logging.getLogger(__name__) 24 logger = logging.getLogger(__name__)
  25 +backend_verbose_logger = logging.getLogger("backend.verbose")
  26 +
  27 +
  28 +def _log_backend_verbose(payload: Dict[str, Any]) -> None:
  29 + if not backend_verbose_logger.handlers:
  30 + return
  31 + backend_verbose_logger.info(
  32 + json.dumps(payload, ensure_ascii=False, separators=(",", ":"))
  33 + )
24 34
25 35
26 class SearchResult: 36 class SearchResult:
@@ -320,16 +330,33 @@ class Searcher: @@ -320,16 +330,33 @@ class Searcher:
320 context.store_intermediate_result('es_query', es_query) 330 context.store_intermediate_result('es_query', es_query)
321 context.store_intermediate_result('es_body_for_search', body_for_es) 331 context.store_intermediate_result('es_body_for_search', body_for_es)
322 332
323 - # Serialize ES query as a compact JSON string (no spaces or newlines)  
324 - es_query_compact = json.dumps(es_query, ensure_ascii=False, separators=(',', ':')) 333 + # Serialize ES query to compute a compact size + stable digest for correlation
  334 + es_query_compact = json.dumps(es_query, ensure_ascii=False, separators=(",", ":"))
  335 + es_query_digest = hashlib.sha256(es_query_compact.encode("utf-8")).hexdigest()[:16]
  336 + knn_enabled = bool(enable_embedding and parsed_query.query_vector is not None)
  337 + vector_dims = int(len(parsed_query.query_vector)) if parsed_query.query_vector is not None else 0
325 338
326 context.logger.info( 339 context.logger.info(
327 - f"ES query built | size: {len(es_query_compact)} chars | "  
328 - f"KNN: {'yes' if enable_embedding and parsed_query.query_vector is not None else 'no'} | "  
329 - f"facets: {'yes' if facets else 'no'} | "  
330 - f"query: {es_query_compact}", 340 + "ES query built | size: %s chars | digest: %s | KNN: %s | vector_dims: %s | facets: %s",
  341 + len(es_query_compact),
  342 + es_query_digest,
  343 + "yes" if knn_enabled else "no",
  344 + vector_dims,
  345 + "yes" if facets else "no",
331 extra={'reqid': context.reqid, 'uid': context.uid} 346 extra={'reqid': context.reqid, 'uid': context.uid}
332 ) 347 )
  348 + _log_backend_verbose({
  349 + "event": "es_query_built",
  350 + "reqid": context.reqid,
  351 + "uid": context.uid,
  352 + "tenant_id": tenant_id,
  353 + "size_chars": len(es_query_compact),
  354 + "sha256_16": es_query_digest,
  355 + "knn_enabled": knn_enabled,
  356 + "vector_dims": vector_dims,
  357 + "has_facets": bool(facets),
  358 + "query": es_query,
  359 + })
333 except Exception as e: 360 except Exception as e:
334 context.set_error(e) 361 context.set_error(e)
335 context.logger.error( 362 context.logger.error(