product_annotator

tangwang
1 parent 77ab67ad
Showing 9 changed files with 34 additions and 25 deletions Show diff stats
api/routes/indexer.py
config/config.yaml
docs/搜索API对接指南.md
indexer/ANCHORS_AND_SEMANTIC_ATTRIBUTES.md
indexer/document_transformer.py
indexer/process_products.py -> indexer/product_annotator.py
scripts/service_ctl.sh
tests/ci/test_service_api_contracts.py
tests/test_process_products_batching.py
@@ -443,10 +443,10 @@ async def build_docs_from_db(request: BuildDocsFromDbRequest):
  
 def _run_enrich_content(tenant_id: str, items: List[Dict[str, str]], languages: List[str]) -> List[Dict[str, Any]]:
     """
-    同步执行内容理解：调用 process_products.analyze_products，按语言批量跑 LLM，
+    同步执行内容理解：调用 product_annotator.analyze_products，按语言批量跑 LLM，
     再聚合成每 SPU 的 qanchors、semantic_attributes、tags。供 run_in_executor 调用。
     """
-    from indexer.process_products import analyze_products, SUPPORTED_LANGS
+    from indexer.product_annotator import analyze_products, SUPPORTED_LANGS
  
     llm_langs = [lang for lang in languages if lang in SUPPORTED_LANGS]
     if not llm_langs:
@@ -544,7 +544,7 @@ async def enrich_content(request: EnrichContentRequest):
     - 与 /indexer/build-docs 解耦，避免 build-docs 因 LLM 耗时过长而阻塞；调用方可
       先拿不含 qanchors/tags 的 doc，再异步或离线补齐本接口结果后更新 ES。
  
-    实现逻辑与 indexer.process_products.analyze_products 一致，支持多语言与 Redis 缓存。
+    实现逻辑与 indexer.product_annotator.analyze_products 一致，支持多语言与 Redis 缓存。
     """
     try:
         if not request.items:
@@ -161,7 +161,7 @@ services:
         base_url: "http://127.0.0.1:6007"
         service_url: "http://127.0.0.1:6007/rerank"
     # 服务内后端（reranker 进程启动时读取）
-    backend: "dashscope_rerank"  # bge | qwen3_vllm | qwen3_transformers | dashscope_rerank
+    backend: "qwen3_vllm"  # bge | qwen3_vllm | qwen3_transformers | dashscope_rerank
     backends:
       bge:
         model_name: "BAAI/bge-reranker-v2-m3"
@@ -1484,7 +1484,7 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \
 ### 5.8 内容理解字段生成接口
  
 - **端点**: `POST /indexer/enrich-content`
-- **描述**: 根据商品内容信息批量生成 **qanchors**（锚文本）、**semantic_attributes**（语义属性）、**tags**（细分标签），供外部 indexer 在「微服务组合」方式下自行拼装 doc 时使用。请求以 `items[]` 传入商品内容字段（必填/可选见下表）。内部逻辑与 `indexer.process_products` 一致，支持多语言与 Redis 缓存；单次请求在线程池中执行，避免阻塞其他接口。
+- **描述**: 根据商品内容信息批量生成 **qanchors**（锚文本）、**semantic_attributes**（语义属性）、**tags**（细分标签），供外部 indexer 在「微服务组合」方式下自行拼装 doc 时使用。请求以 `items[]` 传入商品内容字段（必填/可选见下表）。内部逻辑与 `indexer.product_annotator` 一致，支持多语言与 Redis 缓存；单次请求在线程池中执行，避免阻塞其他接口。
  
 #### 请求参数
  
@@ -82,14 +82,14 @@
  
 ---
  
-### 2. LLM 分析服务：`indexer/process_products.py`
+### 2. LLM 分析服务：`indexer/product_annotator.py`
  
 #### 2.1 入口函数：`analyze_products`
  
-- **文件**：`indexer/process_products.py`
+- **文件**：`indexer/product_annotator.py`
 - **函数签名**：
  
-```365:392:/home/tw/saas-search/indexer/process_products.py
+```365:392:/home/tw/saas-search/indexer/product_annotator.py
 def analyze_products(
     products: List[Dict[str, str]],
     target_lang: str = "zh",
@@ -108,7 +108,7 @@ def analyze_products(
  
 - **支持的输出语言**（在同文件中定义）：
  
-```54:62:/home/tw/saas-search/indexer/process_products.py
+```54:62:/home/tw/saas-search/indexer/product_annotator.py
 LANG_LABELS: Dict[str, str] = {
     "zh": "中文",
     "en": "英文",
@@ -148,7 +148,7 @@ SUPPORTED_LANGS = set(LANG_LABELS.keys())
  
 - Prompt 中会明确要求“**所有输出内容使用目标语言**”，并给出中英文示例：
  
-```65:81:/home/tw/saas-search/indexer/process_products.py
+```65:81:/home/tw/saas-search/indexer/product_annotator.py
 def create_prompt(products: List[Dict[str, str]], target_lang: str = "zh") -> str:
     """创建LLM提示词（根据目标语言输出）"""
     lang_label = LANG_LABELS.get(target_lang, "对应语言")
@@ -170,7 +170,7 @@ def create_prompt(products: List[Dict[str, str]], target_lang: str = "zh") -> st
  
 - 返回格式固定为 Markdown 表格，首行头为：
  
-```89:91:/home/tw/saas-search/indexer/process_products.py
+```89:91:/home/tw/saas-search/indexer/product_annotator.py
 | 序号 | 商品标题 | 品类路径 | 细分标签 | 适用人群 | 使用场景 | 适用季节 | 关键属性 | 材质说明 | 功能特点 | 商品卖点 | 锚文本 |
 |----|----|----|----|----|----|----|----|----|----|----|----|
 ```
@@ -14,7 +14,7 @@ import logging
 import re
 from typing import Dict, Any, Optional, List
 from config import ConfigLoader
-from indexer.process_products import analyze_products, SUPPORTED_LANGS
+from indexer.product_annotator import analyze_products, SUPPORTED_LANGS
  
 logger = logging.getLogger(__name__)
  
@@ -641,7 +641,7 @@ class SPUDocumentTransformer:
  
     def _fill_llm_attributes(self, doc: Dict[str, Any], spu_row: pd.Series) -> None:
         """
-        调用 indexer.process_products.analyze_products，为当前 SPU 填充：
+        调用 indexer.product_annotator.analyze_products，为当前 SPU 填充：
         - qanchors.{lang}
         - semantic_attributes (lang/name/value)
         """
@@ -43,20 +43,28 @@ OUTPUT_DIR = Path("output_logs")
 OUTPUT_FILE = OUTPUT_DIR / "products_analyzed.csv"
 LOG_DIR = OUTPUT_DIR / "logs"
  
-# 设置日志
+# 设置独立日志（不影响全局 indexer.log）
 LOG_DIR.mkdir(parents=True, exist_ok=True)
 timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
 log_file = LOG_DIR / f"process_{timestamp}.log"
  
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(levelname)s - %(message)s',
-    handlers=[
-        logging.FileHandler(log_file, encoding='utf-8'),
-        logging.StreamHandler()
-    ]
-)
-logger = logging.getLogger(__name__)
+logger = logging.getLogger("product_annotator")
+logger.setLevel(logging.INFO)
+
+if not logger.handlers:
+    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
+
+    file_handler = logging.FileHandler(log_file, encoding="utf-8")
+    file_handler.setFormatter(formatter)
+
+    stream_handler = logging.StreamHandler()
+    stream_handler.setFormatter(formatter)
+
+    logger.addHandler(file_handler)
+    logger.addHandler(stream_handler)
+
+    # 避免日志向根 logger 传播，防止写入 logs/indexer.log 等其他文件
+    logger.propagate = False
  
  
 # Redis 缓存（用于 anchors / 语义属性）
@@ -868,6 +868,7 @@ Special targets:
  
 Examples:
   ./scripts/service_ctl.sh up all
+  ./scripts/service_ctl.sh up tei cnclip embedding translator reranker
   ./scripts/service_ctl.sh up backend indexer frontend
   ./scripts/service_ctl.sh restart
   ./scripts/service_ctl.sh monitor-start all
@@ -342,7 +342,7 @@ def test_indexer_build_docs_from_db_contract(indexer_client: TestClient):
  
  
 def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch):
-    import indexer.process_products as process_products
+    import indexer.product_annotator as process_products
  
     def _fake_analyze_products(
         products: List[Dict[str, str]],
@@ -2,7 +2,7 @@ from __future__ import annotations
  
 from typing import Any, Dict, List
  
-import indexer.process_products as process_products
+import indexer.product_annotator as process_products
  
  
 def _mk_products(n: int) -> List[Dict[str, str]]: