Commit 22ae00c78124374e37763045a0b9dd35915c9058

Authored by tangwang
1 parent 77ab67ad

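Rename indexer/process_products.py to indexer/product_annotator.py and update all references (API routes, document transformer, docs, tests); give product_annotator its own non-propagating logger; switch the reranker backend to qwen3_vllm; add a service_ctl.sh usage example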

api/routes/indexer.py
... ... @@ -443,10 +443,10 @@ async def build_docs_from_db(request: BuildDocsFromDbRequest):
443 443  
444 444 def _run_enrich_content(tenant_id: str, items: List[Dict[str, str]], languages: List[str]) -> List[Dict[str, Any]]:
445 445 """
446   - 同步执行内容理解:调用 process_products.analyze_products,按语言批量跑 LLM,
  446 + 同步执行内容理解:调用 product_annotator.analyze_products,按语言批量跑 LLM,
447 447 再聚合成每 SPU 的 qanchors、semantic_attributes、tags。供 run_in_executor 调用。
448 448 """
449   - from indexer.process_products import analyze_products, SUPPORTED_LANGS
  449 + from indexer.product_annotator import analyze_products, SUPPORTED_LANGS
450 450  
451 451 llm_langs = [lang for lang in languages if lang in SUPPORTED_LANGS]
452 452 if not llm_langs:
... ... @@ -544,7 +544,7 @@ async def enrich_content(request: EnrichContentRequest):
544 544 - 与 /indexer/build-docs 解耦,避免 build-docs 因 LLM 耗时过长而阻塞;调用方可
545 545 先拿不含 qanchors/tags 的 doc,再异步或离线补齐本接口结果后更新 ES。
546 546  
547   - 实现逻辑与 indexer.process_products.analyze_products 一致,支持多语言与 Redis 缓存。
  547 + 实现逻辑与 indexer.product_annotator.analyze_products 一致,支持多语言与 Redis 缓存。
548 548 """
549 549 try:
550 550 if not request.items:
... ...
config/config.yaml
... ... @@ -161,7 +161,7 @@ services:
161 161 base_url: "http://127.0.0.1:6007"
162 162 service_url: "http://127.0.0.1:6007/rerank"
163 163 # 服务内后端(reranker 进程启动时读取)
164   - backend: "dashscope_rerank" # bge | qwen3_vllm | qwen3_transformers | dashscope_rerank
  164 + backend: "qwen3_vllm" # bge | qwen3_vllm | qwen3_transformers | dashscope_rerank
165 165 backends:
166 166 bge:
167 167 model_name: "BAAI/bge-reranker-v2-m3"
... ...
docs/搜索API对接指南.md
... ... @@ -1484,7 +1484,7 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \
1484 1484 ### 5.8 内容理解字段生成接口
1485 1485  
1486 1486 - **端点**: `POST /indexer/enrich-content`
1487   -- **描述**: 根据商品内容信息批量生成 **qanchors**(锚文本)、**semantic_attributes**(语义属性)、**tags**(细分标签),供外部 indexer 在「微服务组合」方式下自行拼装 doc 时使用。请求以 `items[]` 传入商品内容字段(必填/可选见下表)。内部逻辑与 `indexer.process_products` 一致,支持多语言与 Redis 缓存;单次请求在线程池中执行,避免阻塞其他接口。
  1487 +- **描述**: 根据商品内容信息批量生成 **qanchors**(锚文本)、**semantic_attributes**(语义属性)、**tags**(细分标签),供外部 indexer 在「微服务组合」方式下自行拼装 doc 时使用。请求以 `items[]` 传入商品内容字段(必填/可选见下表)。内部逻辑与 `indexer.product_annotator` 一致,支持多语言与 Redis 缓存;单次请求在线程池中执行,避免阻塞其他接口。
1488 1488  
1489 1489 #### 请求参数
1490 1490  
... ...
indexer/ANCHORS_AND_SEMANTIC_ATTRIBUTES.md
... ... @@ -82,14 +82,14 @@
82 82  
83 83 ---
84 84  
85   -### 2. LLM 分析服务:`indexer/process_products.py`
  85 +### 2. LLM 分析服务:`indexer/product_annotator.py`
86 86  
87 87 #### 2.1 入口函数:`analyze_products`
88 88  
89   -- **文件**:`indexer/process_products.py`
  89 +- **文件**:`indexer/product_annotator.py`
90 90 - **函数签名**:
91 91  
92   -```365:392:/home/tw/saas-search/indexer/process_products.py
  92 +```365:392:/home/tw/saas-search/indexer/product_annotator.py
93 93 def analyze_products(
94 94 products: List[Dict[str, str]],
95 95 target_lang: str = "zh",
... ... @@ -108,7 +108,7 @@ def analyze_products(
108 108  
109 109 - **支持的输出语言**(在同文件中定义):
110 110  
111   -```54:62:/home/tw/saas-search/indexer/process_products.py
  111 +```54:62:/home/tw/saas-search/indexer/product_annotator.py
112 112 LANG_LABELS: Dict[str, str] = {
113 113 "zh": "中文",
114 114 "en": "英文",
... ... @@ -148,7 +148,7 @@ SUPPORTED_LANGS = set(LANG_LABELS.keys())
148 148  
149 149 - Prompt 中会明确要求“**所有输出内容使用目标语言**”,并给出中英文示例:
150 150  
151   -```65:81:/home/tw/saas-search/indexer/process_products.py
  151 +```65:81:/home/tw/saas-search/indexer/product_annotator.py
152 152 def create_prompt(products: List[Dict[str, str]], target_lang: str = "zh") -> str:
153 153 """创建LLM提示词(根据目标语言输出)"""
154 154 lang_label = LANG_LABELS.get(target_lang, "对应语言")
... ... @@ -170,7 +170,7 @@ def create_prompt(products: List[Dict[str, str]], target_lang: str = "zh") -> st
170 170  
171 171 - 返回格式固定为 Markdown 表格,首行头为:
172 172  
173   -```89:91:/home/tw/saas-search/indexer/process_products.py
  173 +```89:91:/home/tw/saas-search/indexer/product_annotator.py
174 174 | 序号 | 商品标题 | 品类路径 | 细分标签 | 适用人群 | 使用场景 | 适用季节 | 关键属性 | 材质说明 | 功能特点 | 商品卖点 | 锚文本 |
175 175 |----|----|----|----|----|----|----|----|----|----|----|----|
176 176 ```
... ...
indexer/document_transformer.py
... ... @@ -14,7 +14,7 @@ import logging
14 14 import re
15 15 from typing import Dict, Any, Optional, List
16 16 from config import ConfigLoader
17   -from indexer.process_products import analyze_products, SUPPORTED_LANGS
  17 +from indexer.product_annotator import analyze_products, SUPPORTED_LANGS
18 18  
19 19 logger = logging.getLogger(__name__)
20 20  
... ... @@ -641,7 +641,7 @@ class SPUDocumentTransformer:
641 641  
642 642 def _fill_llm_attributes(self, doc: Dict[str, Any], spu_row: pd.Series) -> None:
643 643 """
644   - 调用 indexer.process_products.analyze_products,为当前 SPU 填充:
  644 + 调用 indexer.product_annotator.analyze_products,为当前 SPU 填充:
645 645 - qanchors.{lang}
646 646 - semantic_attributes (lang/name/value)
647 647 """
... ...
indexer/process_products.py renamed to indexer/product_annotator.py
... ... @@ -43,20 +43,28 @@ OUTPUT_DIR = Path("output_logs")
43 43 OUTPUT_FILE = OUTPUT_DIR / "products_analyzed.csv"
44 44 LOG_DIR = OUTPUT_DIR / "logs"
45 45  
46   -# 设置日志
  46 +# 设置独立日志(不影响全局 indexer.log)
47 47 LOG_DIR.mkdir(parents=True, exist_ok=True)
48 48 timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
49 49 log_file = LOG_DIR / f"process_{timestamp}.log"
50 50  
51   -logging.basicConfig(
52   - level=logging.INFO,
53   - format='%(asctime)s - %(levelname)s - %(message)s',
54   - handlers=[
55   - logging.FileHandler(log_file, encoding='utf-8'),
56   - logging.StreamHandler()
57   - ]
58   -)
59   -logger = logging.getLogger(__name__)
  51 +logger = logging.getLogger("product_annotator")
  52 +logger.setLevel(logging.INFO)
  53 +
  54 +if not logger.handlers:
  55 + formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
  56 +
  57 + file_handler = logging.FileHandler(log_file, encoding="utf-8")
  58 + file_handler.setFormatter(formatter)
  59 +
  60 + stream_handler = logging.StreamHandler()
  61 + stream_handler.setFormatter(formatter)
  62 +
  63 + logger.addHandler(file_handler)
  64 + logger.addHandler(stream_handler)
  65 +
  66 + # 避免日志向根 logger 传播,防止写入 logs/indexer.log 等其他文件
  67 + logger.propagate = False
60 68  
61 69  
62 70 # Redis 缓存(用于 anchors / 语义属性)
... ...
scripts/service_ctl.sh
... ... @@ -868,6 +868,7 @@ Special targets:
868 868  
869 869 Examples:
870 870 ./scripts/service_ctl.sh up all
  871 + ./scripts/service_ctl.sh up tei cnclip embedding translator reranker
871 872 ./scripts/service_ctl.sh up backend indexer frontend
872 873 ./scripts/service_ctl.sh restart
873 874 ./scripts/service_ctl.sh monitor-start all
... ...
tests/ci/test_service_api_contracts.py
... ... @@ -342,7 +342,7 @@ def test_indexer_build_docs_from_db_contract(indexer_client: TestClient):
342 342  
343 343  
344 344 def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch):
345   - import indexer.process_products as process_products
  345 + import indexer.product_annotator as process_products
346 346  
347 347 def _fake_analyze_products(
348 348 products: List[Dict[str, str]],
... ...
tests/test_process_products_batching.py
... ... @@ -2,7 +2,7 @@ from __future__ import annotations
2 2  
3 3 from typing import Any, Dict, List
4 4  
5   -import indexer.process_products as process_products
  5 +import indexer.product_annotator as process_products
6 6  
7 7  
8 8 def _mk_products(n: int) -> List[Dict[str, str]]:
... ...