Commit 22ae00c78124374e37763045a0b9dd35915c9058
1 parent
77ab67ad
product_annotator
Showing
9 changed files
with
34 additions
and
25 deletions
Show diff stats
api/routes/indexer.py
| ... | ... | @@ -443,10 +443,10 @@ async def build_docs_from_db(request: BuildDocsFromDbRequest): |
| 443 | 443 | |
| 444 | 444 | def _run_enrich_content(tenant_id: str, items: List[Dict[str, str]], languages: List[str]) -> List[Dict[str, Any]]: |
| 445 | 445 | """ |
| 446 | - 同步执行内容理解:调用 process_products.analyze_products,按语言批量跑 LLM, | |
| 446 | + 同步执行内容理解:调用 product_annotator.analyze_products,按语言批量跑 LLM, | |
| 447 | 447 | 再聚合成每 SPU 的 qanchors、semantic_attributes、tags。供 run_in_executor 调用。 |
| 448 | 448 | """ |
| 449 | - from indexer.process_products import analyze_products, SUPPORTED_LANGS | |
| 449 | + from indexer.product_annotator import analyze_products, SUPPORTED_LANGS | |
| 450 | 450 | |
| 451 | 451 | llm_langs = [lang for lang in languages if lang in SUPPORTED_LANGS] |
| 452 | 452 | if not llm_langs: |
| ... | ... | @@ -544,7 +544,7 @@ async def enrich_content(request: EnrichContentRequest): |
| 544 | 544 | - 与 /indexer/build-docs 解耦,避免 build-docs 因 LLM 耗时过长而阻塞;调用方可 |
| 545 | 545 | 先拿不含 qanchors/tags 的 doc,再异步或离线补齐本接口结果后更新 ES。 |
| 546 | 546 | |
| 547 | - 实现逻辑与 indexer.process_products.analyze_products 一致,支持多语言与 Redis 缓存。 | |
| 547 | + 实现逻辑与 indexer.product_annotator.analyze_products 一致,支持多语言与 Redis 缓存。 | |
| 548 | 548 | """ |
| 549 | 549 | try: |
| 550 | 550 | if not request.items: |
| ... | ... |
config/config.yaml
| ... | ... | @@ -161,7 +161,7 @@ services: |
| 161 | 161 | base_url: "http://127.0.0.1:6007" |
| 162 | 162 | service_url: "http://127.0.0.1:6007/rerank" |
| 163 | 163 | # 服务内后端(reranker 进程启动时读取) |
| 164 | - backend: "dashscope_rerank" # bge | qwen3_vllm | qwen3_transformers | dashscope_rerank | |
| 164 | + backend: "qwen3_vllm" # bge | qwen3_vllm | qwen3_transformers | dashscope_rerank | |
| 165 | 165 | backends: |
| 166 | 166 | bge: |
| 167 | 167 | model_name: "BAAI/bge-reranker-v2-m3" |
| ... | ... |
docs/搜索API对接指南.md
| ... | ... | @@ -1484,7 +1484,7 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ |
| 1484 | 1484 | ### 5.8 内容理解字段生成接口 |
| 1485 | 1485 | |
| 1486 | 1486 | - **端点**: `POST /indexer/enrich-content` |
| 1487 | -- **描述**: 根据商品内容信息批量生成 **qanchors**(锚文本)、**semantic_attributes**(语义属性)、**tags**(细分标签),供外部 indexer 在「微服务组合」方式下自行拼装 doc 时使用。请求以 `items[]` 传入商品内容字段(必填/可选见下表)。内部逻辑与 `indexer.process_products` 一致,支持多语言与 Redis 缓存;单次请求在线程池中执行,避免阻塞其他接口。 | |
| 1487 | +- **描述**: 根据商品内容信息批量生成 **qanchors**(锚文本)、**semantic_attributes**(语义属性)、**tags**(细分标签),供外部 indexer 在「微服务组合」方式下自行拼装 doc 时使用。请求以 `items[]` 传入商品内容字段(必填/可选见下表)。内部逻辑与 `indexer.product_annotator` 一致,支持多语言与 Redis 缓存;单次请求在线程池中执行,避免阻塞其他接口。 | |
| 1488 | 1488 | |
| 1489 | 1489 | #### 请求参数 |
| 1490 | 1490 | |
| ... | ... |
indexer/ANCHORS_AND_SEMANTIC_ATTRIBUTES.md
| ... | ... | @@ -82,14 +82,14 @@ |
| 82 | 82 | |
| 83 | 83 | --- |
| 84 | 84 | |
| 85 | -### 2. LLM 分析服务:`indexer/process_products.py` | |
| 85 | +### 2. LLM 分析服务:`indexer/product_annotator.py` | |
| 86 | 86 | |
| 87 | 87 | #### 2.1 入口函数:`analyze_products` |
| 88 | 88 | |
| 89 | -- **文件**:`indexer/process_products.py` | |
| 89 | +- **文件**:`indexer/product_annotator.py` | |
| 90 | 90 | - **函数签名**: |
| 91 | 91 | |
| 92 | -```365:392:/home/tw/saas-search/indexer/process_products.py | |
| 92 | +```365:392:/home/tw/saas-search/indexer/product_annotator.py | |
| 93 | 93 | def analyze_products( |
| 94 | 94 | products: List[Dict[str, str]], |
| 95 | 95 | target_lang: str = "zh", |
| ... | ... | @@ -108,7 +108,7 @@ def analyze_products( |
| 108 | 108 | |
| 109 | 109 | - **支持的输出语言**(在同文件中定义): |
| 110 | 110 | |
| 111 | -```54:62:/home/tw/saas-search/indexer/process_products.py | |
| 111 | +```54:62:/home/tw/saas-search/indexer/product_annotator.py | |
| 112 | 112 | LANG_LABELS: Dict[str, str] = { |
| 113 | 113 | "zh": "中文", |
| 114 | 114 | "en": "英文", |
| ... | ... | @@ -148,7 +148,7 @@ SUPPORTED_LANGS = set(LANG_LABELS.keys()) |
| 148 | 148 | |
| 149 | 149 | - Prompt 中会明确要求“**所有输出内容使用目标语言**”,并给出中英文示例: |
| 150 | 150 | |
| 151 | -```65:81:/home/tw/saas-search/indexer/process_products.py | |
| 151 | +```65:81:/home/tw/saas-search/indexer/product_annotator.py | |
| 152 | 152 | def create_prompt(products: List[Dict[str, str]], target_lang: str = "zh") -> str: |
| 153 | 153 | """创建LLM提示词(根据目标语言输出)""" |
| 154 | 154 | lang_label = LANG_LABELS.get(target_lang, "对应语言") |
| ... | ... | @@ -170,7 +170,7 @@ def create_prompt(products: List[Dict[str, str]], target_lang: str = "zh") -> st |
| 170 | 170 | |
| 171 | 171 | - 返回格式固定为 Markdown 表格,首行头为: |
| 172 | 172 | |
| 173 | -```89:91:/home/tw/saas-search/indexer/process_products.py | |
| 173 | +```89:91:/home/tw/saas-search/indexer/product_annotator.py | |
| 174 | 174 | | 序号 | 商品标题 | 品类路径 | 细分标签 | 适用人群 | 使用场景 | 适用季节 | 关键属性 | 材质说明 | 功能特点 | 商品卖点 | 锚文本 | |
| 175 | 175 | |----|----|----|----|----|----|----|----|----|----|----|----| |
| 176 | 176 | ``` |
| ... | ... |
indexer/document_transformer.py
| ... | ... | @@ -14,7 +14,7 @@ import logging |
| 14 | 14 | import re |
| 15 | 15 | from typing import Dict, Any, Optional, List |
| 16 | 16 | from config import ConfigLoader |
| 17 | -from indexer.process_products import analyze_products, SUPPORTED_LANGS | |
| 17 | +from indexer.product_annotator import analyze_products, SUPPORTED_LANGS | |
| 18 | 18 | |
| 19 | 19 | logger = logging.getLogger(__name__) |
| 20 | 20 | |
| ... | ... | @@ -641,7 +641,7 @@ class SPUDocumentTransformer: |
| 641 | 641 | |
| 642 | 642 | def _fill_llm_attributes(self, doc: Dict[str, Any], spu_row: pd.Series) -> None: |
| 643 | 643 | """ |
| 644 | - 调用 indexer.process_products.analyze_products,为当前 SPU 填充: | |
| 644 | + 调用 indexer.product_annotator.analyze_products,为当前 SPU 填充: | |
| 645 | 645 | - qanchors.{lang} |
| 646 | 646 | - semantic_attributes (lang/name/value) |
| 647 | 647 | """ |
| ... | ... |
indexer/process_products.py renamed to indexer/product_annotator.py
| ... | ... | @@ -43,20 +43,28 @@ OUTPUT_DIR = Path("output_logs") |
| 43 | 43 | OUTPUT_FILE = OUTPUT_DIR / "products_analyzed.csv" |
| 44 | 44 | LOG_DIR = OUTPUT_DIR / "logs" |
| 45 | 45 | |
| 46 | -# 设置日志 | |
| 46 | +# 设置独立日志(不影响全局 indexer.log) | |
| 47 | 47 | LOG_DIR.mkdir(parents=True, exist_ok=True) |
| 48 | 48 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") |
| 49 | 49 | log_file = LOG_DIR / f"process_{timestamp}.log" |
| 50 | 50 | |
| 51 | -logging.basicConfig( | |
| 52 | - level=logging.INFO, | |
| 53 | - format='%(asctime)s - %(levelname)s - %(message)s', | |
| 54 | - handlers=[ | |
| 55 | - logging.FileHandler(log_file, encoding='utf-8'), | |
| 56 | - logging.StreamHandler() | |
| 57 | - ] | |
| 58 | -) | |
| 59 | -logger = logging.getLogger(__name__) | |
| 51 | +logger = logging.getLogger("product_annotator") | |
| 52 | +logger.setLevel(logging.INFO) | |
| 53 | + | |
| 54 | +if not logger.handlers: | |
| 55 | + formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") | |
| 56 | + | |
| 57 | + file_handler = logging.FileHandler(log_file, encoding="utf-8") | |
| 58 | + file_handler.setFormatter(formatter) | |
| 59 | + | |
| 60 | + stream_handler = logging.StreamHandler() | |
| 61 | + stream_handler.setFormatter(formatter) | |
| 62 | + | |
| 63 | + logger.addHandler(file_handler) | |
| 64 | + logger.addHandler(stream_handler) | |
| 65 | + | |
| 66 | + # 避免日志向根 logger 传播,防止写入 logs/indexer.log 等其他文件 | |
| 67 | + logger.propagate = False | |
| 60 | 68 | |
| 61 | 69 | |
| 62 | 70 | # Redis 缓存(用于 anchors / 语义属性) | ... | ... |
scripts/service_ctl.sh
| ... | ... | @@ -868,6 +868,7 @@ Special targets: |
| 868 | 868 | |
| 869 | 869 | Examples: |
| 870 | 870 | ./scripts/service_ctl.sh up all |
| 871 | + ./scripts/service_ctl.sh up tei cnclip embedding translator reranker | |
| 871 | 872 | ./scripts/service_ctl.sh up backend indexer frontend |
| 872 | 873 | ./scripts/service_ctl.sh restart |
| 873 | 874 | ./scripts/service_ctl.sh monitor-start all |
| ... | ... |
tests/ci/test_service_api_contracts.py
| ... | ... | @@ -342,7 +342,7 @@ def test_indexer_build_docs_from_db_contract(indexer_client: TestClient): |
| 342 | 342 | |
| 343 | 343 | |
| 344 | 344 | def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch): |
| 345 | - import indexer.process_products as process_products | |
| 345 | + import indexer.product_annotator as process_products | |
| 346 | 346 | |
| 347 | 347 | def _fake_analyze_products( |
| 348 | 348 | products: List[Dict[str, str]], |
| ... | ... |
tests/test_process_products_batching.py