Commit f251cf2d502df5c8e9da2b00757eee2444694e21

Authored by tangwang
1 parent ded6f29e

suggestion全量索引程序跑通

docs/Usage-Guide.md
... ... @@ -343,6 +343,49 @@ curl -X POST http://localhost:6002/search/image \
343 343  
344 344 ---
345 345  
  346 +## 8. Suggestion 索引与接口使用
  347 +
  348 +### 8.1 构建 Suggestion 索引(全量)
  349 +
  350 +Suggestion 索引的数据来源:
  351 +
  352 +- ES 商品索引:`title.{lang}`, `qanchors.{lang}`
  353 +- MySQL 日志表:`shoplazza_search_log.query`(含 `language`、`request_params`)
  354 +
  355 +以上数据经聚合后写入索引 `search_suggestions_tenant_{tenant_id}`。
  356 +
  357 +在项目根目录执行:
  358 +
  359 +```bash
  360 +# 为指定租户全量重建 suggestion 索引(会删除旧索引)
  361 +python main.py build-suggestions \
  362 + --tenant-id 162 \
  363 + --es-host http://localhost:9200 \
  364 + --days 30 \
  365 + --recreate
  366 +```
  367 +
  368 +可选参数:
  369 +
  370 +- `--days`:回溯日志天数(默认 30)
  371 +- `--batch-size`:扫描商品索引的批大小(默认 500)
  372 +- `--min-query-len`:参与 suggestion 的最小查询长度(默认 1)
  373 +
  374 +> 建议在商品索引构建完成、日志正常写入一段时间后,先执行一次全量构建,之后再按天或按小时配置增量构建任务。
  375 +
  376 +### 8.2 调用 Suggestion 接口
  377 +
  378 +全量构建完成后,可直接通过 `/search/suggestions` 获取自动补全结果:
  379 +
  380 +```bash
  381 +curl "http://localhost:6002/search/suggestions?q=iph&size=5&language=en&with_results=true" \
  382 + -H "X-Tenant-ID: 162"
  383 +```
  384 +
  385 +接口返回结构详见 `docs/搜索API对接指南.md` 的“3.7 搜索建议接口”章节。
  386 +
  387 +---
  388 +
346 389 ## 常见问题
347 390  
348 391 ### Q1: MySQL连接失败
... ...
docs/搜索API对接指南.md
... ... @@ -558,40 +558,59 @@ response = requests.post(url, headers=headers, json={"query": "芭比娃娃"})
558 558  
559 559 ### 3.7 搜索建议接口
560 560  
561   -> ⚠️ **TODO**: 此接口当前为框架实现,功能暂未实现,仅返回空结果。接口和响应格式已经固定,可平滑扩展。
562   -
563 561 - **端点**: `GET /search/suggestions`
564   -- **描述**: 返回搜索建议(自动补全/热词)。当前为框架实现,接口和响应格式已经固定,可平滑扩展
  562 +- **描述**: 返回搜索建议(自动补全/热词),支持多语言与“结果直达”(每条 suggestion 附带商品列表)
565 563  
566 564 #### 查询参数
567 565  
568 566 | 参数 | 类型 | 必填 | 默认值 | 描述 |
569 567 |------|------|------|--------|------|
570 568 | `q` | string | Y | - | 查询字符串(至少 1 个字符) |
571   -| `size` | integer | N | 5 | 返回建议数量(1-20) |
572   -| `types` | string | N | `query` | 建议类型(逗号分隔):`query`, `product`, `category`, `brand` |
  569 +| `size` | integer | N | 10 | 返回建议数量(1-20) |
  570 +| `language` | string | N | `en` | 请求语言,如 `zh` / `en` / `ar` / `ru`,用于路由到对应语种 suggestion 索引 |
  571 +| `with_results` | bool | N | `true` | 是否为每条 suggestion 返回商品列表(结果直达) |
  572 +| `result_size` | integer | N | 3 | 每条 suggestion 返回的商品数量(1-10) |
  573 +| `debug` | bool | N | `false` | 是否开启调试(目前主要用于排查 suggestion 排序与语言解析) |
  574 +
  575 +> **租户标识**:同 [3.1](#31-接口信息),通过请求头 `X-Tenant-ID` 或 query 参数 `tenant_id` 传递。
573 576  
574 577 #### 响应示例
575 578  
576 579 ```json
577 580 {
578   - "query": "芭",
  581 + "query": "iph",
  582 + "language": "en",
  583 + "resolved_language": "en",
579 584 "suggestions": [
580 585 {
581   - "text": "芭比娃娃",
582   - "type": "query",
583   - "highlight": "<em>芭</em>比娃娃",
584   - "popularity": 850
  586 + "text": "iphone 15",
  587 + "lang": "en",
  588 + "score": 12.37,
  589 + "rank_score": 5.1,
  590 + "sources": ["query_log", "qanchor"],
  591 + "lang_source": "log_field",
  592 + "lang_confidence": 1.0,
  593 + "lang_conflict": false,
  594 + "products": [
  595 + {
  596 + "spu_id": "12345",
  597 + "title": "iPhone 15 Pro Max",
  598 + "price": 999.0,
  599 + "image_url": "https://example.com/image.jpg",
  600 + "score": 3.21
  601 + }
  602 + ]
585 603 }
586 604 ],
587   - "took_ms": 5
  605 + "took_ms": 12
588 606 }
589 607 ```
590 608  
591 609 #### 请求示例
592 610  
593 611 ```bash
594   -curl "http://localhost:6002/search/suggestions?q=芭&size=5&types=query,product"
  612 +curl "http://localhost:6002/search/suggestions?q=iph&size=5&language=en&with_results=true" \
  613 + -H "X-Tenant-ID: 162"
595 614 ```
596 615  
597 616 ### 3.8 即时搜索接口
... ...
docs/搜索API速查表.md
... ... @@ -294,7 +294,7 @@ POST /search/image
294 294 "size": 20
295 295 }
296 296  
297   -GET /search/suggestions?q=芭&size=5
  297 +GET /search/suggestions?q=芭&size=5&language=zh&with_results=true
298 298  
299 299 GET /search/instant?q=玩具&size=5
300 300  
... ...
... ... @@ -12,13 +12,12 @@ import sys
12 12 import os
13 13 import argparse
14 14 import json
15   -import uvicorn
16 15  
17 16 # Add parent directory to path
18 17 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
19 18  
20 19 from config import ConfigLoader
21   -from config.env_config import DB_CONFIG
  20 +from config.env_config import ES_CONFIG
22 21 from utils import ESClient
23 22 from search import Searcher
24 23 from suggestion import SuggestionIndexBuilder
... ... @@ -27,6 +26,7 @@ from utils.db_connector import create_db_connection
27 26  
28 27 def cmd_serve(args):
29 28 """Start API service."""
  29 + import uvicorn
30 30 os.environ['ES_HOST'] = args.es_host
31 31  
32 32 print("Starting API service (multi-tenant)...")
... ... @@ -43,6 +43,7 @@ def cmd_serve(args):
43 43  
44 44 def cmd_serve_indexer(args):
45 45 """Start dedicated Indexer API service (no search endpoints)."""
  46 + import uvicorn
46 47 os.environ['ES_HOST'] = args.es_host
47 48  
48 49 print("Starting Indexer API service...")
... ... @@ -102,17 +103,33 @@ def cmd_search(args):
102 103  
103 104 def cmd_build_suggestions(args):
104 105 """Build suggestion index for a tenant."""
105   - es_client = ESClient(hosts=[args.es_host])
  106 + # Initialize ES client with optional authentication
  107 + es_username = os.getenv("ES_USERNAME") or ES_CONFIG.get("username")
  108 + es_password = os.getenv("ES_PASSWORD") or ES_CONFIG.get("password")
  109 + if es_username and es_password:
  110 + es_client = ESClient(hosts=[args.es_host], username=es_username, password=es_password)
  111 + else:
  112 + es_client = ESClient(hosts=[args.es_host])
106 113 if not es_client.ping():
107 114 print(f"ERROR: Cannot connect to Elasticsearch at {args.es_host}")
108 115 return 1
109 116  
  117 + # Build DB config directly from environment to avoid dotenv dependency
  118 + db_host = os.getenv("DB_HOST")
  119 + db_port = int(os.getenv("DB_PORT", "3306"))
  120 + db_name = os.getenv("DB_DATABASE")
  121 + db_user = os.getenv("DB_USERNAME")
  122 + db_pass = os.getenv("DB_PASSWORD")
  123 + if not all([db_host, db_name, db_user, db_pass]):
  124 + print("ERROR: DB_HOST/DB_PORT/DB_DATABASE/DB_USERNAME/DB_PASSWORD must be set in environment")
  125 + return 1
  126 +
110 127 db_engine = create_db_connection(
111   - host=DB_CONFIG["host"],
112   - port=DB_CONFIG["port"],
113   - database=DB_CONFIG["database"],
114   - username=DB_CONFIG["username"],
115   - password=DB_CONFIG["password"],
  128 + host=db_host,
  129 + port=db_port,
  130 + database=db_name,
  131 + username=db_user,
  132 + password=db_pass,
116 133 )
117 134 builder = SuggestionIndexBuilder(es_client=es_client, db_engine=db_engine)
118 135 result = builder.rebuild_tenant_index(
... ...
scripts/build_suggestions.sh 0 → 100755
... ... @@ -0,0 +1,27 @@
#!/usr/bin/env bash
#
# Convenience script to rebuild the suggestion index for a tenant.
#
# Changes into the directory one level above this script and runs
# `main.py build-suggestions`, forwarding every argument after <tenant_id>
# unchanged.
#
# Usage:
#   ./scripts/build_suggestions.sh <tenant_id> [--days 30] [--batch-size 500] [--min-query-len 1] [--es-host http://localhost:9200] [--recreate]
#
# Environment:
#   PYTHON  Interpreter used to run main.py (default: python).
#

set -euo pipefail

if [ $# -lt 1 ]; then
  # Usage errors belong on stderr so they never mix with captured stdout.
  echo "Usage: $0 <tenant_id> [extra args...]" >&2
  echo "Example: $0 162 --days 30 --recreate" >&2
  exit 1
fi

TENANT_ID="$1"
# The guard above guarantees at least one argument, so a plain shift is safe
# under `set -e`.
shift

# Resolve the project root relative to this script so it works from any cwd.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"

cd "$ROOT_DIR"

"${PYTHON:-python}" main.py build-suggestions \
  --tenant-id "$TENANT_ID" \
  "$@"
... ...
tests/test_suggestions.py 0 → 100644
... ... @@ -0,0 +1,183 @@
  1 +import json
  2 +from typing import Any, Dict, List
  3 +
  4 +import pytest
  5 +
  6 +from suggestion.builder import SuggestionIndexBuilder
  7 +from suggestion.service import SuggestionService
  8 +
  9 +
class FakeESClient:
    """In-memory stand-in for the ES client used by the suggestion tests.

    ``search`` answers with one canned hit for suggestion indices, one canned
    hit for product indices, and an empty result for anything else. Search,
    bulk and create-index calls are appended to ``self.calls``.
    """

    def __init__(self) -> None:
        # Chronological record of the calls made against this fake.
        self.calls: List[Dict[str, Any]] = []

    @staticmethod
    def _envelope(hits: List[Dict[str, Any]], max_score: float) -> Dict[str, Any]:
        """Wrap *hits* in the ES-style ``{"hits": {...}}`` response shape."""
        return {
            "hits": {
                "total": {"value": len(hits)},
                "max_score": max_score,
                "hits": hits,
            }
        }

    def search(self, index_name: str, body: Dict[str, Any], size: int = 10, from_: int = 0) -> Dict[str, Any]:
        """Record the call and return the canned response for *index_name*."""
        self.calls.append({"index": index_name, "body": body, "size": size, "from": from_})

        # Suggestion index is checked first, matching the lookup order the
        # tests rely on.
        if "search_suggestions_tenant_" in index_name:
            suggestion_hit = {
                "_id": "1",
                "_score": 3.2,
                "_source": {
                    "text": "iphone 15",
                    "lang": "en",
                    "rank_score": 5.0,
                    "sources": ["query_log", "qanchor"],
                    "lang_source": "log_field",
                    "lang_confidence": 1.0,
                    "lang_conflict": False,
                    "top_spu_ids": ["12345"],
                },
            }
            return self._envelope([suggestion_hit], 3.2)

        # Product index
        if "search_products_tenant_" in index_name:
            product_hit = {
                "_id": "12345",
                "_score": 2.5,
                "_source": {
                    "spu_id": "12345",
                    "title": {"en": "iPhone 15 Pro Max"},
                    "min_price": 999.0,
                    "image_url": "https://example.com/image.jpg",
                    "sales": 100,
                    "total_inventory": 50,
                },
            }
            return self._envelope([product_hit], 2.5)

        # Unknown index: empty result set.
        return self._envelope([], 0.0)

    # Present for compatibility with builder.bulk_index usage in full runs
    # (not exercised by these unit tests).
    def bulk_index(self, index_name: str, docs: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Record the bulk request and report every document as indexed."""
        self.calls.append({"index": index_name, "bulk": True, "docs": docs})
        indexed = len(docs)
        return {"success": indexed, "failed": 0, "errors": []}

    def index_exists(self, index_name: str) -> bool:
        """Always report the index as missing."""
        return False

    def delete_index(self, index_name: str) -> bool:
        """Pretend the index was deleted."""
        return True

    def create_index(self, index_name: str, body: Dict[str, Any]) -> bool:
        """Record the create request and report success."""
        self.calls.append({"index": index_name, "create": True, "body": body})
        return True

    def refresh(self, index_name: str) -> bool:
        """Pretend the index was refreshed."""
        return True
  83 +
  84 +
@pytest.mark.unit
def test_resolve_query_language_prefers_log_field():
    """_resolve_query_language should prefer the language field from the log row."""
    fake_es = FakeESClient()
    builder = SuggestionIndexBuilder(es_client=fake_es, db_engine=None)
    # index_languages contains en/zh while primary is zh, so getting "en" back
    # shows the log field won over the tenant's primary language.
    lang, conf, source, conflict = builder._resolve_query_language(
        query="iphone 15",
        log_language="en",
        request_params=None,
        index_languages=["zh", "en"],
        primary_language="zh",
    )
    assert lang == "en"
    assert conf == 1.0
    assert source == "log_field"
    assert conflict is False
  102 +
  103 +
@pytest.mark.unit
def test_resolve_query_language_uses_request_params_when_log_missing():
    """With no log language, the language is taken from request_params.language."""
    index_builder = SuggestionIndexBuilder(es_client=FakeESClient(), db_engine=None)
    # request_params is passed as a JSON string carrying the language.
    raw_params = json.dumps({"language": "zh"})
    resolved = index_builder._resolve_query_language(
        query="芭比娃娃",
        log_language=None,
        request_params=raw_params,
        index_languages=["zh", "en"],
        primary_language="en",
    )
    language, confidence, origin, has_conflict = resolved
    assert language == "zh"
    assert confidence == 1.0
    assert origin == "request_params"
    assert has_conflict is False
  121 +
  122 +
@pytest.mark.unit
def test_resolve_query_language_fallback_to_primary():
    """With no language hints at all, the tenant's primary_language is used."""
    index_builder = SuggestionIndexBuilder(es_client=FakeESClient(), db_engine=None)
    resolved = index_builder._resolve_query_language(
        query="some text",
        log_language=None,
        request_params=None,
        index_languages=["zh", "en"],
        primary_language="zh",
    )
    # The confidence value is unpacked but not asserted in this scenario.
    language, _confidence, origin, has_conflict = resolved
    assert language == "zh"
    assert origin == "default"
    assert has_conflict is False
  138 +
  139 +
@pytest.mark.unit
def test_suggestion_service_basic_flow(monkeypatch):
    """
    SuggestionService.search should query ES and return suggestions with products.

    FakeESClient is used so no real Elasticsearch is needed.
    """
    # Override the tenant config loader so the test does not depend on the
    # contents of an external config.yaml.
    from config import tenant_config_loader as tcl

    loader = tcl.get_tenant_config_loader()
    # Patch the loader's cached config through monkeypatch so it is restored
    # after the test; a plain assignment would leak this fake config into
    # later tests that share the same loader object. raising=False keeps the
    # old assignment semantics if the attribute has not been created yet.
    monkeypatch.setattr(
        loader,
        "_config",
        {
            "default": {"primary_language": "en", "index_languages": ["en", "zh"]},
            "tenants": {
                "1": {"primary_language": "en", "index_languages": ["en", "zh"]},
            },
        },
        raising=False,
    )

    fake_es = FakeESClient()
    service = SuggestionService(es_client=fake_es)
    result = service.search(
        tenant_id="1",
        query="iph",
        language="en",
        size=5,
        with_results=True,
        result_size=2,
    )

    assert result["resolved_language"] == "en"
    assert result["query"] == "iph"
    assert result["took_ms"] >= 0
    suggestions = result["suggestions"]
    assert len(suggestions) == 1
    s0 = suggestions[0]
    assert s0["text"] == "iphone 15"
    assert s0["lang"] == "en"
    assert isinstance(s0.get("products"), list)
    assert len(s0["products"]) >= 1
    p0 = s0["products"][0]
    assert p0["spu_id"] == "12345"
    assert "title" in p0
    assert "price" in p0
  183 +
... ...