Commit f251cf2d502df5c8e9da2b00757eee2444694e21

Authored by tangwang
1 parent ded6f29e

suggestion全量索引程序跑通

docs/Usage-Guide.md
@@ -343,6 +343,49 @@ curl -X POST http://localhost:6002/search/image \ @@ -343,6 +343,49 @@ curl -X POST http://localhost:6002/search/image \
343 343
344 --- 344 ---
345 345
  346 +## 8. Suggestion 索引与接口使用
  347 +
  348 +### 8.1 构建 Suggestion 索引(全量)
  349 +
  350 +Suggestion 索引会从:
  351 +
  352 +- ES 商品索引:`title.{lang}`, `qanchors.{lang}`
  353 +- MySQL 日志表:`shoplazza_search_log.query`(含 `language`、`request_params`)
  354 +
  355 +聚合生成 `search_suggestions_tenant_{tenant_id}`。
  356 +
  357 +在项目根目录执行:
  358 +
  359 +```bash
  360 +# 为指定租户全量重建 suggestion 索引(会删除旧索引)
  361 +python main.py build-suggestions \
  362 + --tenant-id 162 \
  363 + --es-host http://localhost:9200 \
  364 + --days 30 \
  365 + --recreate
  366 +```
  367 +
  368 +可选参数:
  369 +
  370 +- `--days`:回溯日志天数(默认 30)
  371 +- `--batch-size`:扫描商品索引的批大小(默认 500)
  372 +- `--min-query-len`:参与 suggestion 的最小查询长度(默认 1)
  373 +
  374 +> 建议在商品索引构建完成、日志正常写入一段时间后执行一次全量构建,然后按天/小时增加增量构建任务。
  375 +
  376 +### 8.2 调用 Suggestion 接口
  377 +
  378 +全量构建完成后,可直接通过 `/search/suggestions` 获取自动补全结果:
  379 +
  380 +```bash
  381 +curl "http://localhost:6002/search/suggestions?q=iph&size=5&language=en&with_results=true" \
  382 + -H "X-Tenant-ID: 162"
  383 +```
  384 +
  385 +接口返回结构详见 `docs/搜索API对接指南.md` 的“3.7 搜索建议接口”章节。
  386 +
  387 +---
  388 +
346 ## 常见问题 389 ## 常见问题
347 390
348 ### Q1: MySQL连接失败 391 ### Q1: MySQL连接失败
docs/搜索API对接指南.md
@@ -558,40 +558,59 @@ response = requests.post(url, headers=headers, json={"query": "芭比娃娃"}) @@ -558,40 +558,59 @@ response = requests.post(url, headers=headers, json={"query": "芭比娃娃"})
558 558
559 ### 3.7 搜索建议接口 559 ### 3.7 搜索建议接口
560 560
561 -> ⚠️ **TODO**: 此接口当前为框架实现,功能暂未实现,仅返回空结果。接口和响应格式已经固定,可平滑扩展。  
562 -  
563 - **端点**: `GET /search/suggestions` 561 - **端点**: `GET /search/suggestions`
564 -- **描述**: 返回搜索建议(自动补全/热词)。当前为框架实现,接口和响应格式已经固定,可平滑扩展 562 +- **描述**: 返回搜索建议(自动补全/热词),支持多语言与“结果直达”(每条 suggestion 附带商品列表)
565 563
566 #### 查询参数 564 #### 查询参数
567 565
568 | 参数 | 类型 | 必填 | 默认值 | 描述 | 566 | 参数 | 类型 | 必填 | 默认值 | 描述 |
569 |------|------|------|--------|------| 567 |------|------|------|--------|------|
570 | `q` | string | Y | - | 查询字符串(至少 1 个字符) | 568 | `q` | string | Y | - | 查询字符串(至少 1 个字符) |
571 -| `size` | integer | N | 5 | 返回建议数量(1-20) |  
572 -| `types` | string | N | `query` | 建议类型(逗号分隔):`query`, `product`, `category`, `brand` | 569 +| `size` | integer | N | 10 | 返回建议数量(1-20) |
  570 +| `language` | string | N | `en` | 请求语言,如 `zh` / `en` / `ar` / `ru`,用于在租户的 suggestion 索引中匹配对应语种的建议(实际采用的语种见响应中的 `resolved_language`) |
  571 +| `with_results` | bool | N | `true` | 是否为每条 suggestion 返回商品列表(结果直达) |
  572 +| `result_size` | integer | N | 3 | 每条 suggestion 返回的商品数量(1-10) |
  573 +| `debug` | bool | N | `false` | 是否开启调试(目前主要用于排查 suggestion 排序与语言解析) |
  574 +
  575 +> **租户标识**:同 [3.1](#31-接口信息),通过请求头 `X-Tenant-ID` 或 query 参数 `tenant_id` 传递。
573 576
574 #### 响应示例 577 #### 响应示例
575 578
576 ```json 579 ```json
577 { 580 {
578 - "query": "芭", 581 + "query": "iph",
  582 + "language": "en",
  583 + "resolved_language": "en",
579 "suggestions": [ 584 "suggestions": [
580 { 585 {
581 - "text": "芭比娃娃",  
582 - "type": "query",  
583 - "highlight": "<em>芭</em>比娃娃",  
584 - "popularity": 850 586 + "text": "iphone 15",
  587 + "lang": "en",
  588 + "score": 12.37,
  589 + "rank_score": 5.1,
  590 + "sources": ["query_log", "qanchor"],
  591 + "lang_source": "log_field",
  592 + "lang_confidence": 1.0,
  593 + "lang_conflict": false,
  594 + "products": [
  595 + {
  596 + "spu_id": "12345",
  597 + "title": "iPhone 15 Pro Max",
  598 + "price": 999.0,
  599 + "image_url": "https://example.com/image.jpg",
  600 + "score": 3.21
  601 + }
  602 + ]
585 } 603 }
586 ], 604 ],
587 - "took_ms": 5 605 + "took_ms": 12
588 } 606 }
589 ``` 607 ```
590 608
591 #### 请求示例 609 #### 请求示例
592 610
593 ```bash 611 ```bash
594 -curl "http://localhost:6002/search/suggestions?q=芭&size=5&types=query,product" 612 +curl "http://localhost:6002/search/suggestions?q=芭&size=5&language=zh&with_results=true" \
  613 + -H "X-Tenant-ID: 162"
595 ``` 614 ```
596 615
597 ### 3.8 即时搜索接口 616 ### 3.8 即时搜索接口
docs/搜索API速查表.md
@@ -294,7 +294,7 @@ POST /search/image @@ -294,7 +294,7 @@ POST /search/image
294 "size": 20 294 "size": 20
295 } 295 }
296 296
297 -GET /search/suggestions?q=芭&size=5 297 +GET /search/suggestions?q=芭&size=5&language=zh&with_results=true
298 298
299 GET /search/instant?q=玩具&size=5 299 GET /search/instant?q=玩具&size=5
300 300
@@ -12,13 +12,12 @@ import sys @@ -12,13 +12,12 @@ import sys
12 import os 12 import os
13 import argparse 13 import argparse
14 import json 14 import json
15 -import uvicorn  
16 15
17 # Add parent directory to path 16 # Add parent directory to path
18 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) 17 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
19 18
20 from config import ConfigLoader 19 from config import ConfigLoader
21 -from config.env_config import DB_CONFIG 20 +from config.env_config import ES_CONFIG
22 from utils import ESClient 21 from utils import ESClient
23 from search import Searcher 22 from search import Searcher
24 from suggestion import SuggestionIndexBuilder 23 from suggestion import SuggestionIndexBuilder
@@ -27,6 +26,7 @@ from utils.db_connector import create_db_connection @@ -27,6 +26,7 @@ from utils.db_connector import create_db_connection
27 26
28 def cmd_serve(args): 27 def cmd_serve(args):
29 """Start API service.""" 28 """Start API service."""
  29 + import uvicorn
30 os.environ['ES_HOST'] = args.es_host 30 os.environ['ES_HOST'] = args.es_host
31 31
32 print("Starting API service (multi-tenant)...") 32 print("Starting API service (multi-tenant)...")
@@ -43,6 +43,7 @@ def cmd_serve(args): @@ -43,6 +43,7 @@ def cmd_serve(args):
43 43
44 def cmd_serve_indexer(args): 44 def cmd_serve_indexer(args):
45 """Start dedicated Indexer API service (no search endpoints).""" 45 """Start dedicated Indexer API service (no search endpoints)."""
  46 + import uvicorn
46 os.environ['ES_HOST'] = args.es_host 47 os.environ['ES_HOST'] = args.es_host
47 48
48 print("Starting Indexer API service...") 49 print("Starting Indexer API service...")
@@ -102,17 +103,33 @@ def cmd_search(args): @@ -102,17 +103,33 @@ def cmd_search(args):
102 103
103 def cmd_build_suggestions(args): 104 def cmd_build_suggestions(args):
104 """Build suggestion index for a tenant.""" 105 """Build suggestion index for a tenant."""
105 - es_client = ESClient(hosts=[args.es_host]) 106 + # Initialize ES client with optional authentication
  107 + es_username = os.getenv("ES_USERNAME") or ES_CONFIG.get("username")
  108 + es_password = os.getenv("ES_PASSWORD") or ES_CONFIG.get("password")
  109 + if es_username and es_password:
  110 + es_client = ESClient(hosts=[args.es_host], username=es_username, password=es_password)
  111 + else:
  112 + es_client = ESClient(hosts=[args.es_host])
106 if not es_client.ping(): 113 if not es_client.ping():
107 print(f"ERROR: Cannot connect to Elasticsearch at {args.es_host}") 114 print(f"ERROR: Cannot connect to Elasticsearch at {args.es_host}")
108 return 1 115 return 1
109 116
  117 + # Build DB config directly from environment to avoid dotenv dependency
  118 + db_host = os.getenv("DB_HOST")
  119 + db_port = int(os.getenv("DB_PORT", "3306"))
  120 + db_name = os.getenv("DB_DATABASE")
  121 + db_user = os.getenv("DB_USERNAME")
  122 + db_pass = os.getenv("DB_PASSWORD")
  123 + if not all([db_host, db_name, db_user, db_pass]):
  124 + print("ERROR: DB_HOST/DB_PORT/DB_DATABASE/DB_USERNAME/DB_PASSWORD must be set in environment")
  125 + return 1
  126 +
110 db_engine = create_db_connection( 127 db_engine = create_db_connection(
111 - host=DB_CONFIG["host"],  
112 - port=DB_CONFIG["port"],  
113 - database=DB_CONFIG["database"],  
114 - username=DB_CONFIG["username"],  
115 - password=DB_CONFIG["password"], 128 + host=db_host,
  129 + port=db_port,
  130 + database=db_name,
  131 + username=db_user,
  132 + password=db_pass,
116 ) 133 )
117 builder = SuggestionIndexBuilder(es_client=es_client, db_engine=db_engine) 134 builder = SuggestionIndexBuilder(es_client=es_client, db_engine=db_engine)
118 result = builder.rebuild_tenant_index( 135 result = builder.rebuild_tenant_index(
scripts/build_suggestions.sh 0 → 100755
@@ -0,0 +1,27 @@ @@ -0,0 +1,27 @@
#!/usr/bin/env bash
#
# Rebuild the suggestion index for one tenant by delegating to
# `python main.py build-suggestions`.
#
# Usage:
#   ./scripts/build_suggestions.sh <tenant_id> [--days 30] [--batch-size 500] [--min-query-len 1] [--es-host http://localhost:9200]
#
# The first argument is the tenant id; every argument after it is forwarded
# unchanged to the build-suggestions sub-command.
#

set -euo pipefail

# Print the short usage text (to stdout).
usage() {
  echo "Usage: $0 <tenant_id> [extra args...]"
  echo "Example: $0 162 --days 30 --recreate"
}

if (( $# < 1 )); then
  usage
  exit 1
fi

TENANT_ID="$1"
# At least one argument is guaranteed here, so shift cannot fail.
shift

# Repository root = parent directory of the directory holding this script.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"

# main.py is addressed by a relative path, so run from the repository root.
cd "$ROOT_DIR"

python main.py build-suggestions --tenant-id "$TENANT_ID" "$@"

tests/test_suggestions.py 0 → 100644
@@ -0,0 +1,183 @@ @@ -0,0 +1,183 @@
  1 +import json
  2 +from typing import Any, Dict, List
  3 +
  4 +import pytest
  5 +
  6 +from suggestion.builder import SuggestionIndexBuilder
  7 +from suggestion.service import SuggestionService
  8 +
  9 +
class FakeESClient:
    """Minimal in-memory fake of the ES client for SuggestionService tests.

    ``search`` answers with canned hits chosen by index-name substring, and
    ``search``, ``bulk_index`` and ``create_index`` append a record of the
    call to ``self.calls`` so tests can inspect what was requested.
    """

    def __init__(self) -> None:
        # Chronological log of the recorded calls.
        self.calls: List[Dict[str, Any]] = []

    @staticmethod
    def _envelope(max_score: float, hits: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Wrap *hits* in the ``{"hits": {...}}`` response shape."""
        return {
            "hits": {
                "total": {"value": len(hits)},
                "max_score": max_score,
                "hits": hits,
            }
        }

    @staticmethod
    def _suggestion_hit() -> Dict[str, Any]:
        """Return the single canned hit served for suggestion indices."""
        source = {
            "text": "iphone 15",
            "lang": "en",
            "rank_score": 5.0,
            "sources": ["query_log", "qanchor"],
            "lang_source": "log_field",
            "lang_confidence": 1.0,
            "lang_conflict": False,
            "top_spu_ids": ["12345"],
        }
        return {"_id": "1", "_score": 3.2, "_source": source}

    @staticmethod
    def _product_hit() -> Dict[str, Any]:
        """Return the single canned hit served for product indices."""
        source = {
            "spu_id": "12345",
            "title": {"en": "iPhone 15 Pro Max"},
            "min_price": 999.0,
            "image_url": "https://example.com/image.jpg",
            "sales": 100,
            "total_inventory": 50,
        }
        return {"_id": "12345", "_score": 2.5, "_source": source}

    def search(self, index_name: str, body: Dict[str, Any], size: int = 10, from_: int = 0) -> Dict[str, Any]:
        """Record the call and return a canned response chosen by index name.

        Suggestion indices are checked first, then product indices; any other
        index name yields an empty result.
        """
        record = {"index": index_name, "body": body, "size": size, "from": from_}
        self.calls.append(record)

        if "search_suggestions_tenant_" in index_name:
            return self._envelope(3.2, [self._suggestion_hit()])
        if "search_products_tenant_" in index_name:
            return self._envelope(2.5, [self._product_hit()])
        return self._envelope(0.0, [])

    # Kept for compatibility with builder.bulk_index in full runs
    # (not used in these unit tests).
    def bulk_index(self, index_name: str, docs: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Record the bulk request and report every document as indexed."""
        self.calls.append({"index": index_name, "bulk": True, "docs": docs})
        indexed = len(docs)
        return {"success": indexed, "failed": 0, "errors": []}

    def index_exists(self, index_name: str) -> bool:
        """Always report that the index does not exist."""
        return False

    def delete_index(self, index_name: str) -> bool:
        """Always report a successful delete; nothing is recorded."""
        return True

    def create_index(self, index_name: str, body: Dict[str, Any]) -> bool:
        """Record the create request and report success."""
        self.calls.append({"index": index_name, "create": True, "body": body})
        return True

    def refresh(self, index_name: str) -> bool:
        """Always report a successful refresh; nothing is recorded."""
        return True
  83 +
  84 +
@pytest.mark.unit
def test_resolve_query_language_prefers_log_field(monkeypatch):
    """builder._resolve_query_language should prefer the log row's language field."""
    builder = SuggestionIndexBuilder(es_client=FakeESClient(), db_engine=None)

    # index_languages contains both en and zh, and the primary language is zh;
    # the log row itself says en.
    resolution = builder._resolve_query_language(
        query="iphone 15",
        log_language="en",
        request_params=None,
        index_languages=["zh", "en"],
        primary_language="zh",
    )
    resolved_lang, confidence, origin, has_conflict = resolution

    assert resolved_lang == "en"
    assert confidence == 1.0
    assert origin == "log_field"
    assert has_conflict is False
  102 +
  103 +
@pytest.mark.unit
def test_resolve_query_language_uses_request_params_when_log_missing():
    """With no log language, the language should be read from request_params.language."""
    builder = SuggestionIndexBuilder(es_client=FakeESClient(), db_engine=None)

    # request_params is passed as a JSON string carrying the language.
    raw_params = json.dumps({"language": "zh"})
    resolution = builder._resolve_query_language(
        query="芭比娃娃",
        log_language=None,
        request_params=raw_params,
        index_languages=["zh", "en"],
        primary_language="en",
    )
    resolved_lang, confidence, origin, has_conflict = resolution

    assert resolved_lang == "zh"
    assert confidence == 1.0
    assert origin == "request_params"
    assert has_conflict is False
  121 +
  122 +
@pytest.mark.unit
def test_resolve_query_language_fallback_to_primary():
    """With no language hint at all, the tenant's primary_language should be used."""
    builder = SuggestionIndexBuilder(es_client=FakeESClient(), db_engine=None)

    # Neither the log row nor request_params carries a language.
    resolution = builder._resolve_query_language(
        query="some text",
        log_language=None,
        request_params=None,
        index_languages=["zh", "en"],
        primary_language="zh",
    )
    # The confidence value is deliberately not asserted in this case.
    resolved_lang, _confidence, origin, has_conflict = resolution

    assert resolved_lang == "zh"
    assert origin == "default"
    assert has_conflict is False
  138 +
  139 +
@pytest.mark.unit
def test_suggestion_service_basic_flow(monkeypatch):
    """
    SuggestionService.search should query ES and return suggestions with products.

    A FakeESClient stands in for Elasticsearch, so no real cluster is needed.
    """
    # Override the tenant config so the test does not depend on changes to the
    # external config.yaml.
    from config import tenant_config_loader as tcl

    loader = tcl.get_tenant_config_loader()
    # Replace the loader's cached config through monkeypatch rather than by
    # plain assignment: the override is reverted on teardown, so it cannot
    # leak into other tests that obtain the same loader object.
    # raising=False keeps this working even if `_config` is not set yet.
    # NOTE(review): presumably the loader is a shared, process-wide instance.
    monkeypatch.setattr(
        loader,
        "_config",
        {
            "default": {"primary_language": "en", "index_languages": ["en", "zh"]},
            "tenants": {
                "1": {"primary_language": "en", "index_languages": ["en", "zh"]},
            },
        },
        raising=False,
    )

    fake_es = FakeESClient()
    service = SuggestionService(es_client=fake_es)
    result = service.search(
        tenant_id="1",
        query="iph",
        language="en",
        size=5,
        with_results=True,
        result_size=2,
    )

    # Top-level response fields.
    assert result["resolved_language"] == "en"
    assert result["query"] == "iph"
    assert result["took_ms"] >= 0

    # Exactly the one suggestion served by the fake suggestion index.
    suggestions = result["suggestions"]
    assert len(suggestions) == 1
    s0 = suggestions[0]
    assert s0["text"] == "iphone 15"
    assert s0["lang"] == "en"

    # The suggestion must carry at least one product.
    assert isinstance(s0.get("products"), list)
    assert len(s0["products"]) >= 1
    p0 = s0["products"][0]
    assert p0["spu_id"] == "12345"
    assert "title" in p0
    assert "price" in p0
  182 + assert "price" in p0
  183 +