Commit f251cf2d502df5c8e9da2b00757eee2444694e21
1 parent
ded6f29e
suggestion全量索引程序跑通
Showing
6 changed files
with
310 additions
and
21 deletions
Show diff stats
docs/Usage-Guide.md
| ... | ... | @@ -343,6 +343,49 @@ curl -X POST http://localhost:6002/search/image \ |
| 343 | 343 | |
| 344 | 344 | --- |
| 345 | 345 | |
| 346 | +## 8. Suggestion 索引与接口使用 | |
| 347 | + | |
| 348 | +### 8.1 构建 Suggestion 索引(全量) | |
| 349 | + | |
| 350 | +Suggestion 索引会从: | |
| 351 | + | |
| 352 | +- ES 商品索引:`title.{lang}`, `qanchors.{lang}` | |
| 353 | +- MySQL 日志表:`shoplazza_search_log.query`(含 `language`、`request_params`) | |
| 354 | + | |
| 355 | +聚合生成 `search_suggestions_tenant_{tenant_id}`。 | |
| 356 | + | |
| 357 | +在项目根目录执行: | |
| 358 | + | |
| 359 | +```bash | |
| 360 | +# 为指定租户全量重建 suggestion 索引(会删除旧索引) | |
| 361 | +python main.py build-suggestions \ | |
| 362 | + --tenant-id 162 \ | |
| 363 | + --es-host http://localhost:9200 \ | |
| 364 | + --days 30 \ | |
| 365 | + --recreate | |
| 366 | +``` | |
| 367 | + | |
| 368 | +可选参数: | |
| 369 | + | |
| 370 | +- `--days`:回溯日志天数(默认 30) | |
| 371 | +- `--batch-size`:扫描商品索引的批大小(默认 500) | |
| 372 | +- `--min-query-len`:参与 suggestion 的最小查询长度(默认 1) | |
| 373 | + | |
| 374 | +> 建议在商品索引构建完成、日志正常写入一段时间后执行一次全量构建,然后按天/小时增加增量构建任务。 | |
| 375 | + | |
| 376 | +### 8.2 调用 Suggestion 接口 | |
| 377 | + | |
| 378 | +全量构建完成后,可直接通过 `/search/suggestions` 获取自动补全结果: | |
| 379 | + | |
| 380 | +```bash | |
| 381 | +curl "http://localhost:6002/search/suggestions?q=iph&size=5&language=en&with_results=true" \ | |
| 382 | + -H "X-Tenant-ID: 162" | |
| 383 | +``` | |
| 384 | + | |
| 385 | +接口返回结构详见 `docs/搜索API对接指南.md` 的“3.7 搜索建议接口”章节。 | |
| 386 | + | |
| 387 | +--- | |
| 388 | + | |
| 346 | 389 | ## 常见问题 |
| 347 | 390 | |
| 348 | 391 | ### Q1: MySQL连接失败 | ... | ... |
docs/搜索API对接指南.md
| ... | ... | @@ -558,40 +558,59 @@ response = requests.post(url, headers=headers, json={"query": "芭比娃娃"}) |
| 558 | 558 | |
| 559 | 559 | ### 3.7 搜索建议接口 |
| 560 | 560 | |
| 561 | -> ⚠️ **TODO**: 此接口当前为框架实现,功能暂未实现,仅返回空结果。接口和响应格式已经固定,可平滑扩展。 | |
| 562 | - | |
| 563 | 561 | - **端点**: `GET /search/suggestions` |
| 564 | -- **描述**: 返回搜索建议(自动补全/热词)。当前为框架实现,接口和响应格式已经固定,可平滑扩展。 | |
| 562 | +- **描述**: 返回搜索建议(自动补全/热词),支持多语言与“结果直达”(每条 suggestion 附带商品列表)。 | |
| 565 | 563 | |
| 566 | 564 | #### 查询参数 |
| 567 | 565 | |
| 568 | 566 | | 参数 | 类型 | 必填 | 默认值 | 描述 | |
| 569 | 567 | |------|------|------|--------|------| |
| 570 | 568 | | `q` | string | Y | - | 查询字符串(至少 1 个字符) | |
| 571 | -| `size` | integer | N | 5 | 返回建议数量(1-20) | | |
| 572 | -| `types` | string | N | `query` | 建议类型(逗号分隔):`query`, `product`, `category`, `brand` | | |
| 569 | +| `size` | integer | N | 10 | 返回建议数量(1-20) | | |
| 570 | +| `language` | string | N | `en` | 请求语言,如 `zh` / `en` / `ar` / `ru`,用于路由到对应语种 suggestion 索引 | | |
| 571 | +| `with_results` | bool | N | `true` | 是否为每条 suggestion 返回商品列表(结果直达) | | |
| 572 | +| `result_size` | integer | N | 3 | 每条 suggestion 返回的商品数量(1-10) | | |
| 573 | +| `debug` | bool | N | `false` | 是否开启调试(目前主要用于排查 suggestion 排序与语言解析) | | |
| 574 | + | |
| 575 | +> **租户标识**:同 [3.1](#31-接口信息),通过请求头 `X-Tenant-ID` 或 query 参数 `tenant_id` 传递。 | |
| 573 | 576 | |
| 574 | 577 | #### 响应示例 |
| 575 | 578 | |
| 576 | 579 | ```json |
| 577 | 580 | { |
| 578 | - "query": "芭", | |
| 581 | + "query": "iph", | |
| 582 | + "language": "en", | |
| 583 | + "resolved_language": "en", | |
| 579 | 584 | "suggestions": [ |
| 580 | 585 | { |
| 581 | - "text": "芭比娃娃", | |
| 582 | - "type": "query", | |
| 583 | - "highlight": "<em>芭</em>比娃娃", | |
| 584 | - "popularity": 850 | |
| 586 | + "text": "iphone 15", | |
| 587 | + "lang": "en", | |
| 588 | + "score": 12.37, | |
| 589 | + "rank_score": 5.1, | |
| 590 | + "sources": ["query_log", "qanchor"], | |
| 591 | + "lang_source": "log_field", | |
| 592 | + "lang_confidence": 1.0, | |
| 593 | + "lang_conflict": false, | |
| 594 | + "products": [ | |
| 595 | + { | |
| 596 | + "spu_id": "12345", | |
| 597 | + "title": "iPhone 15 Pro Max", | |
| 598 | + "price": 999.0, | |
| 599 | + "image_url": "https://example.com/image.jpg", | |
| 600 | + "score": 3.21 | |
| 601 | + } | |
| 602 | + ] | |
| 585 | 603 | } |
| 586 | 604 | ], |
| 587 | - "took_ms": 5 | |
| 605 | + "took_ms": 12 | |
| 588 | 606 | } |
| 589 | 607 | ``` |
| 590 | 608 | |
| 591 | 609 | #### 请求示例 |
| 592 | 610 | |
| 593 | 611 | ```bash |
| 594 | -curl "http://localhost:6002/search/suggestions?q=芭&size=5&types=query,product" | |
| 612 | +curl "http://localhost:6002/search/suggestions?q=芭&size=5&language=zh&with_results=true" \ | |
| 613 | + -H "X-Tenant-ID: 162" | |
| 595 | 614 | ``` |
| 596 | 615 | |
| 597 | 616 | ### 3.8 即时搜索接口 | ... | ... |
docs/搜索API速查表.md
main.py
| ... | ... | @@ -12,13 +12,12 @@ import sys |
| 12 | 12 | import os |
| 13 | 13 | import argparse |
| 14 | 14 | import json |
| 15 | -import uvicorn | |
| 16 | 15 | |
| 17 | 16 | # Add parent directory to path |
| 18 | 17 | sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) |
| 19 | 18 | |
| 20 | 19 | from config import ConfigLoader |
| 21 | -from config.env_config import DB_CONFIG | |
| 20 | +from config.env_config import ES_CONFIG | |
| 22 | 21 | from utils import ESClient |
| 23 | 22 | from search import Searcher |
| 24 | 23 | from suggestion import SuggestionIndexBuilder |
| ... | ... | @@ -27,6 +26,7 @@ from utils.db_connector import create_db_connection |
| 27 | 26 | |
| 28 | 27 | def cmd_serve(args): |
| 29 | 28 | """Start API service.""" |
| 29 | + import uvicorn | |
| 30 | 30 | os.environ['ES_HOST'] = args.es_host |
| 31 | 31 | |
| 32 | 32 | print("Starting API service (multi-tenant)...") |
| ... | ... | @@ -43,6 +43,7 @@ def cmd_serve(args): |
| 43 | 43 | |
| 44 | 44 | def cmd_serve_indexer(args): |
| 45 | 45 | """Start dedicated Indexer API service (no search endpoints).""" |
| 46 | + import uvicorn | |
| 46 | 47 | os.environ['ES_HOST'] = args.es_host |
| 47 | 48 | |
| 48 | 49 | print("Starting Indexer API service...") |
| ... | ... | @@ -102,17 +103,33 @@ def cmd_search(args): |
| 102 | 103 | |
| 103 | 104 | def cmd_build_suggestions(args): |
| 104 | 105 | """Build suggestion index for a tenant.""" |
| 105 | - es_client = ESClient(hosts=[args.es_host]) | |
| 106 | + # Initialize ES client with optional authentication | |
| 107 | + es_username = os.getenv("ES_USERNAME") or ES_CONFIG.get("username") | |
| 108 | + es_password = os.getenv("ES_PASSWORD") or ES_CONFIG.get("password") | |
| 109 | + if es_username and es_password: | |
| 110 | + es_client = ESClient(hosts=[args.es_host], username=es_username, password=es_password) | |
| 111 | + else: | |
| 112 | + es_client = ESClient(hosts=[args.es_host]) | |
| 106 | 113 | if not es_client.ping(): |
| 107 | 114 | print(f"ERROR: Cannot connect to Elasticsearch at {args.es_host}") |
| 108 | 115 | return 1 |
| 109 | 116 | |
| 117 | + # Build DB config directly from environment to avoid dotenv dependency | |
| 118 | + db_host = os.getenv("DB_HOST") | |
| 119 | + db_port = int(os.getenv("DB_PORT", "3306")) | |
| 120 | + db_name = os.getenv("DB_DATABASE") | |
| 121 | + db_user = os.getenv("DB_USERNAME") | |
| 122 | + db_pass = os.getenv("DB_PASSWORD") | |
| 123 | + if not all([db_host, db_name, db_user, db_pass]): | |
| 124 | + print("ERROR: DB_HOST/DB_PORT/DB_DATABASE/DB_USERNAME/DB_PASSWORD must be set in environment") | |
| 125 | + return 1 | |
| 126 | + | |
| 110 | 127 | db_engine = create_db_connection( |
| 111 | - host=DB_CONFIG["host"], | |
| 112 | - port=DB_CONFIG["port"], | |
| 113 | - database=DB_CONFIG["database"], | |
| 114 | - username=DB_CONFIG["username"], | |
| 115 | - password=DB_CONFIG["password"], | |
| 128 | + host=db_host, | |
| 129 | + port=db_port, | |
| 130 | + database=db_name, | |
| 131 | + username=db_user, | |
| 132 | + password=db_pass, | |
| 116 | 133 | ) |
| 117 | 134 | builder = SuggestionIndexBuilder(es_client=es_client, db_engine=db_engine) |
| 118 | 135 | result = builder.rebuild_tenant_index( | ... | ... |
| ... | ... | @@ -0,0 +1,27 @@ |
| 1 | +#!/usr/bin/env bash | |
| 2 | +# | |
| 3 | +# Convenience script to rebuild suggestion index for a tenant. | |
| 4 | +# | |
| 5 | +# Usage: | |
| 6 | +# ./scripts/build_suggestions.sh <tenant_id> [--days 30] [--batch-size 500] [--min-query-len 1] [--es-host http://localhost:9200] [--recreate] | |
| 7 | +# | |
| 8 | + | |
| 9 | +set -euo pipefail | |
| 10 | + | |
| 11 | +if [ $# -lt 1 ]; then | |
| 12 | + echo "Usage: $0 <tenant_id> [extra args...]" | |
| 13 | + echo "Example: $0 162 --days 30 --recreate" | |
| 14 | + exit 1 | |
| 15 | +fi | |
| 16 | + | |
| 17 | +TENANT_ID="$1" | |
| 18 | +shift || true | |
| 19 | + | |
| 20 | +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" | |
| 21 | + | |
| 22 | +cd "$ROOT_DIR" | |
| 23 | + | |
| 24 | +python main.py build-suggestions \ | |
| 25 | + --tenant-id "$TENANT_ID" \ | |
| 26 | + "$@" | |
| 27 | + | ... | ... |
| ... | ... | @@ -0,0 +1,183 @@ |
| 1 | +import json | |
| 2 | +from typing import Any, Dict, List | |
| 3 | + | |
| 4 | +import pytest | |
| 5 | + | |
| 6 | +from suggestion.builder import SuggestionIndexBuilder | |
| 7 | +from suggestion.service import SuggestionService | |
| 8 | + | |
| 9 | + | |
| 10 | +class FakeESClient: | |
| 11 | + """Minimal fake ES client for SuggestionService tests.""" | |
| 12 | + | |
| 13 | + def __init__(self) -> None: | |
| 14 | + self.calls: List[Dict[str, Any]] = [] | |
| 15 | + | |
| 16 | + def search(self, index_name: str, body: Dict[str, Any], size: int = 10, from_: int = 0) -> Dict[str, Any]: | |
| 17 | + self.calls.append({"index": index_name, "body": body, "size": size, "from": from_}) | |
| 18 | + # Suggestion index | |
| 19 | + if "search_suggestions_tenant_" in index_name: | |
| 20 | + return { | |
| 21 | + "hits": { | |
| 22 | + "total": {"value": 1}, | |
| 23 | + "max_score": 3.2, | |
| 24 | + "hits": [ | |
| 25 | + { | |
| 26 | + "_id": "1", | |
| 27 | + "_score": 3.2, | |
| 28 | + "_source": { | |
| 29 | + "text": "iphone 15", | |
| 30 | + "lang": "en", | |
| 31 | + "rank_score": 5.0, | |
| 32 | + "sources": ["query_log", "qanchor"], | |
| 33 | + "lang_source": "log_field", | |
| 34 | + "lang_confidence": 1.0, | |
| 35 | + "lang_conflict": False, | |
| 36 | + "top_spu_ids": ["12345"], | |
| 37 | + }, | |
| 38 | + } | |
| 39 | + ], | |
| 40 | + } | |
| 41 | + } | |
| 42 | + # Product index | |
| 43 | + if "search_products_tenant_" in index_name: | |
| 44 | + return { | |
| 45 | + "hits": { | |
| 46 | + "total": {"value": 1}, | |
| 47 | + "max_score": 2.5, | |
| 48 | + "hits": [ | |
| 49 | + { | |
| 50 | + "_id": "12345", | |
| 51 | + "_score": 2.5, | |
| 52 | + "_source": { | |
| 53 | + "spu_id": "12345", | |
| 54 | + "title": {"en": "iPhone 15 Pro Max"}, | |
| 55 | + "min_price": 999.0, | |
| 56 | + "image_url": "https://example.com/image.jpg", | |
| 57 | + "sales": 100, | |
| 58 | + "total_inventory": 50, | |
| 59 | + }, | |
| 60 | + } | |
| 61 | + ], | |
| 62 | + } | |
| 63 | + } | |
| 64 | + return {"hits": {"total": {"value": 0}, "max_score": 0.0, "hits": []}} | |
| 65 | + | |
| 66 | + # For builder.bulk_index usage compatibility in full runs (not used in these unit tests) | |
| 67 | + def bulk_index(self, index_name: str, docs: List[Dict[str, Any]]) -> Dict[str, Any]: | |
| 68 | + self.calls.append({"index": index_name, "bulk": True, "docs": docs}) | |
| 69 | + return {"success": len(docs), "failed": 0, "errors": []} | |
| 70 | + | |
| 71 | + def index_exists(self, index_name: str) -> bool: | |
| 72 | + return False | |
| 73 | + | |
| 74 | + def delete_index(self, index_name: str) -> bool: | |
| 75 | + return True | |
| 76 | + | |
| 77 | + def create_index(self, index_name: str, body: Dict[str, Any]) -> bool: | |
| 78 | + self.calls.append({"index": index_name, "create": True, "body": body}) | |
| 79 | + return True | |
| 80 | + | |
| 81 | + def refresh(self, index_name: str) -> bool: | |
| 82 | + return True | |
| 83 | + | |
| 84 | + | |
| 85 | +@pytest.mark.unit | |
| 86 | +def test_resolve_query_language_prefers_log_field(monkeypatch): | |
| 87 | + """builder._resolve_query_language 应优先使用日志 language 字段。""" | |
| 88 | + fake_es = FakeESClient() | |
| 89 | + builder = SuggestionIndexBuilder(es_client=fake_es, db_engine=None) | |
| 90 | + # index_languages 里包含 en/zh,primary 设为 zh | |
| 91 | + lang, conf, source, conflict = builder._resolve_query_language( | |
| 92 | + query="iphone 15", | |
| 93 | + log_language="en", | |
| 94 | + request_params=None, | |
| 95 | + index_languages=["zh", "en"], | |
| 96 | + primary_language="zh", | |
| 97 | + ) | |
| 98 | + assert lang == "en" | |
| 99 | + assert conf == 1.0 | |
| 100 | + assert source == "log_field" | |
| 101 | + assert conflict is False | |
| 102 | + | |
| 103 | + | |
| 104 | +@pytest.mark.unit | |
| 105 | +def test_resolve_query_language_uses_request_params_when_log_missing(): | |
| 106 | + """当日志 language 为空时,应从 request_params.language 解析。""" | |
| 107 | + fake_es = FakeESClient() | |
| 108 | + builder = SuggestionIndexBuilder(es_client=fake_es, db_engine=None) | |
| 109 | + request_params = json.dumps({"language": "zh"}) | |
| 110 | + lang, conf, source, conflict = builder._resolve_query_language( | |
| 111 | + query="芭比娃娃", | |
| 112 | + log_language=None, | |
| 113 | + request_params=request_params, | |
| 114 | + index_languages=["zh", "en"], | |
| 115 | + primary_language="en", | |
| 116 | + ) | |
| 117 | + assert lang == "zh" | |
| 118 | + assert conf == 1.0 | |
| 119 | + assert source == "request_params" | |
| 120 | + assert conflict is False | |
| 121 | + | |
| 122 | + | |
| 123 | +@pytest.mark.unit | |
| 124 | +def test_resolve_query_language_fallback_to_primary(): | |
| 125 | + """当无任何语言线索时,应回落到租户 primary_language。""" | |
| 126 | + fake_es = FakeESClient() | |
| 127 | + builder = SuggestionIndexBuilder(es_client=fake_es, db_engine=None) | |
| 128 | + lang, conf, source, conflict = builder._resolve_query_language( | |
| 129 | + query="some text", | |
| 130 | + log_language=None, | |
| 131 | + request_params=None, | |
| 132 | + index_languages=["zh", "en"], | |
| 133 | + primary_language="zh", | |
| 134 | + ) | |
| 135 | + assert lang == "zh" | |
| 136 | + assert source == "default" | |
| 137 | + assert conflict is False | |
| 138 | + | |
| 139 | + | |
| 140 | +@pytest.mark.unit | |
| 141 | +def test_suggestion_service_basic_flow(monkeypatch): | |
| 142 | + """ | |
| 143 | + SuggestionService.search 应正确调用 ES 并返回 suggestion + products。 | |
| 144 | + 使用 FakeESClient 避免真实 ES 依赖。 | |
| 145 | + """ | |
| 146 | + # 覆盖 tenant_config_loader 以避免依赖外部 config.yaml 改动 | |
| 147 | + from config import tenant_config_loader as tcl | |
| 148 | + | |
| 149 | + loader = tcl.get_tenant_config_loader() | |
| 150 | + # 强制覆盖内部缓存配置 | |
| 151 | + loader._config = { | |
| 152 | + "default": {"primary_language": "en", "index_languages": ["en", "zh"]}, | |
| 153 | + "tenants": { | |
| 154 | + "1": {"primary_language": "en", "index_languages": ["en", "zh"]}, | |
| 155 | + }, | |
| 156 | + } | |
| 157 | + | |
| 158 | + fake_es = FakeESClient() | |
| 159 | + service = SuggestionService(es_client=fake_es) | |
| 160 | + result = service.search( | |
| 161 | + tenant_id="1", | |
| 162 | + query="iph", | |
| 163 | + language="en", | |
| 164 | + size=5, | |
| 165 | + with_results=True, | |
| 166 | + result_size=2, | |
| 167 | + ) | |
| 168 | + | |
| 169 | + assert result["resolved_language"] == "en" | |
| 170 | + assert result["query"] == "iph" | |
| 171 | + assert result["took_ms"] >= 0 | |
| 172 | + suggestions = result["suggestions"] | |
| 173 | + assert len(suggestions) == 1 | |
| 174 | + s0 = suggestions[0] | |
| 175 | + assert s0["text"] == "iphone 15" | |
| 176 | + assert s0["lang"] == "en" | |
| 177 | + assert isinstance(s0.get("products"), list) | |
| 178 | + assert len(s0["products"]) >= 1 | |
| 179 | + p0 = s0["products"][0] | |
| 180 | + assert p0["spu_id"] == "12345" | |
| 181 | + assert "title" in p0 | |
| 182 | + assert "price" in p0 | |
| 183 | + | ... | ... |