Commit f251cf2d502df5c8e9da2b00757eee2444694e21
1 parent
ded6f29e
suggestion全量索引程序跑通
Showing
6 changed files
with
310 additions
and
21 deletions
Show diff stats
docs/Usage-Guide.md
| @@ -343,6 +343,49 @@ curl -X POST http://localhost:6002/search/image \ | @@ -343,6 +343,49 @@ curl -X POST http://localhost:6002/search/image \ | ||
| 343 | 343 | ||
| 344 | --- | 344 | --- |
| 345 | 345 | ||
| 346 | +## 8. Suggestion 索引与接口使用 | ||
| 347 | + | ||
| 348 | +### 8.1 构建 Suggestion 索引(全量) | ||
| 349 | + | ||
| 350 | +Suggestion 索引会从: | ||
| 351 | + | ||
| 352 | +- ES 商品索引:`title.{lang}`, `qanchors.{lang}` | ||
| 353 | +- MySQL 日志表:`shoplazza_search_log.query`(含 `language`、`request_params`) | ||
| 354 | + | ||
| 355 | +聚合生成 `search_suggestions_tenant_{tenant_id}`。 | ||
| 356 | + | ||
| 357 | +在项目根目录执行: | ||
| 358 | + | ||
| 359 | +```bash | ||
| 360 | +# 为指定租户全量重建 suggestion 索引(会删除旧索引) | ||
| 361 | +python main.py build-suggestions \ | ||
| 362 | + --tenant-id 162 \ | ||
| 363 | + --es-host http://localhost:9200 \ | ||
| 364 | + --days 30 \ | ||
| 365 | + --recreate | ||
| 366 | +``` | ||
| 367 | + | ||
| 368 | +可选参数: | ||
| 369 | + | ||
| 370 | +- `--days`:回溯日志天数(默认 30) | ||
| 371 | +- `--batch-size`:扫描商品索引的批大小(默认 500) | ||
| 372 | +- `--min-query-len`:参与 suggestion 的最小查询长度(默认 1) | ||
| 373 | + | ||
| 374 | +> 建议在商品索引构建完成、日志正常写入一段时间后执行一次全量构建,然后按天/小时增加增量构建任务。 | ||
| 375 | + | ||
| 376 | +### 8.2 调用 Suggestion 接口 | ||
| 377 | + | ||
| 378 | +全量构建完成后,可直接通过 `/search/suggestions` 获取自动补全结果: | ||
| 379 | + | ||
| 380 | +```bash | ||
| 381 | +curl "http://localhost:6002/search/suggestions?q=iph&size=5&language=en&with_results=true" \ | ||
| 382 | + -H "X-Tenant-ID: 162" | ||
| 383 | +``` | ||
| 384 | + | ||
| 385 | +接口返回结构详见 `docs/搜索API对接指南.md` 的“3.7 搜索建议接口”章节。 | ||
| 386 | + | ||
| 387 | +--- | ||
| 388 | + | ||
| 346 | ## 常见问题 | 389 | ## 常见问题 |
| 347 | 390 | ||
| 348 | ### Q1: MySQL连接失败 | 391 | ### Q1: MySQL连接失败 |
docs/搜索API对接指南.md
| @@ -558,40 +558,59 @@ response = requests.post(url, headers=headers, json={"query": "芭比娃娃"}) | @@ -558,40 +558,59 @@ response = requests.post(url, headers=headers, json={"query": "芭比娃娃"}) | ||
| 558 | 558 | ||
| 559 | ### 3.7 搜索建议接口 | 559 | ### 3.7 搜索建议接口 |
| 560 | 560 | ||
| 561 | -> ⚠️ **TODO**: 此接口当前为框架实现,功能暂未实现,仅返回空结果。接口和响应格式已经固定,可平滑扩展。 | ||
| 562 | - | ||
| 563 | - **端点**: `GET /search/suggestions` | 561 | - **端点**: `GET /search/suggestions` |
| 564 | -- **描述**: 返回搜索建议(自动补全/热词)。当前为框架实现,接口和响应格式已经固定,可平滑扩展。 | 562 | +- **描述**: 返回搜索建议(自动补全/热词),支持多语言与“结果直达”(每条 suggestion 附带商品列表)。 |
| 565 | 563 | ||
| 566 | #### 查询参数 | 564 | #### 查询参数 |
| 567 | 565 | ||
| 568 | | 参数 | 类型 | 必填 | 默认值 | 描述 | | 566 | | 参数 | 类型 | 必填 | 默认值 | 描述 | |
| 569 | |------|------|------|--------|------| | 567 | |------|------|------|--------|------| |
| 570 | | `q` | string | Y | - | 查询字符串(至少 1 个字符) | | 568 | | `q` | string | Y | - | 查询字符串(至少 1 个字符) | |
| 571 | -| `size` | integer | N | 5 | 返回建议数量(1-20) | | ||
| 572 | -| `types` | string | N | `query` | 建议类型(逗号分隔):`query`, `product`, `category`, `brand` | | 569 | +| `size` | integer | N | 10 | 返回建议数量(1-20) | |
| 570 | +| `language` | string | N | `en` | 请求语言,如 `zh` / `en` / `ar` / `ru`,用于路由到对应语种 suggestion 索引 | | ||
| 571 | +| `with_results` | bool | N | `true` | 是否为每条 suggestion 返回商品列表(结果直达) | | ||
| 572 | +| `result_size` | integer | N | 3 | 每条 suggestion 返回的商品数量(1-10) | | ||
| 573 | +| `debug` | bool | N | `false` | 是否开启调试(目前主要用于排查 suggestion 排序与语言解析) | | ||
| 574 | + | ||
| 575 | +> **租户标识**:同 [3.1](#31-接口信息),通过请求头 `X-Tenant-ID` 或 query 参数 `tenant_id` 传递。 | ||
| 573 | 576 | ||
| 574 | #### 响应示例 | 577 | #### 响应示例 |
| 575 | 578 | ||
| 576 | ```json | 579 | ```json |
| 577 | { | 580 | { |
| 578 | - "query": "芭", | 581 | + "query": "iph", |
| 582 | + "language": "en", | ||
| 583 | + "resolved_language": "en", | ||
| 579 | "suggestions": [ | 584 | "suggestions": [ |
| 580 | { | 585 | { |
| 581 | - "text": "芭比娃娃", | ||
| 582 | - "type": "query", | ||
| 583 | - "highlight": "<em>芭</em>比娃娃", | ||
| 584 | - "popularity": 850 | 586 | + "text": "iphone 15", |
| 587 | + "lang": "en", | ||
| 588 | + "score": 12.37, | ||
| 589 | + "rank_score": 5.1, | ||
| 590 | + "sources": ["query_log", "qanchor"], | ||
| 591 | + "lang_source": "log_field", | ||
| 592 | + "lang_confidence": 1.0, | ||
| 593 | + "lang_conflict": false, | ||
| 594 | + "products": [ | ||
| 595 | + { | ||
| 596 | + "spu_id": "12345", | ||
| 597 | + "title": "iPhone 15 Pro Max", | ||
| 598 | + "price": 999.0, | ||
| 599 | + "image_url": "https://example.com/image.jpg", | ||
| 600 | + "score": 3.21 | ||
| 601 | + } | ||
| 602 | + ] | ||
| 585 | } | 603 | } |
| 586 | ], | 604 | ], |
| 587 | - "took_ms": 5 | 605 | + "took_ms": 12 |
| 588 | } | 606 | } |
| 589 | ``` | 607 | ``` |
| 590 | 608 | ||
| 591 | #### 请求示例 | 609 | #### 请求示例 |
| 592 | 610 | ||
| 593 | ```bash | 611 | ```bash |
| 594 | -curl "http://localhost:6002/search/suggestions?q=芭&size=5&types=query,product" | 612 | +curl "http://localhost:6002/search/suggestions?q=芭&size=5&language=zh&with_results=true" \ |
| 613 | + -H "X-Tenant-ID: 162" | ||
| 595 | ``` | 614 | ``` |
| 596 | 615 | ||
| 597 | ### 3.8 即时搜索接口 | 616 | ### 3.8 即时搜索接口 |
docs/搜索API速查表.md
| @@ -294,7 +294,7 @@ POST /search/image | @@ -294,7 +294,7 @@ POST /search/image | ||
| 294 | "size": 20 | 294 | "size": 20 |
| 295 | } | 295 | } |
| 296 | 296 | ||
| 297 | -GET /search/suggestions?q=芭&size=5 | 297 | +GET /search/suggestions?q=芭&size=5&language=zh&with_results=true |
| 298 | 298 | ||
| 299 | GET /search/instant?q=玩具&size=5 | 299 | GET /search/instant?q=玩具&size=5 |
| 300 | 300 |
main.py
| @@ -12,13 +12,12 @@ import sys | @@ -12,13 +12,12 @@ import sys | ||
| 12 | import os | 12 | import os |
| 13 | import argparse | 13 | import argparse |
| 14 | import json | 14 | import json |
| 15 | -import uvicorn | ||
| 16 | 15 | ||
| 17 | # Add parent directory to path | 16 | # Add parent directory to path |
| 18 | sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) | 17 | sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) |
| 19 | 18 | ||
| 20 | from config import ConfigLoader | 19 | from config import ConfigLoader |
| 21 | -from config.env_config import DB_CONFIG | 20 | +from config.env_config import ES_CONFIG |
| 22 | from utils import ESClient | 21 | from utils import ESClient |
| 23 | from search import Searcher | 22 | from search import Searcher |
| 24 | from suggestion import SuggestionIndexBuilder | 23 | from suggestion import SuggestionIndexBuilder |
| @@ -27,6 +26,7 @@ from utils.db_connector import create_db_connection | @@ -27,6 +26,7 @@ from utils.db_connector import create_db_connection | ||
| 27 | 26 | ||
| 28 | def cmd_serve(args): | 27 | def cmd_serve(args): |
| 29 | """Start API service.""" | 28 | """Start API service.""" |
| 29 | + import uvicorn | ||
| 30 | os.environ['ES_HOST'] = args.es_host | 30 | os.environ['ES_HOST'] = args.es_host |
| 31 | 31 | ||
| 32 | print("Starting API service (multi-tenant)...") | 32 | print("Starting API service (multi-tenant)...") |
| @@ -43,6 +43,7 @@ def cmd_serve(args): | @@ -43,6 +43,7 @@ def cmd_serve(args): | ||
| 43 | 43 | ||
| 44 | def cmd_serve_indexer(args): | 44 | def cmd_serve_indexer(args): |
| 45 | """Start dedicated Indexer API service (no search endpoints).""" | 45 | """Start dedicated Indexer API service (no search endpoints).""" |
| 46 | + import uvicorn | ||
| 46 | os.environ['ES_HOST'] = args.es_host | 47 | os.environ['ES_HOST'] = args.es_host |
| 47 | 48 | ||
| 48 | print("Starting Indexer API service...") | 49 | print("Starting Indexer API service...") |
| @@ -102,17 +103,33 @@ def cmd_search(args): | @@ -102,17 +103,33 @@ def cmd_search(args): | ||
| 102 | 103 | ||
| 103 | def cmd_build_suggestions(args): | 104 | def cmd_build_suggestions(args): |
| 104 | """Build suggestion index for a tenant.""" | 105 | """Build suggestion index for a tenant.""" |
| 105 | - es_client = ESClient(hosts=[args.es_host]) | 106 | + # Initialize ES client with optional authentication |
| 107 | + es_username = os.getenv("ES_USERNAME") or ES_CONFIG.get("username") | ||
| 108 | + es_password = os.getenv("ES_PASSWORD") or ES_CONFIG.get("password") | ||
| 109 | + if es_username and es_password: | ||
| 110 | + es_client = ESClient(hosts=[args.es_host], username=es_username, password=es_password) | ||
| 111 | + else: | ||
| 112 | + es_client = ESClient(hosts=[args.es_host]) | ||
| 106 | if not es_client.ping(): | 113 | if not es_client.ping(): |
| 107 | print(f"ERROR: Cannot connect to Elasticsearch at {args.es_host}") | 114 | print(f"ERROR: Cannot connect to Elasticsearch at {args.es_host}") |
| 108 | return 1 | 115 | return 1 |
| 109 | 116 | ||
| 117 | + # Build DB config directly from environment to avoid dotenv dependency | ||
| 118 | + db_host = os.getenv("DB_HOST") | ||
| 119 | + db_port = int(os.getenv("DB_PORT", "3306")) | ||
| 120 | + db_name = os.getenv("DB_DATABASE") | ||
| 121 | + db_user = os.getenv("DB_USERNAME") | ||
| 122 | + db_pass = os.getenv("DB_PASSWORD") | ||
| 123 | + if not all([db_host, db_name, db_user, db_pass]): | ||
| 124 | + print("ERROR: DB_HOST/DB_PORT/DB_DATABASE/DB_USERNAME/DB_PASSWORD must be set in environment") | ||
| 125 | + return 1 | ||
| 126 | + | ||
| 110 | db_engine = create_db_connection( | 127 | db_engine = create_db_connection( |
| 111 | - host=DB_CONFIG["host"], | ||
| 112 | - port=DB_CONFIG["port"], | ||
| 113 | - database=DB_CONFIG["database"], | ||
| 114 | - username=DB_CONFIG["username"], | ||
| 115 | - password=DB_CONFIG["password"], | 128 | + host=db_host, |
| 129 | + port=db_port, | ||
| 130 | + database=db_name, | ||
| 131 | + username=db_user, | ||
| 132 | + password=db_pass, | ||
| 116 | ) | 133 | ) |
| 117 | builder = SuggestionIndexBuilder(es_client=es_client, db_engine=db_engine) | 134 | builder = SuggestionIndexBuilder(es_client=es_client, db_engine=db_engine) |
| 118 | result = builder.rebuild_tenant_index( | 135 | result = builder.rebuild_tenant_index( |
scripts/build_suggestions.sh
| @@ -0,0 +1,27 @@ | @@ -0,0 +1,27 @@ | ||
| 1 | +#!/usr/bin/env bash | ||
| 2 | +# | ||
| 3 | +# Convenience script to rebuild suggestion index for a tenant. | ||
| 4 | +# | ||
| 5 | +# Usage: | ||
| 6 | +# ./scripts/build_suggestions.sh <tenant_id> [--days 30] [--batch-size 500] [--min-query-len 1] [--es-host http://localhost:9200] | ||
| 7 | +# | ||
| 8 | + | ||
| 9 | +set -euo pipefail | ||
| 10 | + | ||
| 11 | +if [ $# -lt 1 ]; then | ||
| 12 | + echo "Usage: $0 <tenant_id> [extra args...]" | ||
| 13 | + echo "Example: $0 162 --days 30 --recreate" | ||
| 14 | + exit 1 | ||
| 15 | +fi | ||
| 16 | + | ||
| 17 | +TENANT_ID="$1" | ||
| 18 | +shift || true | ||
| 19 | + | ||
| 20 | +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" | ||
| 21 | + | ||
| 22 | +cd "$ROOT_DIR" | ||
| 23 | + | ||
| 24 | +python main.py build-suggestions \ | ||
| 25 | + --tenant-id "$TENANT_ID" \ | ||
| 26 | + "$@" | ||
| 27 | + |
| @@ -0,0 +1,183 @@ | @@ -0,0 +1,183 @@ | ||
| 1 | +import json | ||
| 2 | +from typing import Any, Dict, List | ||
| 3 | + | ||
| 4 | +import pytest | ||
| 5 | + | ||
| 6 | +from suggestion.builder import SuggestionIndexBuilder | ||
| 7 | +from suggestion.service import SuggestionService | ||
| 8 | + | ||
| 9 | + | ||
| 10 | +class FakeESClient: | ||
| 11 | + """Minimal fake ES client for SuggestionService tests.""" | ||
| 12 | + | ||
| 13 | + def __init__(self) -> None: | ||
| 14 | + self.calls: List[Dict[str, Any]] = [] | ||
| 15 | + | ||
| 16 | + def search(self, index_name: str, body: Dict[str, Any], size: int = 10, from_: int = 0) -> Dict[str, Any]: | ||
| 17 | + self.calls.append({"index": index_name, "body": body, "size": size, "from": from_}) | ||
| 18 | + # Suggestion index | ||
| 19 | + if "search_suggestions_tenant_" in index_name: | ||
| 20 | + return { | ||
| 21 | + "hits": { | ||
| 22 | + "total": {"value": 1}, | ||
| 23 | + "max_score": 3.2, | ||
| 24 | + "hits": [ | ||
| 25 | + { | ||
| 26 | + "_id": "1", | ||
| 27 | + "_score": 3.2, | ||
| 28 | + "_source": { | ||
| 29 | + "text": "iphone 15", | ||
| 30 | + "lang": "en", | ||
| 31 | + "rank_score": 5.0, | ||
| 32 | + "sources": ["query_log", "qanchor"], | ||
| 33 | + "lang_source": "log_field", | ||
| 34 | + "lang_confidence": 1.0, | ||
| 35 | + "lang_conflict": False, | ||
| 36 | + "top_spu_ids": ["12345"], | ||
| 37 | + }, | ||
| 38 | + } | ||
| 39 | + ], | ||
| 40 | + } | ||
| 41 | + } | ||
| 42 | + # Product index | ||
| 43 | + if "search_products_tenant_" in index_name: | ||
| 44 | + return { | ||
| 45 | + "hits": { | ||
| 46 | + "total": {"value": 1}, | ||
| 47 | + "max_score": 2.5, | ||
| 48 | + "hits": [ | ||
| 49 | + { | ||
| 50 | + "_id": "12345", | ||
| 51 | + "_score": 2.5, | ||
| 52 | + "_source": { | ||
| 53 | + "spu_id": "12345", | ||
| 54 | + "title": {"en": "iPhone 15 Pro Max"}, | ||
| 55 | + "min_price": 999.0, | ||
| 56 | + "image_url": "https://example.com/image.jpg", | ||
| 57 | + "sales": 100, | ||
| 58 | + "total_inventory": 50, | ||
| 59 | + }, | ||
| 60 | + } | ||
| 61 | + ], | ||
| 62 | + } | ||
| 63 | + } | ||
| 64 | + return {"hits": {"total": {"value": 0}, "max_score": 0.0, "hits": []}} | ||
| 65 | + | ||
| 66 | + # For builder.bulk_index usage compatibility in full runs (not used in these unit tests) | ||
| 67 | + def bulk_index(self, index_name: str, docs: List[Dict[str, Any]]) -> Dict[str, Any]: | ||
| 68 | + self.calls.append({"index": index_name, "bulk": True, "docs": docs}) | ||
| 69 | + return {"success": len(docs), "failed": 0, "errors": []} | ||
| 70 | + | ||
| 71 | + def index_exists(self, index_name: str) -> bool: | ||
| 72 | + return False | ||
| 73 | + | ||
| 74 | + def delete_index(self, index_name: str) -> bool: | ||
| 75 | + return True | ||
| 76 | + | ||
| 77 | + def create_index(self, index_name: str, body: Dict[str, Any]) -> bool: | ||
| 78 | + self.calls.append({"index": index_name, "create": True, "body": body}) | ||
| 79 | + return True | ||
| 80 | + | ||
| 81 | + def refresh(self, index_name: str) -> bool: | ||
| 82 | + return True | ||
| 83 | + | ||
| 84 | + | ||
| 85 | +@pytest.mark.unit | ||
| 86 | +def test_resolve_query_language_prefers_log_field(monkeypatch): | ||
| 87 | + """builder.resolve_query_language 应优先使用日志 language 字段。""" | ||
| 88 | + fake_es = FakeESClient() | ||
| 89 | + builder = SuggestionIndexBuilder(es_client=fake_es, db_engine=None) | ||
| 90 | + # index_languages 里包含 en/zh,primary 设为 zh | ||
| 91 | + lang, conf, source, conflict = builder._resolve_query_language( | ||
| 92 | + query="iphone 15", | ||
| 93 | + log_language="en", | ||
| 94 | + request_params=None, | ||
| 95 | + index_languages=["zh", "en"], | ||
| 96 | + primary_language="zh", | ||
| 97 | + ) | ||
| 98 | + assert lang == "en" | ||
| 99 | + assert conf == 1.0 | ||
| 100 | + assert source == "log_field" | ||
| 101 | + assert conflict is False | ||
| 102 | + | ||
| 103 | + | ||
| 104 | +@pytest.mark.unit | ||
| 105 | +def test_resolve_query_language_uses_request_params_when_log_missing(): | ||
| 106 | + """当日志 language 为空时,应从 request_params.language 解析。""" | ||
| 107 | + fake_es = FakeESClient() | ||
| 108 | + builder = SuggestionIndexBuilder(es_client=fake_es, db_engine=None) | ||
| 109 | + request_params = json.dumps({"language": "zh"}) | ||
| 110 | + lang, conf, source, conflict = builder._resolve_query_language( | ||
| 111 | + query="芭比娃娃", | ||
| 112 | + log_language=None, | ||
| 113 | + request_params=request_params, | ||
| 114 | + index_languages=["zh", "en"], | ||
| 115 | + primary_language="en", | ||
| 116 | + ) | ||
| 117 | + assert lang == "zh" | ||
| 118 | + assert conf == 1.0 | ||
| 119 | + assert source == "request_params" | ||
| 120 | + assert conflict is False | ||
| 121 | + | ||
| 122 | + | ||
| 123 | +@pytest.mark.unit | ||
| 124 | +def test_resolve_query_language_fallback_to_primary(): | ||
| 125 | + """当无任何语言线索时,应回落到租户 primary_language。""" | ||
| 126 | + fake_es = FakeESClient() | ||
| 127 | + builder = SuggestionIndexBuilder(es_client=fake_es, db_engine=None) | ||
| 128 | + lang, conf, source, conflict = builder._resolve_query_language( | ||
| 129 | + query="some text", | ||
| 130 | + log_language=None, | ||
| 131 | + request_params=None, | ||
| 132 | + index_languages=["zh", "en"], | ||
| 133 | + primary_language="zh", | ||
| 134 | + ) | ||
| 135 | + assert lang == "zh" | ||
| 136 | + assert source == "default" | ||
| 137 | + assert conflict is False | ||
| 138 | + | ||
| 139 | + | ||
| 140 | +@pytest.mark.unit | ||
| 141 | +def test_suggestion_service_basic_flow(monkeypatch): | ||
| 142 | + """ | ||
| 143 | + SuggestionService.search 应正确调用 ES 并返回 suggestion + products。 | ||
| 144 | + 使用 FakeESClient 避免真实 ES 依赖。 | ||
| 145 | + """ | ||
| 146 | + # 覆盖 tenant_config_loader 以避免依赖外部 config.yaml 改动 | ||
| 147 | + from config import tenant_config_loader as tcl | ||
| 148 | + | ||
| 149 | + loader = tcl.get_tenant_config_loader() | ||
| 150 | + # 强制覆盖内部缓存配置 | ||
| 151 | + loader._config = { | ||
| 152 | + "default": {"primary_language": "en", "index_languages": ["en", "zh"]}, | ||
| 153 | + "tenants": { | ||
| 154 | + "1": {"primary_language": "en", "index_languages": ["en", "zh"]}, | ||
| 155 | + }, | ||
| 156 | + } | ||
| 157 | + | ||
| 158 | + fake_es = FakeESClient() | ||
| 159 | + service = SuggestionService(es_client=fake_es) | ||
| 160 | + result = service.search( | ||
| 161 | + tenant_id="1", | ||
| 162 | + query="iph", | ||
| 163 | + language="en", | ||
| 164 | + size=5, | ||
| 165 | + with_results=True, | ||
| 166 | + result_size=2, | ||
| 167 | + ) | ||
| 168 | + | ||
| 169 | + assert result["resolved_language"] == "en" | ||
| 170 | + assert result["query"] == "iph" | ||
| 171 | + assert result["took_ms"] >= 0 | ||
| 172 | + suggestions = result["suggestions"] | ||
| 173 | + assert len(suggestions) == 1 | ||
| 174 | + s0 = suggestions[0] | ||
| 175 | + assert s0["text"] == "iphone 15" | ||
| 176 | + assert s0["lang"] == "en" | ||
| 177 | + assert isinstance(s0.get("products"), list) | ||
| 178 | + assert len(s0["products"]) >= 1 | ||
| 179 | + p0 = s0["products"][0] | ||
| 180 | + assert p0["spu_id"] == "12345" | ||
| 181 | + assert "title" in p0 | ||
| 182 | + assert "price" in p0 | ||
| 183 | + |